Merge pull request 'bring over latest vorebot tweaks' (#4) from feature/nikocs/vorebot-tweaks into develop
All checks were successful
/ build-all-services (push) Successful in 9m13s

Reviewed-on: #4
This commit is contained in:
husky 2025-03-14 18:03:28 -07:00
commit 9c159c7170

View file

@@ -67,7 +67,7 @@ pub async fn web_parse(
.create_key_value(kv::Config { .create_key_value(kv::Config {
bucket: "hosts".to_string(), bucket: "hosts".to_string(),
description: "prevent the same host from being scraped too quickly".to_string(), description: "prevent the same host from being scraped too quickly".to_string(),
max_age: Duration::from_secs(60 * 5), max_age: Duration::from_secs(60 * 10),
..Default::default() ..Default::default()
}) })
.await; .await;
@@ -93,7 +93,7 @@ pub async fn web_parse(
if let Ok(Some(host)) = hosts_bucket.get(hash.to_string()).await { if let Ok(Some(host)) = hosts_bucket.get(hash.to_string()).await {
let count = *host.first().unwrap_or(&0); let count = *host.first().unwrap_or(&0);
if count > 100 { if count > 10 {
warn!("scraping {} too quickly, avoiding for one minute", robots_url); warn!("scraping {} too quickly, avoiding for one minute", robots_url);
return Err(()); return Err(());
} }
@@ -220,6 +220,17 @@ pub async fn web_parse(
// i guess we're good // i guess we're good
driver.goto(url).await.map_err(|_| ())?; driver.goto(url).await.map_err(|_| ())?;
let html_element = driver.find(By::Tag("html")).await.map_err(|_| ())?;
if let Some(lang) = html_element.attr("lang").await.ok().flatten() {
if !lang.starts_with("en") && !lang.starts_with("unknown") {
// i.e. non-english language
// fixme: remove this once we start expanding to non-english-speaking markets?
warn!("skipping {} due to {} language (currently prioritizing english", url, lang);
return Err(());
}
}
let meta_elements = driver.find_all(By::Tag("meta")).await.map_err(|_| ())?; let meta_elements = driver.find_all(By::Tag("meta")).await.map_err(|_| ())?;
let title = driver.title().await.map_err(|_| ())?; let title = driver.title().await.map_err(|_| ())?;