diff --git a/vorebot/src/webparse/mod.rs b/vorebot/src/webparse/mod.rs index d6d6580..0d34b0c 100644 --- a/vorebot/src/webparse/mod.rs +++ b/vorebot/src/webparse/mod.rs @@ -67,7 +67,7 @@ pub async fn web_parse( .create_key_value(kv::Config { bucket: "hosts".to_string(), description: "prevent the same host from being scraped too quickly".to_string(), - max_age: Duration::from_secs(60 * 5), + max_age: Duration::from_secs(60 * 10), ..Default::default() }) .await; @@ -93,7 +93,7 @@ pub async fn web_parse( if let Ok(Some(host)) = hosts_bucket.get(hash.to_string()).await { let count = *host.first().unwrap_or(&0); - if count > 100 { + if count > 10 { warn!("scraping {} too quickly, avoiding for one minute", robots_url); return Err(()); } @@ -220,6 +220,17 @@ pub async fn web_parse( // i guess we're good driver.goto(url).await.map_err(|_| ())?; + let html_element = driver.find(By::Tag("html")).await.map_err(|_| ())?; + + if let Some(lang) = html_element.attr("lang").await.ok().flatten() { + if !lang.starts_with("en") && !lang.starts_with("unknown") { + // i.e. non-english language + // fixme: remove this once we start expanding to non-english-speaking markets? + warn!("skipping {} due to {} language (currently prioritizing english", url, lang); + return Err(()); + } + } + let meta_elements = driver.find_all(By::Tag("meta")).await.map_err(|_| ())?; let title = driver.title().await.map_err(|_| ())?;