From c754338bf4eab7dbc694df67ba4ccad1ef6cd319 Mon Sep 17 00:00:00 2001 From: husky Date: Fri, 14 Mar 2025 12:13:45 -0700 Subject: [PATCH 1/2] favor newer hostnames --- vorebot/src/webparse/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vorebot/src/webparse/mod.rs b/vorebot/src/webparse/mod.rs index d6d6580..ba84f74 100644 --- a/vorebot/src/webparse/mod.rs +++ b/vorebot/src/webparse/mod.rs @@ -67,7 +67,7 @@ pub async fn web_parse( .create_key_value(kv::Config { bucket: "hosts".to_string(), description: "prevent the same host from being scraped too quickly".to_string(), - max_age: Duration::from_secs(60 * 5), + max_age: Duration::from_secs(60 * 10), ..Default::default() }) .await; @@ -93,7 +93,7 @@ pub async fn web_parse( if let Ok(Some(host)) = hosts_bucket.get(hash.to_string()).await { let count = *host.first().unwrap_or(&0); - if count > 100 { + if count > 10 { warn!("scraping {} too quickly, avoiding for one minute", robots_url); return Err(()); } -- 2.39.5 From d341c66390e38401a2f17522fc89977e2f72226c Mon Sep 17 00:00:00 2001 From: husky Date: Fri, 14 Mar 2025 14:29:46 -0700 Subject: [PATCH 2/2] prioritize english --- vorebot/src/webparse/mod.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vorebot/src/webparse/mod.rs b/vorebot/src/webparse/mod.rs index ba84f74..0d34b0c 100644 --- a/vorebot/src/webparse/mod.rs +++ b/vorebot/src/webparse/mod.rs @@ -220,6 +220,17 @@ pub async fn web_parse( // i guess we're good driver.goto(url).await.map_err(|_| ())?; + let html_element = driver.find(By::Tag("html")).await.map_err(|_| ())?; + + if let Some(lang) = html_element.attr("lang").await.ok().flatten() { + if !lang.starts_with("en") && !lang.starts_with("unknown") { + // i.e. non-english language + // fixme: remove this once we start expanding to non-english-speaking markets? 
+ warn!("skipping {} due to {} language (currently prioritizing english)", url, lang); + return Err(()); + } + } + let meta_elements = driver.find_all(By::Tag("meta")).await.map_err(|_| ())?; let title = driver.title().await.map_err(|_| ())?; -- 2.39.5