From d341c66390e38401a2f17522fc89977e2f72226c Mon Sep 17 00:00:00 2001 From: husky Date: Fri, 14 Mar 2025 14:29:46 -0700 Subject: [PATCH] prioritize english --- vorebot/src/webparse/mod.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vorebot/src/webparse/mod.rs b/vorebot/src/webparse/mod.rs index ba84f74..0d34b0c 100644 --- a/vorebot/src/webparse/mod.rs +++ b/vorebot/src/webparse/mod.rs @@ -220,6 +220,17 @@ pub async fn web_parse( // i guess we're good driver.goto(url).await.map_err(|_| ())?; + let html_element = driver.find(By::Tag("html")).await.map_err(|_| ())?; + + if let Some(lang) = html_element.attr("lang").await.ok().flatten() { + if !lang.starts_with("en") && !lang.starts_with("unknown") { + // i.e. non-english language + // fixme: remove this once we start expanding to non-english-speaking markets? + warn!("skipping {} due to {} language (currently prioritizing english", url, lang); + return Err(()); + } + } + let meta_elements = driver.find_all(By::Tag("meta")).await.map_err(|_| ())?; let title = driver.title().await.map_err(|_| ())?;