bring over latest vorebot tweaks #4
1 changed files with 13 additions and 2 deletions
|
@ -67,7 +67,7 @@ pub async fn web_parse(
|
|||
.create_key_value(kv::Config {
|
||||
bucket: "hosts".to_string(),
|
||||
description: "prevent the same host from being scraped too quickly".to_string(),
|
||||
max_age: Duration::from_secs(60 * 5),
|
||||
max_age: Duration::from_secs(60 * 10),
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
|
@ -93,7 +93,7 @@ pub async fn web_parse(
|
|||
|
||||
if let Ok(Some(host)) = hosts_bucket.get(hash.to_string()).await {
|
||||
let count = *host.first().unwrap_or(&0);
|
||||
if count > 100 {
|
||||
if count > 10 {
|
||||
warn!("scraping {} too quickly, avoiding for one minute", robots_url);
|
||||
return Err(());
|
||||
}
|
||||
|
@ -220,6 +220,17 @@ pub async fn web_parse(
|
|||
// i guess we're good
|
||||
driver.goto(url).await.map_err(|_| ())?;
|
||||
|
||||
let html_element = driver.find(By::Tag("html")).await.map_err(|_| ())?;
|
||||
|
||||
if let Some(lang) = html_element.attr("lang").await.ok().flatten() {
|
||||
if !lang.starts_with("en") && !lang.starts_with("unknown") {
|
||||
// i.e. non-english language
|
||||
// fixme: remove this once we start expanding to non-english-speaking markets?
|
||||
warn!("skipping {} due to {} language (currently prioritizing english)", url, lang);
|
||||
return Err(());
|
||||
}
|
||||
}
|
||||
|
||||
let meta_elements = driver.find_all(By::Tag("meta")).await.map_err(|_| ())?;
|
||||
|
||||
let title = driver.title().await.map_err(|_| ())?;
|
||||
|
|
Loading…
Add table
Reference in a new issue