Merge pull request 'bring over latest vorebot tweaks' (#4) from feature/nikocs/vorebot-tweaks into develop
	
		
			
	
		
	
	
		
	
		
			All checks were successful
		
		
	
	
		
			
				
	
				/ build-all-services (push) Successful in 9m13s
				
			
		
		
	
	
				
					
				
			
		
			All checks were successful
		
		
	
	/ build-all-services (push) Successful in 9m13s
				
			Reviewed-on: #4
This commit is contained in:
		
						commit
						9c159c7170
					
				
					 1 changed file with 13 additions and 2 deletions
				
			
		|  | @ -67,7 +67,7 @@ pub async fn web_parse( | |||
|             .create_key_value(kv::Config { | ||||
|                 bucket: "hosts".to_string(), | ||||
|                 description: "prevent the same host from being scraped too quickly".to_string(), | ||||
|                 max_age: Duration::from_secs(60 * 5), | ||||
|                 max_age: Duration::from_secs(60 * 10), | ||||
|                 ..Default::default() | ||||
|             }) | ||||
|             .await; | ||||
|  | @ -93,7 +93,7 @@ pub async fn web_parse( | |||
| 
 | ||||
|     if let Ok(Some(host)) = hosts_bucket.get(hash.to_string()).await { | ||||
|         let count = *host.first().unwrap_or(&0); | ||||
|         if count > 100 { | ||||
|         if count > 10 { | ||||
|             warn!("scraping {} too quickly, avoiding for one minute", robots_url); | ||||
|             return Err(()); | ||||
|         } | ||||
|  | @ -220,6 +220,17 @@ pub async fn web_parse( | |||
|     // i guess we're good
 | ||||
|     driver.goto(url).await.map_err(|_| ())?; | ||||
| 
 | ||||
|     let html_element = driver.find(By::Tag("html")).await.map_err(|_| ())?; | ||||
| 
 | ||||
|     if let Some(lang) = html_element.attr("lang").await.ok().flatten() { | ||||
|         if !lang.starts_with("en") && !lang.starts_with("unknown") { | ||||
|             // i.e. non-english language
 | ||||
|             // fixme: remove this once we start expanding to non-english-speaking markets?
 | ||||
|             warn!("skipping {} due to {} language (currently prioritizing english", url, lang); | ||||
|             return Err(()); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     let meta_elements = driver.find_all(By::Tag("meta")).await.map_err(|_| ())?; | ||||
| 
 | ||||
|     let title = driver.title().await.map_err(|_| ())?; | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue