/* * googleservice google.rs * - google parsing * * Copyright (C) 2025 Real Microsoft, LLC * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ use std::sync::atomic::Ordering; use std::time::Duration; use asklyphe_common::nats::bingservice::{BingImageResult, BingSearchResult}; use isahc::{AsyncReadResponseExt, HttpClient, Request, RequestExt}; use isahc::auth::Authentication; use isahc::config::RedirectPolicy; use isahc::prelude::Configurable; use log::{debug, error, warn}; use scraper::{CaseSensitivity, Element, Html, Selector}; use scraper::selector::CssLocalName; use serde::Deserialize; use crate::{NEXT_PROXY, PROXIES}; use crate::proxy::Proxy; #[derive(Debug)] pub enum GoogleQueryError { CouldNotSendRequest, CouldNotDeserialize, } pub async fn proxy() -> Option { if !PROXIES.is_empty() { let mut next_proxy = NEXT_PROXY.fetch_add(1, Ordering::Relaxed); if next_proxy >= PROXIES.len() { next_proxy = 0; NEXT_PROXY.store(1, Ordering::Relaxed); } //debug!("proxy {}", next_proxy); let p = PROXIES[next_proxy].clone(); return Some(p); } None } pub async fn query_google(query: String) -> Result, GoogleQueryError> { let query = url_encoded_data::stringify(&[("q", query.as_str())]); let proxy = proxy().await; let mut client = HttpClient::builder() .redirect_policy(RedirectPolicy::Limit(10)); let mut proxy_name = "none".to_string(); if let Some(proxy) = proxy { client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR"))); proxy_name = proxy.address.clone(); if let Some(proxy_auth) = proxy.credentials { client = client.proxy_authentication(Authentication::basic()); client = client.proxy_credentials(proxy_auth); } } let client = client .build().expect("FAILED TO CONSTRUCT HTTP CLIENT"); let url = format!("https://www.google.com/search?{query}&lr=lang_en"); let res = Request::get(&url) .header("user-agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 920)-619") .header("accept-language", "en-US,en;q=0.5") .body(()).unwrap(); let mut res = client.send_async(res).await.map_err(|e| { error!("FAILED TO SEND REQUEST TO GOOGLE: {e} ({query}) (proxy: {proxy_name})"); GoogleQueryError::CouldNotSendRequest })?; let body = res.text().await.map_err(|e| { error!("COULD NOT READ GOOGLE RESPONSE TEXT: {e}"); GoogleQueryError::CouldNotDeserialize })?; let document = Html::parse_document(body.as_str()); // windows phone google layout // div 0 - search bar // div 1 - results + (potential) info panel let res_sel = Selector::parse("body>div>div>div>div>div").unwrap(); let link_sel = Selector::parse("a").unwrap(); let title_sel = Selector::parse("a>span").unwrap(); let desc_sel = Selector::parse("td>div>div>span").unwrap(); let span_sel = Selector::parse("span").unwrap(); let link_results = document.select(&res_sel).collect::>(); let mut final_results = vec![]; for element in link_results { let link = element.select(&link_sel).next(); if link.is_none() { warn!("link is none"); continue; } let link = link.unwrap(); let link = link.value().attr("href").unwrap(); let link = link.trim_start_matches("/url"); let link = url_encoded_data::from(link); let link = link.get("q"); if link.is_none() { warn!("link is none"); continue; } let link = link.unwrap(); let url = link[0].to_string(); let title = element.select(&title_sel).next(); if title.is_none() { warn!("title is none"); continue; } let title = title.unwrap(); let title = title.text().collect::(); let desc = element.select(&desc_sel).next(); if desc.is_none() { warn!("desc is none"); continue; } let desc = desc.unwrap(); let desc = desc.select(&span_sel).last(); if desc.is_none() { warn!("desc is none"); continue; } let desc = desc.unwrap().text().collect::(); if title.is_empty() || desc.is_empty() { warn!("title or desc is empty"); continue; } let title = title.trim(); let desc = desc.trim(); final_results.push(BingSearchResult { url, title: Some(title.to_string()), description: Some(desc.to_string()), }) } if final_results.is_empty() { debug!("no results, body: {body}"); } Ok(final_results) } // first starts at 0 and goes up by multiples of 20 pub async fn query_google_images(query: String, first: usize) -> Result, GoogleQueryError> { let query = url_encoded_data::stringify(&[("q", query.as_str())]); let proxy = proxy().await; let mut client = HttpClient::builder() .redirect_policy(RedirectPolicy::Limit(10)); let mut proxy_name = "none".to_string(); if let Some(proxy) = proxy { client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR"))); proxy_name = proxy.address.clone(); if let Some(proxy_auth) = proxy.credentials { client = client.proxy_authentication(Authentication::basic()); client = client.proxy_credentials(proxy_auth); } } let client = client .build().expect("FAILED TO CONSTRUCT HTTP CLIENT"); let url = format!("https://www.google.com/search?{query}&lr=lang_en&tbm=isch&start={first}"); let res = Request::get(&url) .header("user-agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 920)-619") .header("accept-language", "en-US,en;q=0.5") .body(()).unwrap(); let mut res = client.send_async(res).await.map_err(|e| { error!("FAILED TO SEND REQUEST TO GOOGLE: {e} ({query}) (proxy: {proxy_name})"); GoogleQueryError::CouldNotSendRequest })?; let body = res.text().await.map_err(|e| { error!("COULD NOT READ GOOGLE RESPONSE TEXT: {e}"); GoogleQueryError::CouldNotDeserialize })?; let document = Html::parse_document(body.as_str()); // windows phone google layout // div 0 - search bar // div 1 - results + (potential) info panel let res_sel = Selector::parse("div>div>a").unwrap(); let img_sel = Selector::parse("div>img").unwrap(); let link_results = document.select(&res_sel).collect::>(); let mut final_results = vec![]; for element in link_results { let img = element.select(&img_sel).next(); if img.is_none() { continue; } let url = element.attr("href"); if url.is_none() { warn!("no href in image result"); continue; } let url = url.unwrap(); let link = url.trim_start_matches("/imgres"); let link = url_encoded_data::from(link); let link = link.get("imgurl"); if link.is_none() { warn!("link is none"); continue; } let url = link.unwrap()[0].to_string(); // fixme: this is a bad hack for removing facebook urls since they don't play nicely with non-google clients if url.contains("lookaside.fbsbx.com") { continue; } final_results.push(BingImageResult { url, }) } if final_results.is_empty() { debug!("no results, body: {body}"); } Ok(final_results) }