/*
* googleservice google.rs
* - google parsing
*
* Copyright (C) 2025 Real Microsoft, LLC
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::sync::atomic::Ordering;
use std::time::Duration;
use asklyphe_common::nats::bingservice::{BingImageResult, BingSearchResult};
use isahc::{AsyncReadResponseExt, HttpClient, Request, RequestExt};
use isahc::auth::Authentication;
use isahc::config::RedirectPolicy;
use isahc::prelude::Configurable;
use log::{debug, error, warn};
use scraper::{CaseSensitivity, Element, Html, Selector};
use scraper::selector::CssLocalName;
use serde::Deserialize;
use crate::{NEXT_PROXY, PROXIES};
use crate::proxy::Proxy;
/// Errors returned by the Google query functions in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GoogleQueryError {
    /// The HTTP request to Google failed before a response was received.
    CouldNotSendRequest,
    /// The response body could not be read as text.
    CouldNotDeserialize,
}

impl std::fmt::Display for GoogleQueryError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            GoogleQueryError::CouldNotSendRequest => "could not send request to Google",
            GoogleQueryError::CouldNotDeserialize => "could not read Google response",
        };
        f.write_str(message)
    }
}

// Lets callers propagate this error through `Box<dyn Error>` and similar wrappers.
impl std::error::Error for GoogleQueryError {}
pub async fn proxy() -> Option {
if !PROXIES.is_empty() {
let mut next_proxy = NEXT_PROXY.fetch_add(1, Ordering::Relaxed);
if next_proxy >= PROXIES.len() {
next_proxy = 0;
NEXT_PROXY.store(1, Ordering::Relaxed);
}
//debug!("proxy {}", next_proxy);
let p = PROXIES[next_proxy].clone();
return Some(p);
}
None
}
pub async fn query_google(query: String) -> Result, GoogleQueryError> {
let query = url_encoded_data::stringify(&[("q", query.as_str())]);
let proxy = proxy().await;
let mut client = HttpClient::builder()
.redirect_policy(RedirectPolicy::Limit(10));
let mut proxy_name = "none".to_string();
if let Some(proxy) = proxy {
client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR")));
proxy_name = proxy.address.clone();
if let Some(proxy_auth) = proxy.credentials {
client = client.proxy_authentication(Authentication::basic());
client = client.proxy_credentials(proxy_auth);
}
}
let client = client
.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");
let url = format!("https://www.google.com/search?{query}&lr=lang_en");
let res = Request::get(&url)
.header("user-agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 920)-619")
.header("accept-language", "en-US,en;q=0.5")
.body(()).unwrap();
let mut res = client.send_async(res).await.map_err(|e| {
error!("FAILED TO SEND REQUEST TO GOOGLE: {e} ({query}) (proxy: {proxy_name})");
GoogleQueryError::CouldNotSendRequest
})?;
let body = res.text().await.map_err(|e| {
error!("COULD NOT READ GOOGLE RESPONSE TEXT: {e}");
GoogleQueryError::CouldNotDeserialize
})?;
let document = Html::parse_document(body.as_str());
// windows phone google layout
// div 0 - search bar
// div 1 - results + (potential) info panel
let res_sel = Selector::parse("body>div>div>div>div>div").unwrap();
let link_sel = Selector::parse("a").unwrap();
let title_sel = Selector::parse("a>span").unwrap();
let desc_sel = Selector::parse("td>div>div>span").unwrap();
let span_sel = Selector::parse("span").unwrap();
let link_results = document.select(&res_sel).collect::>();
let mut final_results = vec![];
for element in link_results {
let link = element.select(&link_sel).next();
if link.is_none() {
warn!("link is none");
continue;
}
let link = link.unwrap();
let link = link.value().attr("href").unwrap();
let link = link.trim_start_matches("/url");
let link = url_encoded_data::from(link);
let link = link.get("q");
if link.is_none() {
warn!("link is none");
continue;
}
let link = link.unwrap();
let url = link[0].to_string();
let title = element.select(&title_sel).next();
if title.is_none() {
warn!("title is none");
continue;
}
let title = title.unwrap();
let title = title.text().collect::();
let desc = element.select(&desc_sel).next();
if desc.is_none() {
warn!("desc is none");
continue;
}
let desc = desc.unwrap();
let desc = desc.select(&span_sel).last();
if desc.is_none() {
warn!("desc is none");
continue;
}
let desc = desc.unwrap().text().collect::();
if title.is_empty() || desc.is_empty() {
warn!("title or desc is empty");
continue;
}
let title = title.trim();
let desc = desc.trim();
final_results.push(BingSearchResult {
url,
title: Some(title.to_string()),
description: Some(desc.to_string()),
})
}
if final_results.is_empty() {
debug!("no results, body: {body}");
}
Ok(final_results)
}
// first starts at 0 and goes up by multiples of 20
pub async fn query_google_images(query: String, first: usize) -> Result, GoogleQueryError> {
let query = url_encoded_data::stringify(&[("q", query.as_str())]);
let proxy = proxy().await;
let mut client = HttpClient::builder()
.redirect_policy(RedirectPolicy::Limit(10));
let mut proxy_name = "none".to_string();
if let Some(proxy) = proxy {
client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR")));
proxy_name = proxy.address.clone();
if let Some(proxy_auth) = proxy.credentials {
client = client.proxy_authentication(Authentication::basic());
client = client.proxy_credentials(proxy_auth);
}
}
let client = client
.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");
let url = format!("https://www.google.com/search?{query}&lr=lang_en&tbm=isch&start={first}");
let res = Request::get(&url)
.header("user-agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 920)-619")
.header("accept-language", "en-US,en;q=0.5")
.body(()).unwrap();
let mut res = client.send_async(res).await.map_err(|e| {
error!("FAILED TO SEND REQUEST TO GOOGLE: {e} ({query}) (proxy: {proxy_name})");
GoogleQueryError::CouldNotSendRequest
})?;
let body = res.text().await.map_err(|e| {
error!("COULD NOT READ GOOGLE RESPONSE TEXT: {e}");
GoogleQueryError::CouldNotDeserialize
})?;
let document = Html::parse_document(body.as_str());
// windows phone google layout
// div 0 - search bar
// div 1 - results + (potential) info panel
let res_sel = Selector::parse("div>div>a").unwrap();
let img_sel = Selector::parse("div>img").unwrap();
let link_results = document.select(&res_sel).collect::>();
let mut final_results = vec![];
for element in link_results {
let img = element.select(&img_sel).next();
if img.is_none() {
continue;
}
let url = element.attr("href");
if url.is_none() {
warn!("no href in image result");
continue;
}
let url = url.unwrap();
let link = url.trim_start_matches("/imgres");
let link = url_encoded_data::from(link);
let link = link.get("imgurl");
if link.is_none() {
warn!("link is none");
continue;
}
let url = link.unwrap()[0].to_string();
// fixme: this is a bad hack for removing facebook urls since they don't play nicely with non-google clients
if url.contains("lookaside.fbsbx.com") {
continue;
}
final_results.push(BingImageResult {
url,
})
}
if final_results.is_empty() {
debug!("no results, body: {body}");
}
Ok(final_results)
}