forked from asklyphe-public/asklyphe
240 lines
8.2 KiB
Rust
240 lines
8.2 KiB
Rust
|
/*
 * googleservice google.rs
 * - google parsing
 *
 * Copyright (C) 2025 Real Microsoft, LLC
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
|
||
|
|
||
|
use std::sync::atomic::Ordering;
|
||
|
use std::time::Duration;
|
||
|
use asklyphe_common::nats::bingservice::{BingImageResult, BingSearchResult};
|
||
|
use isahc::{AsyncReadResponseExt, HttpClient, Request, RequestExt};
|
||
|
use isahc::auth::Authentication;
|
||
|
use isahc::config::RedirectPolicy;
|
||
|
use isahc::prelude::Configurable;
|
||
|
use log::{debug, error, warn};
|
||
|
use scraper::{CaseSensitivity, Element, Html, Selector};
|
||
|
use scraper::selector::CssLocalName;
|
||
|
use serde::Deserialize;
|
||
|
use crate::{NEXT_PROXY, PROXIES};
|
||
|
use crate::proxy::Proxy;
|
||
|
|
||
|
/// Errors returned by the Google query functions in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GoogleQueryError {
    /// The HTTP request to Google could not be sent.
    CouldNotSendRequest,
    /// The response body could not be read as text.
    CouldNotDeserialize,
}

impl std::fmt::Display for GoogleQueryError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            GoogleQueryError::CouldNotSendRequest => "could not send request to google",
            GoogleQueryError::CouldNotDeserialize => "could not read google response",
        };
        f.write_str(message)
    }
}

// Implementing `Error` lets callers propagate this type with `?` into
// `Box<dyn Error>` and similar error wrappers.
impl std::error::Error for GoogleQueryError {}
|
||
|
|
||
|
pub async fn proxy() -> Option<Proxy> {
|
||
|
if !PROXIES.is_empty() {
|
||
|
let mut next_proxy = NEXT_PROXY.fetch_add(1, Ordering::Relaxed);
|
||
|
if next_proxy >= PROXIES.len() {
|
||
|
next_proxy = 0;
|
||
|
NEXT_PROXY.store(1, Ordering::Relaxed);
|
||
|
}
|
||
|
//debug!("proxy {}", next_proxy);
|
||
|
let p = PROXIES[next_proxy].clone();
|
||
|
return Some(p);
|
||
|
}
|
||
|
None
|
||
|
}
|
||
|
|
||
|
pub async fn query_google(query: String) -> Result<Vec<BingSearchResult>, GoogleQueryError> {
|
||
|
let query = url_encoded_data::stringify(&[("q", query.as_str())]);
|
||
|
|
||
|
let proxy = proxy().await;
|
||
|
let mut client = HttpClient::builder()
|
||
|
.redirect_policy(RedirectPolicy::Limit(10));
|
||
|
let mut proxy_name = "none".to_string();
|
||
|
if let Some(proxy) = proxy {
|
||
|
client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR")));
|
||
|
proxy_name = proxy.address.clone();
|
||
|
if let Some(proxy_auth) = proxy.credentials {
|
||
|
client = client.proxy_authentication(Authentication::basic());
|
||
|
client = client.proxy_credentials(proxy_auth);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
let client = client
|
||
|
.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");
|
||
|
|
||
|
let url = format!("https://www.google.com/search?{query}&lr=lang_en");
|
||
|
|
||
|
let res = Request::get(&url)
|
||
|
.header("user-agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 920)-619")
|
||
|
.header("accept-language", "en-US,en;q=0.5")
|
||
|
.body(()).unwrap();
|
||
|
let mut res = client.send_async(res).await.map_err(|e| {
|
||
|
error!("FAILED TO SEND REQUEST TO GOOGLE: {e} ({query}) (proxy: {proxy_name})");
|
||
|
GoogleQueryError::CouldNotSendRequest
|
||
|
})?;
|
||
|
let body = res.text().await.map_err(|e| {
|
||
|
error!("COULD NOT READ GOOGLE RESPONSE TEXT: {e}");
|
||
|
GoogleQueryError::CouldNotDeserialize
|
||
|
})?;
|
||
|
|
||
|
let document = Html::parse_document(body.as_str());
|
||
|
|
||
|
// windows phone google layout
|
||
|
// div 0 - search bar
|
||
|
// div 1 - results + (potential) info panel
|
||
|
|
||
|
let res_sel = Selector::parse("body>div>div>div>div>div").unwrap();
|
||
|
|
||
|
let link_sel = Selector::parse("a").unwrap();
|
||
|
let title_sel = Selector::parse("a>span").unwrap();
|
||
|
let desc_sel = Selector::parse("td>div>div>span").unwrap();
|
||
|
let span_sel = Selector::parse("span").unwrap();
|
||
|
|
||
|
let link_results = document.select(&res_sel).collect::<Vec<_>>();
|
||
|
|
||
|
let mut final_results = vec![];
|
||
|
|
||
|
for element in link_results {
|
||
|
let link = element.select(&link_sel).next();
|
||
|
if link.is_none() {
|
||
|
warn!("link is none");
|
||
|
continue;
|
||
|
}
|
||
|
let link = link.unwrap();
|
||
|
let link = link.value().attr("href").unwrap();
|
||
|
let link = link.trim_start_matches("/url");
|
||
|
let link = url_encoded_data::from(link);
|
||
|
let link = link.get("q");
|
||
|
if link.is_none() {
|
||
|
warn!("link is none");
|
||
|
continue;
|
||
|
}
|
||
|
let link = link.unwrap();
|
||
|
let url = link[0].to_string();
|
||
|
let title = element.select(&title_sel).next();
|
||
|
if title.is_none() {
|
||
|
warn!("title is none");
|
||
|
continue;
|
||
|
}
|
||
|
let title = title.unwrap();
|
||
|
let title = title.text().collect::<String>();
|
||
|
|
||
|
let desc = element.select(&desc_sel).next();
|
||
|
if desc.is_none() {
|
||
|
warn!("desc is none");
|
||
|
continue;
|
||
|
}
|
||
|
let desc = desc.unwrap();
|
||
|
let desc = desc.select(&span_sel).last();
|
||
|
if desc.is_none() {
|
||
|
warn!("desc is none");
|
||
|
continue;
|
||
|
}
|
||
|
let desc = desc.unwrap().text().collect::<String>();
|
||
|
|
||
|
if title.is_empty() || desc.is_empty() {
|
||
|
warn!("title or desc is empty");
|
||
|
continue;
|
||
|
}
|
||
|
let title = title.trim();
|
||
|
let desc = desc.trim();
|
||
|
final_results.push(BingSearchResult {
|
||
|
url,
|
||
|
title: Some(title.to_string()),
|
||
|
description: Some(desc.to_string()),
|
||
|
})
|
||
|
}
|
||
|
|
||
|
if final_results.is_empty() {
|
||
|
debug!("no results, body: {body}");
|
||
|
}
|
||
|
|
||
|
Ok(final_results)
|
||
|
}
|
||
|
|
||
|
// first starts at 0 and goes up by multiples of 20
|
||
|
pub async fn query_google_images(query: String, first: usize) -> Result<Vec<BingImageResult>, GoogleQueryError> {
|
||
|
let query = url_encoded_data::stringify(&[("q", query.as_str())]);
|
||
|
|
||
|
let proxy = proxy().await;
|
||
|
let mut client = HttpClient::builder()
|
||
|
.redirect_policy(RedirectPolicy::Limit(10));
|
||
|
let mut proxy_name = "none".to_string();
|
||
|
if let Some(proxy) = proxy {
|
||
|
client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR")));
|
||
|
proxy_name = proxy.address.clone();
|
||
|
if let Some(proxy_auth) = proxy.credentials {
|
||
|
client = client.proxy_authentication(Authentication::basic());
|
||
|
client = client.proxy_credentials(proxy_auth);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
let client = client
|
||
|
.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");
|
||
|
|
||
|
let url = format!("https://www.google.com/search?{query}&lr=lang_en&tbm=isch&start={first}");
|
||
|
|
||
|
let res = Request::get(&url)
|
||
|
.header("user-agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 920)-619")
|
||
|
.header("accept-language", "en-US,en;q=0.5")
|
||
|
.body(()).unwrap();
|
||
|
let mut res = client.send_async(res).await.map_err(|e| {
|
||
|
error!("FAILED TO SEND REQUEST TO GOOGLE: {e} ({query}) (proxy: {proxy_name})");
|
||
|
GoogleQueryError::CouldNotSendRequest
|
||
|
})?;
|
||
|
let body = res.text().await.map_err(|e| {
|
||
|
error!("COULD NOT READ GOOGLE RESPONSE TEXT: {e}");
|
||
|
GoogleQueryError::CouldNotDeserialize
|
||
|
})?;
|
||
|
|
||
|
let document = Html::parse_document(body.as_str());
|
||
|
|
||
|
// windows phone google layout
|
||
|
// div 0 - search bar
|
||
|
// div 1 - results + (potential) info panel
|
||
|
|
||
|
let res_sel = Selector::parse("div>div>a").unwrap();
|
||
|
|
||
|
let img_sel = Selector::parse("div>img").unwrap();
|
||
|
|
||
|
let link_results = document.select(&res_sel).collect::<Vec<_>>();
|
||
|
|
||
|
let mut final_results = vec![];
|
||
|
|
||
|
for element in link_results {
|
||
|
let img = element.select(&img_sel).next();
|
||
|
if img.is_none() {
|
||
|
continue;
|
||
|
}
|
||
|
let url = element.attr("href");
|
||
|
if url.is_none() {
|
||
|
warn!("no href in image result");
|
||
|
continue;
|
||
|
}
|
||
|
let url = url.unwrap();
|
||
|
let link = url.trim_start_matches("/imgres");
|
||
|
let link = url_encoded_data::from(link);
|
||
|
let link = link.get("imgurl");
|
||
|
if link.is_none() {
|
||
|
warn!("link is none");
|
||
|
continue;
|
||
|
}
|
||
|
let url = link.unwrap()[0].to_string();
|
||
|
// fixme: this is a bad hack for removing facebook urls since they don't play nicely with non-google clients
|
||
|
if url.contains("lookaside.fbsbx.com") {
|
||
|
continue;
|
||
|
}
|
||
|
final_results.push(BingImageResult {
|
||
|
url,
|
||
|
})
|
||
|
}
|
||
|
|
||
|
if final_results.is_empty() {
|
||
|
debug!("no results, body: {body}");
|
||
|
}
|
||
|
|
||
|
Ok(final_results)
|
||
|
}
|