/*
 * bingservice bing.rs
 * - parsing of bing
 *
 * Copyright (C) 2025 Real Microsoft, LLC
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

use std::fmt;
use std::sync::atomic::Ordering;
use std::time::Duration;

use asklyphe_common::nats::bingservice::{BingImageResult, BingSearchResult};
use isahc::auth::Authentication;
use isahc::config::RedirectPolicy;
use isahc::prelude::Configurable;
use isahc::{AsyncReadResponseExt, HttpClient, Request, RequestExt};
use log::{debug, error, warn};
use scraper::selector::CssLocalName;
use scraper::{CaseSensitivity, Element, Html, Selector};
use serde::Deserialize;

use crate::proxy::Proxy;
use crate::{BING_KEY, NEXT_PROXY, PROXIES};

/// Errors returned by the Bing query functions in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BingQueryError {
    /// The HTTP request to Bing could not be built or sent.
    CouldNotSendRequest,
    /// The response could not be read, or did not contain the expected data
    /// (for example the `IG` cookie or the `b_results` list).
    CouldNotDeserialize,
}

impl fmt::Display for BingQueryError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let msg = match self {
            Self::CouldNotSendRequest => "could not send request to bing",
            Self::CouldNotDeserialize => "could not deserialize bing response",
        };
        f.write_str(msg)
    }
}

impl std::error::Error for BingQueryError {}

pub async fn proxy() -> Option<Proxy> {
|
||
|
|
if !PROXIES.is_empty() {
|
||
|
|
let mut next_proxy = NEXT_PROXY.fetch_add(1, Ordering::Relaxed);
|
||
|
|
if next_proxy >= PROXIES.len() {
|
||
|
|
next_proxy = 0;
|
||
|
|
NEXT_PROXY.store(1, Ordering::Relaxed);
|
||
|
|
}
|
||
|
|
//debug!("proxy {}", next_proxy);
|
||
|
|
let p = PROXIES[next_proxy].clone();
|
||
|
|
return Some(p);
|
||
|
|
}
|
||
|
|
None
|
||
|
|
}
|
||
|
|
|
||
|
|
pub async fn generate_cvid() -> Result<String, BingQueryError> {
|
||
|
|
// query https://www.bing.com
|
||
|
|
// look for cookie header setting the "IG" cookie
|
||
|
|
let proxy = proxy().await;
|
||
|
|
let mut client = HttpClient::builder()
|
||
|
|
.redirect_policy(RedirectPolicy::Limit(10));
|
||
|
|
if let Some(proxy) = proxy {
|
||
|
|
client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR")));
|
||
|
|
if let Some(proxy_auth) = proxy.credentials {
|
||
|
|
client = client.proxy_authentication(Authentication::basic());
|
||
|
|
client = client.proxy_credentials(proxy_auth);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
let client = client
|
||
|
|
.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");
|
||
|
|
|
||
|
|
|
||
|
|
let res = Request::get("https://www.bing.com/")
|
||
|
|
.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8")
|
||
|
|
.header("accept-language", "en-US,en;q=0.5")
|
||
|
|
.redirect_policy(RedirectPolicy::Follow)
|
||
|
|
.body(()).unwrap();
|
||
|
|
let mut res = client.send_async(res).await.map_err(|e| {
|
||
|
|
error!("FAILED TO ATTEMPT CVID GEN, COULDN'T SEND HTTP REQUEST: {e}");
|
||
|
|
BingQueryError::CouldNotSendRequest
|
||
|
|
})?;
|
||
|
|
|
||
|
|
for (name, val) in res.headers().iter() {
|
||
|
|
if name.as_str() == "set-cookie" {
|
||
|
|
let val_str = String::from_utf8_lossy(val.as_bytes());
|
||
|
|
if val_str.contains("IG=") {
|
||
|
|
let start = val_str.find("IG=").unwrap();
|
||
|
|
let start = start + "IG=".len();
|
||
|
|
let end = val_str[start..].find(";").ok_or(BingQueryError::CouldNotDeserialize)?;
|
||
|
|
let cvid = val_str[start..(start + end)].trim();
|
||
|
|
if cvid.len() != 32 {
|
||
|
|
warn!("cvid len was not 32 (actually \"{}\") while retrieving cvid, continuing but this may cause issues", cvid);
|
||
|
|
}
|
||
|
|
return Ok(cvid.to_string());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
error!("FAILED CVID GEN! NO CVID FOUND!");
|
||
|
|
Err(BingQueryError::CouldNotDeserialize)
|
||
|
|
}
|
||
|
|
|
||
|
|
pub async fn query_bing(query: String, cvid: &str, first: usize) -> Result<Vec<BingSearchResult>, BingQueryError> {
|
||
|
|
let query = url_encoded_data::stringify(&[("q", query.as_str())]);
|
||
|
|
|
||
|
|
let proxy = proxy().await;
|
||
|
|
let mut client = HttpClient::builder()
|
||
|
|
.redirect_policy(RedirectPolicy::Limit(10));
|
||
|
|
let mut proxy_name = "none".to_string();
|
||
|
|
if let Some(proxy) = proxy {
|
||
|
|
client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR")));
|
||
|
|
proxy_name = proxy.address.clone();
|
||
|
|
if let Some(proxy_auth) = proxy.credentials {
|
||
|
|
client = client.proxy_authentication(Authentication::basic());
|
||
|
|
client = client.proxy_credentials(proxy_auth);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
let client = client
|
||
|
|
.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");
|
||
|
|
|
||
|
|
let url = format!("https://www.bing.com/search?{query}&form=QBLH&sp=-1&ghc=1&lq=0&p{query}&sc=11-9&qs=n&sk=&cvid={cvid}&ghsh=0&ghacc=0&ghpl=");
|
||
|
|
|
||
|
|
let res = Request::get(&url)
|
||
|
|
.header("user-agent", "Mozilla/5.0 (Linux; Android 9; KFTRWI) AppleWebKit/537.36 (KHTML, like Gecko) Silk/126.6.2 like Chrome/126.0.6478.238 Safari/537.36")
|
||
|
|
.header("accept-language", "en-US,en;q=0.5")
|
||
|
|
.body(()).unwrap();
|
||
|
|
let mut res = client.send_async(res).await.map_err(|e| {
|
||
|
|
error!("FAILED TO SEND REQUEST TO BING: {e} ({query}) (proxy: {proxy_name})");
|
||
|
|
BingQueryError::CouldNotSendRequest
|
||
|
|
})?;
|
||
|
|
let body = res.text().await.map_err(|e| {
|
||
|
|
error!("COULD NOT READ BING RESPONSE TEXT: {e}");
|
||
|
|
BingQueryError::CouldNotDeserialize
|
||
|
|
})?;
|
||
|
|
|
||
|
|
let document = Html::parse_document(body.as_str());
|
||
|
|
let ol_sel = Selector::parse("ol").unwrap();
|
||
|
|
|
||
|
|
let results = document.select(&ol_sel).find(|v| {v.value().id().unwrap_or_default() == "b_results"}).ok_or_else(|| {
|
||
|
|
error!("COULD NOT FIND <ol> IN BING RESPONSE");
|
||
|
|
debug!("body: {body}");
|
||
|
|
BingQueryError::CouldNotDeserialize
|
||
|
|
})?;
|
||
|
|
|
||
|
|
let li_sel = Selector::parse("li").unwrap();
|
||
|
|
let h2_sel = Selector::parse("h2").unwrap();
|
||
|
|
let cite_sel = Selector::parse("a.tilk").unwrap();
|
||
|
|
let p_sel = Selector::parse("p").unwrap();
|
||
|
|
|
||
|
|
let mut final_results = vec![];
|
||
|
|
|
||
|
|
for element in results.select(&li_sel) {
|
||
|
|
if !element.value().has_class("b_algo", CaseSensitivity::AsciiCaseInsensitive) {
|
||
|
|
// non-text search result
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
let title = element.select(&h2_sel).next();
|
||
|
|
if title.is_none() {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
let url = element.select(&cite_sel).next().map(|v| v.attr("href").map(|v| v.to_string())).flatten();
|
||
|
|
if url.is_none() {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
let desc = element.select(&p_sel).next();
|
||
|
|
|
||
|
|
let title = title.unwrap().text().collect::<Vec<_>>();
|
||
|
|
let title = title.join(" ");
|
||
|
|
|
||
|
|
let url = url.unwrap();
|
||
|
|
|
||
|
|
if url.contains("javascript") {
|
||
|
|
warn!("javascript url encountered, full cite: {:?}", element.select(&cite_sel).collect::<Vec<_>>());
|
||
|
|
}
|
||
|
|
|
||
|
|
let desc = if let Some(desc) = desc {
|
||
|
|
let desc = desc.text().into_iter().collect::<Vec<_>>();
|
||
|
|
let desc = desc.join(" ");
|
||
|
|
Some(desc)
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
|
||
|
|
final_results.push(BingSearchResult {
|
||
|
|
url,
|
||
|
|
title: Some(title),
|
||
|
|
description: desc,
|
||
|
|
});
|
||
|
|
}
|
||
|
|
|
||
|
|
if final_results.is_empty() {
|
||
|
|
debug!("no results, body: {body}");
|
||
|
|
}
|
||
|
|
|
||
|
|
Ok(final_results)
|
||
|
|
}
|
||
|
|
|
||
|
|
// first starts at 1 and goes up by multiples of 12, first = (page * 12) + 1
|
||
|
|
pub async fn query_bing_images(query: String, first: usize) -> Result<Vec<BingImageResult>, BingQueryError> {
|
||
|
|
let query = url_encoded_data::stringify(&[("q", query.as_str())]);
|
||
|
|
|
||
|
|
let proxy = proxy().await;
|
||
|
|
let mut client = HttpClient::builder()
|
||
|
|
.redirect_policy(RedirectPolicy::Limit(10));
|
||
|
|
let mut proxy_name = "none".to_string();
|
||
|
|
if let Some(proxy) = proxy {
|
||
|
|
client = client.proxy(Some(proxy.address.parse().expect("BAD PROXY ADDR")));
|
||
|
|
proxy_name = proxy.address.clone();
|
||
|
|
if let Some(proxy_auth) = proxy.credentials {
|
||
|
|
client = client.proxy_authentication(Authentication::basic());
|
||
|
|
client = client.proxy_credentials(proxy_auth);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
let client = client
|
||
|
|
.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");
|
||
|
|
|
||
|
|
let url = format!("https://www.bing.com/images/search?{query}&first={first}");
|
||
|
|
|
||
|
|
let res = Request::get(&url)
|
||
|
|
.header("user-agent", "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaX7-00/021.004; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/533.4 (KHTML, like Gecko) NokiaBrowser/7.3.1.21 Mobile Safari/533.4 3gpp-gba")
|
||
|
|
.header("accept-language", "en-US,en;q=0.5")
|
||
|
|
.body(()).unwrap();
|
||
|
|
let mut res = client.send_async(res).await.map_err(|e| {
|
||
|
|
error!("FAILED TO SEND REQUEST TO BING: {e} ({query}) (proxy: {proxy_name})");
|
||
|
|
BingQueryError::CouldNotSendRequest
|
||
|
|
})?;
|
||
|
|
let body = res.text().await.map_err(|e| {
|
||
|
|
error!("COULD NOT READ BING RESPONSE TEXT: {e}");
|
||
|
|
BingQueryError::CouldNotDeserialize
|
||
|
|
})?;
|
||
|
|
|
||
|
|
let document = Html::parse_document(body.as_str());
|
||
|
|
let link_sel = Selector::parse("div>a").unwrap();
|
||
|
|
let img_sel = Selector::parse("div>img").unwrap();
|
||
|
|
|
||
|
|
let results = document.select(&link_sel).collect::<Vec<_>>();
|
||
|
|
|
||
|
|
let mut final_results = vec![];
|
||
|
|
|
||
|
|
for element in results {
|
||
|
|
let img = element.select(&img_sel).next();
|
||
|
|
if img.is_none() {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
let url = element.attr("href");
|
||
|
|
if url.is_none() {
|
||
|
|
warn!("no href in image result");
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
let url = url.unwrap().to_string();
|
||
|
|
|
||
|
|
final_results.push(BingImageResult {
|
||
|
|
url,
|
||
|
|
});
|
||
|
|
}
|
||
|
|
|
||
|
|
if final_results.is_empty() {
|
||
|
|
debug!("no results, body: {body}");
|
||
|
|
}
|
||
|
|
|
||
|
|
Ok(final_results)
|
||
|
|
}
|