asklyphe/bingservice/src/bing.rs

260 lines
9.5 KiB
Rust
Raw Normal View History

2025-03-12 12:32:15 -07:00
/*
* bingservice bing.rs
* - parsing of bing
*
* Copyright (C) 2025 Real Microsoft, LLC
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::sync::atomic::Ordering;
use std::time::Duration;
use asklyphe_common::nats::bingservice::{BingImageResult, BingSearchResult};
use isahc::{AsyncReadResponseExt, HttpClient, Request, RequestExt};
use isahc::auth::Authentication;
use isahc::config::RedirectPolicy;
use isahc::prelude::Configurable;
use log::{debug, error, warn};
use scraper::{CaseSensitivity, Element, Html, Selector};
use scraper::selector::CssLocalName;
use serde::Deserialize;
use crate::{BING_KEY, NEXT_PROXY, PROXIES};
use crate::proxy::Proxy;
/// Errors returned by the Bing query functions in this module.
#[derive(Debug)]
pub enum BingQueryError {
/// The HTTP request could not be sent (returned when `send_async` fails).
CouldNotSendRequest,
/// The response could not be turned into usable data: the body could not be
/// read as text, no cvid cookie was found, or the results list was missing.
CouldNotDeserialize,
}
/// Picks the next proxy from `PROXIES` in round-robin order.
///
/// Returns `None` when `PROXIES` is empty, otherwise a clone of the selected
/// entry.
///
/// The shared `NEXT_PROXY` counter is advanced with a single atomic
/// `fetch_add` and reduced modulo the pool size. The previous implementation
/// reset the counter with a separate `store`, so concurrent callers racing at
/// the wrap-around point could be handed the same proxy repeatedly or skip
/// entries. The counter now only ever grows (wrapping on integer overflow,
/// which `fetch_add` does silently).
pub async fn proxy() -> Option<Proxy> {
    if PROXIES.is_empty() {
        return None;
    }
    // `PROXIES` is non-empty here, so the modulo cannot divide by zero and the
    // resulting index is always in bounds.
    let index = NEXT_PROXY.fetch_add(1, Ordering::Relaxed) % PROXIES.len();
    Some(PROXIES[index].clone())
}
/// Requests `https://www.bing.com/` and extracts a cvid from the response
/// cookies.
///
/// Every `set-cookie` response header is scanned for the text `IG=`; the cvid
/// is whatever follows it up to the next `;` (or up to the end of the header
/// value when no `;` follows), with surrounding whitespace trimmed. A warning
/// is logged when the extracted value is not 32 bytes long, but it is still
/// returned.
///
/// The request goes through the proxy returned by `proxy()` when one is
/// configured.
///
/// # Errors
///
/// * [`BingQueryError::CouldNotSendRequest`] if the HTTP request fails.
/// * [`BingQueryError::CouldNotDeserialize`] if no `set-cookie` header
///   contains `IG=`.
///
/// # Panics
///
/// Panics if the proxy address cannot be parsed or the HTTP client / request
/// cannot be constructed.
pub async fn generate_cvid() -> Result<String, BingQueryError> {
    // Build the HTTP client, routed through the next proxy when there is one.
    let mut builder = HttpClient::builder().redirect_policy(RedirectPolicy::Limit(10));
    if let Some(selected) = proxy().await {
        builder = builder.proxy(Some(selected.address.parse().expect("BAD PROXY ADDR")));
        if let Some(credentials) = selected.credentials {
            builder = builder
                .proxy_authentication(Authentication::basic())
                .proxy_credentials(credentials);
        }
    }
    let client = builder.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");

    let request = Request::get("https://www.bing.com/")
        .header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8")
        .header("accept-language", "en-US,en;q=0.5")
        .redirect_policy(RedirectPolicy::Follow)
        .body(())
        .unwrap();
    let response = client.send_async(request).await.map_err(|e| {
        error!("FAILED TO ATTEMPT CVID GEN, COULDN'T SEND HTTP REQUEST: {e}");
        BingQueryError::CouldNotSendRequest
    })?;

    for (name, value) in response.headers().iter() {
        if name.as_str() != "set-cookie" {
            continue;
        }
        let cookie = String::from_utf8_lossy(value.as_bytes());
        if let Some(position) = cookie.find("IG=") {
            let rest = &cookie[position + "IG=".len()..];
            // A cookie without attributes has no trailing ';'; in that case the
            // value simply runs to the end of the header.
            let end = rest.find(';').unwrap_or(rest.len());
            let cvid = rest[..end].trim();
            if cvid.len() != 32 {
                warn!("cvid len was not 32 (actually \"{}\") while retrieving cvid, continuing but this may cause issues", cvid);
            }
            return Ok(cvid.to_string());
        }
    }

    error!("FAILED CVID GEN! NO CVID FOUND!");
    Err(BingQueryError::CouldNotDeserialize)
}
/// Runs a Bing web search and scrapes the text results out of the HTML.
///
/// # Parameters
///
/// * `query` - the search terms; URL-encoded before being placed in the URL.
/// * `cvid` - value sent as the `cvid` URL parameter (see `generate_cvid`).
/// * `first` - index of the first result to request. Values greater than 1
///   are sent as the `first` URL parameter, mirroring `query_bing_images`;
///   for 0 and 1 the parameter is omitted and the URL is the plain
///   first-page URL. NOTE(review): previously this argument was ignored
///   entirely; the result-index step per page for web search is not
///   established in this file - confirm against callers.
///
/// # Returns
///
/// One [`BingSearchResult`] per `<li class="b_algo">` inside
/// `<ol id="b_results">` that has both an `<h2>` title and an `a.tilk` link
/// with an `href`. The description is the text of the first `<p>`, if any.
///
/// # Errors
///
/// * [`BingQueryError::CouldNotSendRequest`] if the HTTP request fails.
/// * [`BingQueryError::CouldNotDeserialize`] if the body cannot be read as
///   text or the `b_results` list is missing.
///
/// # Panics
///
/// Panics if the proxy address cannot be parsed or the HTTP client / request
/// cannot be constructed.
pub async fn query_bing(query: String, cvid: &str, first: usize) -> Result<Vec<BingSearchResult>, BingQueryError> {
    let query = url_encoded_data::stringify(&[("q", query.as_str())]);

    // Build the HTTP client, routed through the next proxy when there is one.
    let mut builder = HttpClient::builder().redirect_policy(RedirectPolicy::Limit(10));
    let mut proxy_name = "none".to_string();
    if let Some(selected) = proxy().await {
        builder = builder.proxy(Some(selected.address.parse().expect("BAD PROXY ADDR")));
        proxy_name = selected.address.clone();
        if let Some(credentials) = selected.credentials {
            builder = builder
                .proxy_authentication(Authentication::basic())
                .proxy_credentials(credentials);
        }
    }
    let client = builder.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");

    // `{query}` already has the form `q=...`, so `&p{query}` yields `&pq=...`.
    let mut request_url = format!("https://www.bing.com/search?{query}&form=QBLH&sp=-1&ghc=1&lq=0&p{query}&sc=11-9&qs=n&sk=&cvid={cvid}&ghsh=0&ghacc=0&ghpl=");
    if first > 1 {
        // Only add the offset past the first page so first-page requests stay
        // byte-identical to what was sent before `first` was honoured.
        request_url.push_str(&format!("&first={first}"));
    }

    let request = Request::get(&request_url)
        .header("user-agent", "Mozilla/5.0 (Linux; Android 9; KFTRWI) AppleWebKit/537.36 (KHTML, like Gecko) Silk/126.6.2 like Chrome/126.0.6478.238 Safari/537.36")
        .header("accept-language", "en-US,en;q=0.5")
        .body(())
        .unwrap();
    let mut response = client.send_async(request).await.map_err(|e| {
        error!("FAILED TO SEND REQUEST TO BING: {e} ({query}) (proxy: {proxy_name})");
        BingQueryError::CouldNotSendRequest
    })?;
    let body = response.text().await.map_err(|e| {
        error!("COULD NOT READ BING RESPONSE TEXT: {e}");
        BingQueryError::CouldNotDeserialize
    })?;

    // Everything below is synchronous parsing; no `.await` happens while the
    // parsed document is alive.
    let document = Html::parse_document(body.as_str());
    let ol_sel = Selector::parse("ol").unwrap();
    let results = document
        .select(&ol_sel)
        .find(|ol| ol.value().id() == Some("b_results"))
        .ok_or_else(|| {
            error!("COULD NOT FIND <ol> IN BING RESPONSE");
            debug!("body: {body}");
            BingQueryError::CouldNotDeserialize
        })?;

    let li_sel = Selector::parse("li").unwrap();
    let h2_sel = Selector::parse("h2").unwrap();
    let cite_sel = Selector::parse("a.tilk").unwrap();
    let p_sel = Selector::parse("p").unwrap();

    let mut final_results = vec![];
    for element in results.select(&li_sel) {
        if !element.value().has_class("b_algo", CaseSensitivity::AsciiCaseInsensitive) {
            // non-text search result
            continue;
        }
        let title = match element.select(&h2_sel).next() {
            Some(title) => title,
            None => continue,
        };
        let url = match element
            .select(&cite_sel)
            .next()
            .and_then(|link| link.attr("href").map(|href| href.to_string()))
        {
            Some(url) => url,
            None => continue,
        };
        if url.contains("javascript") {
            // Logged for diagnosis only; the result is still returned.
            warn!("javascript url encountered, full cite: {:?}", element.select(&cite_sel).collect::<Vec<_>>());
        }
        let title = title.text().collect::<Vec<_>>().join(" ");
        let description = element
            .select(&p_sel)
            .next()
            .map(|p| p.text().collect::<Vec<_>>().join(" "));
        final_results.push(BingSearchResult {
            url,
            title: Some(title),
            description,
        });
    }

    if final_results.is_empty() {
        debug!("no results, body: {body}");
    }
    Ok(final_results)
}
/// Runs a Bing image search and scrapes the result links out of the HTML.
///
/// `first` starts at 1 and goes up by multiples of 12:
/// `first = (page * 12) + 1`. It is sent as the `first` URL parameter.
///
/// Every `div>a` element that contains a `div>img` descendant and carries an
/// `href` attribute yields one [`BingImageResult`] whose `url` is that `href`.
/// Matching anchors without an `href` are skipped with a warning.
///
/// # Errors
///
/// * [`BingQueryError::CouldNotSendRequest`] if the HTTP request fails.
/// * [`BingQueryError::CouldNotDeserialize`] if the body cannot be read as
///   text.
///
/// # Panics
///
/// Panics if the proxy address cannot be parsed or the HTTP client / request
/// cannot be constructed.
pub async fn query_bing_images(query: String, first: usize) -> Result<Vec<BingImageResult>, BingQueryError> {
    let query = url_encoded_data::stringify(&[("q", query.as_str())]);

    // Build the HTTP client, routed through the next proxy when there is one.
    let selected_proxy = proxy().await;
    let mut builder = HttpClient::builder().redirect_policy(RedirectPolicy::Limit(10));
    let mut proxy_name = "none".to_string();
    if let Some(selected) = selected_proxy {
        builder = builder.proxy(Some(selected.address.parse().expect("BAD PROXY ADDR")));
        proxy_name = selected.address.clone();
        if let Some(credentials) = selected.credentials {
            builder = builder
                .proxy_authentication(Authentication::basic())
                .proxy_credentials(credentials);
        }
    }
    let client = builder.build().expect("FAILED TO CONSTRUCT HTTP CLIENT");

    let request_url = format!("https://www.bing.com/images/search?{query}&first={first}");
    let request = Request::get(&request_url)
        .header("user-agent", "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaX7-00/021.004; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/533.4 (KHTML, like Gecko) NokiaBrowser/7.3.1.21 Mobile Safari/533.4 3gpp-gba")
        .header("accept-language", "en-US,en;q=0.5")
        .body(())
        .unwrap();
    let mut response = client.send_async(request).await.map_err(|e| {
        error!("FAILED TO SEND REQUEST TO BING: {e} ({query}) (proxy: {proxy_name})");
        BingQueryError::CouldNotSendRequest
    })?;
    let body = response.text().await.map_err(|e| {
        error!("COULD NOT READ BING RESPONSE TEXT: {e}");
        BingQueryError::CouldNotDeserialize
    })?;

    // Synchronous parsing from here on; nothing is awaited while the parsed
    // document is alive.
    let document = Html::parse_document(body.as_str());
    let link_sel = Selector::parse("div>a").unwrap();
    let img_sel = Selector::parse("div>img").unwrap();

    let mut final_results = vec![];
    for anchor in document.select(&link_sel) {
        // Only anchors that wrap an image count as image results.
        if anchor.select(&img_sel).next().is_none() {
            continue;
        }
        match anchor.attr("href") {
            Some(href) => {
                final_results.push(BingImageResult {
                    url: href.to_string(),
                });
            }
            None => {
                warn!("no href in image result");
            }
        }
    }

    if final_results.is_empty() {
        debug!("no results, body: {body}");
    }
    Ok(final_results)
}