initial work on rewriting stuff; awaiting changes
All checks were successful
/ build-all-services (push) Successful in 9m21s

crawlers need to be storing which links link where for
pageranking to work; looks like we have to start over on crawling! :D
This commit is contained in:
husky 2025-03-14 18:00:37 -07:00
parent 56d07057c9
commit 0a7db89373
13 changed files with 1595 additions and 140 deletions

19
Cargo.lock generated
View file

@ -4276,6 +4276,25 @@ version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]]
name = "searchservice"
version = "0.2.0"
dependencies = [
"asklyphe-common",
"async-nats",
"async-recursion",
"chrono",
"env_logger 0.10.2",
"futures",
"log",
"once_cell",
"rand 0.8.5",
"rmp-serde",
"serde",
"tokio",
"ulid",
]
[[package]]
name = "security-framework"
version = "2.11.1"

View file

@ -1,2 +1,2 @@
[workspace]
members = ["asklyphe-common", "asklyphe-frontend", "asklyphe-auth-frontend", "unit_converter", "authservice", "authservice/migration", "authservice/entity", "bingservice", "googleservice", "vorebot", "lyphedb", "lyphedb/ldbtesttool"]
members = ["asklyphe-common", "asklyphe-frontend", "asklyphe-auth-frontend", "unit_converter", "authservice", "authservice/migration", "authservice/entity", "bingservice", "googleservice", "vorebot", "lyphedb", "lyphedb/ldbtesttool", "searchservice"]

View file

@ -176,3 +176,35 @@ pub async fn get_website(db: &DBConn, url: &str) -> Result<WebsiteData, ()> {
}
}
}
/// Ask lyphedb how many keys live under the sitestore directory.
///
/// Sends a `CountKeys` command over the shared connection and returns
/// `Ok(count)` on a `Count` reply. Every other reply kind is logged and
/// collapsed into `Err(())` so callers only see success/failure.
pub async fn count_websites(db: &DBConn) -> Result<u64, ()> {
    let path = construct_path(&[SITESTORE]).as_bytes().to_vec();
    let request = LDBNatsMessage::Command(LypheDBCommand::CountKeys(KeyDirectory { key: path }));
    let reply = db.query(request).await;
    match reply {
        LDBNatsMessage::Count(count) => Ok(count),
        LDBNatsMessage::Entries(_) => {
            warn!("lyphedb responded with \"entries\" to count_websites, treating as error");
            Err(())
        }
        LDBNatsMessage::Success => {
            warn!("lyphedb responded with \"success\" to count_websites, treating as error");
            Err(())
        }
        LDBNatsMessage::BadRequest => {
            error!("bad request for count_websites");
            Err(())
        }
        LDBNatsMessage::NotFound => {
            warn!("not found for count_websites");
            Err(())
        }
        _ => {
            warn!("lyphedb sent weird message as response, treating as error");
            Err(())
        }
    }
}

View file

@ -1,6 +1,6 @@
[package]
name = "searchservice"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
license = "AGPL-3"
license-file = "LICENSE"
@ -8,7 +8,7 @@ license-file = "LICENSE"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
asklyphe-common = { path = "../asklyphe-common", features = ["foundationdb"] }
asklyphe-common = { path = "../asklyphe-common" }
tokio = { version = "1.0", features = ["full"] }
serde = { version = "1.0", features = ["derive"] }
rmp-serde = "1.1.2"

View file

@ -12,16 +12,13 @@
*/
use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::ops::Mul;
use std::hash::{DefaultHasher, Hasher};
use async_recursion::async_recursion;
use once_cell::sync::Lazy;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
use log::{debug, info};
use rand::Rng;
use asklyphe_common::db;
use asklyphe_common::db::{STRHASH};
use asklyphe_common::foundationdb::Database;
use log::*;
use asklyphe_common::ldb::DBConn;
use asklyphe_common::nats::searchservice;
pub struct SearchResult {
@ -29,56 +26,20 @@ pub struct SearchResult {
pub title: Option<String>,
pub description: Option<String>,
pub url_contains_result: Option<usize>,
pub word_occurs: f64,
pub authorword_occurs: f64,
pub descriptionword_occurs: f64,
pub keyword_occurs: f64,
pub sitename_occurs: f64,
pub linkword_occurs: f64,
pub pagerank: f64,
pub relavence: f64,
pub words_contained: BTreeSet<usize>,
pub total_query_words: f64,
pub words: BTreeMap<usize, usize>,
pub closest_match: f64,
pub phrase_match_count: f64,
pub match_acc: f64,
pub match_c: f64,
pub highest_match: f64,
}
// Relevance scoring for one search hit. (The "relavence" spelling is part of
// the existing API surface and is kept as-is.)
impl SearchResult {
// Combine per-field occurrence scores and pagerank into a single ranking
// number. `no_separator_flag` is set when the query text came from a script
// written without word separators (see hacks.rs); those queries skip the
// per-field weighting and rely on match accuracy + phrase matches alone.
pub fn relavence(&self, no_separator_flag: bool) -> f64 {
if !no_separator_flag {
// Weighted sum of where the query words occurred. Link words carry the
// largest weight (2.9) and keywords the smallest (0.5); pagerank enters
// as pagerank^4. The .max(...) floors keep the product from collapsing
// to zero when individual terms vanish.
((self.word_occurs * 1.3) + (self.descriptionword_occurs * 1.2) + (self.keyword_occurs * 0.5) + (self.authorword_occurs * 1.2) + (self.sitename_occurs * 1.3) + (self.linkword_occurs * 2.9) + (self.pagerank.powi(4)).max(0.001))
.max(0.01)
*
// Multi-word queries get a second factor rewarding (a) high match
// accuracy relative to the best match seen, (b) covering many of the
// query words, and (c) exact phrase matches (phrase_match_count^5
// dominates this factor).
if self.total_query_words > 1.0 {
((1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
+ if !self.words_contained.is_empty() { (self.total_query_words / self.words_contained.len() as f64).max(0.002) } else { 1.0 }).max(0.001)
* self.phrase_match_count.max(0.02).powi(5)
} else {
1.0
}
} else {
// Separator-less scripts: per-field weights are skipped entirely;
// score on match accuracy with phrase matches weighted even more
// heavily (^8) than in the normal path.
if self.total_query_words > 1.0 {
(1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
* self.phrase_match_count.max(0.02).powi(8)
} else {
1.0
}
}
}
}
// Local replacement for the string-hash type the old db layer provided; it is
// just the u64 produced by `hash()` below.
type STRHASH = u64;
// Process-wide memoization caches, all keyed by STRHASH. They only ever grow
// — NOTE(review): confirm the service's lifetime/restart cadence makes
// unbounded growth acceptable.
// Pageranks fully computed during this run.
pub static PRECALCED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
// Pageranks loaded from precalculated values stored in the database.
pub static CACHED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
// Visited-sets used while walking the link graph — NOTE(review): the distinct
// roles of these two sets are not visible in this chunk; verify in pagerank().
pub static VISITED_PAGERANKS: Lazy<Mutex<BTreeSet<STRHASH>>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
pub static VISITED_PAGERANKS2: Lazy<Mutex<BTreeSet<STRHASH>>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
// string -> hash and hash -> string memoization for db lookups.
pub static HASH_CACHE: Lazy<HashCache<String, STRHASH>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
pub static UNHASH_CACHE: Lazy<HashCache<STRHASH, String>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
// Page title / description caches; `None` entries are re-queried (see
// cached_title / cached_desc below).
pub static TITLE_CACHE: Lazy<HashCache<STRHASH, Option<String>>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
pub static DESC_CACHE: Lazy<HashCache<STRHASH, Option<String>>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
/// Hash a string into a STRHASH with the std `DefaultHasher`.
/// Deterministic within a given toolchain, but the std hash algorithm is not
/// guaranteed stable across Rust versions — NOTE(review): confirm these
/// hashes never outlive a single binary (e.g. are not persisted).
pub fn hash(data: &str) -> STRHASH {
    let mut hasher = DefaultHasher::new();
    hasher.write(data.as_bytes());
    hasher.finish()
}
pub struct HashCache<K: Ord, V: Clone> {
inner: Arc<Mutex<BTreeMap<K, V>>>,
@ -122,44 +83,12 @@ impl<K: Ord, V: Clone> HashCache<K, V> {
}
}
// Memoized string -> STRHASH lookup: consult HASH_CACHE first, otherwise fall
// back to the db (`foa_strhash` — presumably "find or allocate"; verify
// against asklyphe_common::db).
pub async fn cached_hash(db: &Database, key: String) -> STRHASH {
HASH_CACHE.entry(key.clone()).or_insert(db::foa_strhash(db, &key)).await
}
// Memoized reverse lookup: STRHASH -> original string, hitting the db only on
// a cache miss.
pub async fn cached_unhash(db: &Database, key: STRHASH) -> String {
UNHASH_CACHE.entry(key).or_insert(db::unhash(db, key)).await
}
/// Memoized page-title lookup. A cached `None` is not trusted: when the cache
/// yields `None`, the db is re-queried unconditionally so a title that shows
/// up later can replace the cached miss.
pub async fn cached_title(db: &Database, key: STRHASH) -> Option<String> {
    match TITLE_CACHE.entry(key).or_insert(db::info_title(db, key)).await {
        Some(title) => Some(title),
        None => TITLE_CACHE.unconditional(key).or_insert(db::info_title(db, key)).await,
    }
}
/// Memoized page-description lookup; mirrors `cached_title`. A cached `None`
/// triggers an unconditional re-query so later-arriving descriptions can
/// replace the cached miss.
pub async fn cached_desc(db: &Database, key: STRHASH) -> Option<String> {
    match DESC_CACHE.entry(key).or_insert(db::info_description(db, key)).await {
        Some(desc) => Some(desc),
        None => DESC_CACHE.unconditional(key).or_insert(db::info_description(db, key)).await,
    }
}
#[async_recursion]
pub async fn pagerank(db: &Database, url: STRHASH) -> f64 {
if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
pub async fn pagerank(db: &DBConn, url: &str) -> f64 {
let uhash = hash(url);
if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&uhash) {
return *precalc;
}
if let Some(precalc_db) = db::page_pagerank(db, url).await {
//debug!("url {} in db {}", url, precalc_db);
if precalc_db == 0.0 {
//debug!("but 0 ):");
} else {
CACHED_PAGERANKS.lock().unwrap().insert(url, precalc_db);
return precalc_db;
}
}
let mut accum = 0.0;
let incoming = {
db::page_links_entering(db, url).await

View file

@ -27,11 +27,10 @@ use async_nats::jetstream::consumer::PullConsumer;
use async_nats::jetstream::stream::RetentionPolicy;
use futures::StreamExt;
use log::{error, info, warn};
use once_cell::sync::Lazy;
use once_cell::sync::{Lazy, OnceCell};
use tokio::sync::Mutex;
use tokio::task::JoinHandle;
use asklyphe_common::{db, foundationdb};
use asklyphe_common::foundationdb::Database;
use asklyphe_common::ldb::DBConn;
use asklyphe_common::nats::comms;
use asklyphe_common::nats::comms::{Service, ServiceResponse};
use asklyphe_common::nats::searchservice::SearchSrvcResponse;
@ -39,13 +38,16 @@ use asklyphe_common::nats::searchservice::SearchSrvcResponse;
// Connection configuration, read lazily from the environment; each panics
// with its message if the variable is unset.
pub static NATS_URL: Lazy<String> = Lazy::new(|| std::env::var("NATS_URL").expect("NO NATS DEFINED"));
pub static NATS_CERT: Lazy<String> = Lazy::new(|| std::env::var("NATS_CERT").expect("NO NATS_CERT DEFINED"));
pub static NATS_KEY: Lazy<String> = Lazy::new(|| std::env::var("NATS_KEY").expect("NO NATS_KEY DEFINED"));
// lyphedb database name, used when constructing the shared DBConn in main().
pub static DB_NAME: Lazy<String> =
Lazy::new(|| std::env::var("DB_NAME").expect("PLEASE ADD DB_NAME"));
// Total requests handled; used for periodic request-count logging.
pub static PROCESSES_HANDLED: AtomicU64 = AtomicU64::new(0);
// Unix timestamp of the most recently received message — NOTE(review): only
// the store side is visible in this chunk; confirm a liveness reader exists.
pub static LAST_MESSAGE: AtomicI64 = AtomicI64::new(0);
// Shared lyphedb connection, set once in main() and cloned per request.
static DB_CONN: OnceCell<DBConn> = OnceCell::new();
#[tokio::main]
async fn main() {
let _guard = unsafe { foundationdb::boot() };
env_logger::init();
info!("searchservice began at {}", chrono::Utc::now().to_string());
let nats = async_nats::ConnectOptions::new()
@ -57,12 +59,14 @@ async fn main() {
return;
}
let nats = nats.unwrap();
let dbconn = DBConn::new(nats.clone(), DB_NAME.to_string());
let _ = DB_CONN.set(dbconn);
let nats = jetstream::new(nats);
info!("searchbot spawned");
async fn on_recv(query: comms::Query) -> ServiceResponse {
let db = Database::default().expect("couldn't connect to foundation db!");
let db = DB_CONN.get().unwrap().clone();
let now = chrono::Utc::now().timestamp();
LAST_MESSAGE.store(now, Ordering::Relaxed);
@ -85,7 +89,4 @@ async fn main() {
if let Err(e) = comms::subscribe_service(Service::SearchService, &nats, Arc::new(on_recv)).await {
error!("failed to subscribe to searchservice nats! reason {:?}", e);
}
// we will respawn tasks if they crash
drop(_guard);
}

View file

@ -16,15 +16,12 @@ use async_nats::{jetstream, Subject};
use log::{debug, error, warn};
use tokio::sync::Mutex;
use futures::StreamExt;
use asklyphe_common::db::tables::{INFO_TABLE, WEBSITE_TABLE};
use asklyphe_common::foundationdb::{Database, KeySelector, RangeOption};
use asklyphe_common::foundationdb::options::{StreamingMode, TransactionOption};
use asklyphe_common::foundationdb::tuple::{pack, Subspace};
use asklyphe_common::ldb::{sitestore, DBConn};
use asklyphe_common::nats::searchservice;
use asklyphe_common::nats::searchservice::{SearchSrvcQuery, SearchSrvcRequest, SearchSrvcResponse, SearchRequest, SiteCountResponse, SearchResponse};
use crate::{algorithm, hacks};
pub async fn process(query: SearchSrvcQuery, db: Database) -> SearchSrvcResponse {
pub async fn process(query: SearchSrvcQuery, db: DBConn) -> SearchSrvcResponse {
// a future is used so that the whole program doesn't die if an algorithm panics
let response = tokio::spawn(async move {
match query.request {
@ -45,7 +42,7 @@ pub async fn process(query: SearchSrvcQuery, db: Database) -> SearchSrvcResponse
}
}
pub async fn search_request(req: SearchRequest, db: &Database) -> SearchSrvcResponse {
pub async fn search_request(req: SearchRequest, db: &DBConn) -> SearchSrvcResponse {
let words_initial: Vec<String> = req.query.split_whitespace().map(|s| s.to_string()).collect();
let mut words = vec![];
let mut no_separator_flag = false;
@ -75,44 +72,12 @@ pub async fn search_request(req: SearchRequest, db: &Database) -> SearchSrvcResp
}
}
pub async fn count_websites(db: &Database) -> SearchSrvcResponse {
let mut counter: u64 = 0;
let subspace = Subspace::from(WEBSITE_TABLE);
let mut last_key = subspace.range().0;
let final_key = subspace.range().1;
for _failsafe in 0..10000 {
let trx = db.create_trx();
if let Err(e) = trx {
error!("DATABASE ERROR page_links_exiting_count {e}");
} else {
let trx = trx.unwrap();
// link -> from -> *
let mut range = RangeOption::from((last_key.clone(), final_key.clone()));
range.mode = StreamingMode::Iterator;
range.limit = Some(10096);
let mut stream = trx.get_ranges_keyvalues(range, true);
let mut this_time = 0;
while let Some(kv) = stream.next().await {
if let Ok(kv) = kv {
counter += 1;
this_time += 1;
last_key = kv.key().to_vec();
} else if let Err(e) = kv {
eprintln!("err while counting {e}");
}
}
if this_time <= 10 {
return SearchSrvcResponse::SiteCountResponse(SiteCountResponse {
count: counter / 8,
});
}
}
/// Handle a `SiteCountRequest`: ask the sitestore for the number of crawled
/// websites and wrap it in a nats response.
///
/// Returns `OtherError` when the underlying count fails.
pub async fn count_websites(db: &DBConn) -> SearchSrvcResponse {
    // Bug fix: the previous text left an unconditional `OtherError` expression
    // after the if/else, so the if/else value was discarded (a type error as a
    // non-unit statement) and every call would have produced an error. The
    // if/else is now the function's tail expression.
    if let Ok(count) = sitestore::count_websites(db).await {
        SearchSrvcResponse::SiteCountResponse(SiteCountResponse {
            count,
        })
    } else {
        SearchSrvcResponse::OtherError("couldn't retrieve count ):".to_string())
    }
}

2
searchservice_old/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/target
.idea

View file

@ -0,0 +1,23 @@
[package]
name = "searchservice"
version = "0.1.0"
edition = "2021"
license = "AGPL-3"
license-file = "LICENSE"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
asklyphe-common = { path = "../asklyphe-common", features = ["foundationdb"] }
tokio = { version = "1.0", features = ["full"] }
serde = { version = "1.0", features = ["derive"] }
rmp-serde = "1.1.2"
async-nats = "0.38.0"
chrono = "0.4.26"
once_cell = "1.18.0"
ulid = "1.0.0"
rand = "0.8.5"
async-recursion = "1.0.5"
futures = "0.3.30"
log = "0.4.20"
env_logger = "0.10.2"

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,62 @@
/*
* searchservice hacks.rs
* - awful awful solutions to our issues
*
* Copyright (C) 2025 Real Microsoft, LLC
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::ops::{ RangeInclusive};
// Unicode block ranges for scripts conventionally written without spaces
// between words. One range per Unicode block.
const BURMESE_RANGE: RangeInclusive<char> = '\u{1000}'..='\u{104f}';
// CJK Unified Ideographs plus extensions and compatibility ideographs.
const CHINESE_RANGE1: RangeInclusive<char> = '\u{4e00}'..='\u{9fff}';
const CHINESE_RANGE2: RangeInclusive<char> = '\u{3400}'..='\u{4dbf}';
const CHINESE_RANGE3: RangeInclusive<char> = '\u{20000}'..='\u{2a6df}';
const CHINESE_RANGE4: RangeInclusive<char> = '\u{2A700}'..='\u{2B73F}';
const CHINESE_RANGE5: RangeInclusive<char> = '\u{2B740}'..='\u{2B81F}';
const CHINESE_RANGE6: RangeInclusive<char> = '\u{2B820}'..='\u{2CEAF}';
const CHINESE_RANGE7: RangeInclusive<char> = '\u{2CEB0}'..='\u{2EBEF}';
const CHINESE_RANGE8: RangeInclusive<char> = '\u{30000}'..='\u{3134F}';
const CHINESE_RANGE9: RangeInclusive<char> = '\u{31350}'..='\u{323AF}';
const CHINESE_RANGE10: RangeInclusive<char> = '\u{2EBF0}'..='\u{2EE5F}';
const CHINESE_RANGE11: RangeInclusive<char> = '\u{F900}'..='\u{FAFF}';
/// HIRAGANA
const JAPANESE_RANGE1: RangeInclusive<char> = '\u{3040}'..='\u{309F}';
/// KATAKANA
const JAPANESE_RANGE2: RangeInclusive<char> = '\u{30A0}'..='\u{30FF}';
const JAVANESE_RANGE: RangeInclusive<char> = '\u{A980}'..='\u{A9DF}';
const KHMER_RANGE1: RangeInclusive<char> = '\u{1780}'..='\u{17FF}';
const KHMER_RANGE2: RangeInclusive<char> = '\u{19E0}'..='\u{19FF}';
const LAO_RANGE: RangeInclusive<char> = '\u{0E80}'..='\u{0EFF}';
const PHAGSPA_RANGE: RangeInclusive<char> = '\u{A840}'..='\u{A87F}';
const TAITHAM_RANGE: RangeInclusive<char> = '\u{1A20}'..='\u{1AAF}';
// BUG FIX: the end bound was '\u{E07F}' (missing leading zero), which made
// this range span U+0E00..=U+E07F — most of the BMP, including all of CJK
// and Hangul — so unrelated scripts were misclassified as separator-less.
// The Thai Unicode block is U+0E00..=U+0E7F.
const THAI_RANGE: RangeInclusive<char> = '\u{0E00}'..='\u{0E7F}';
const TIBETAN_RANGE: RangeInclusive<char> = '\u{0F00}'..='\u{0FFF}';
// All separator-less ranges in one table. (Typo fixed: BOUNDRIES ->
// BOUNDARIES; the const is private to this file.)
const NO_WORD_BOUNDARIES: &[RangeInclusive<char>] = &[
    BURMESE_RANGE,
    CHINESE_RANGE1, CHINESE_RANGE2, CHINESE_RANGE3, CHINESE_RANGE4, CHINESE_RANGE5, CHINESE_RANGE6, CHINESE_RANGE7, CHINESE_RANGE8, CHINESE_RANGE9, CHINESE_RANGE10, CHINESE_RANGE11,
    JAPANESE_RANGE1, JAPANESE_RANGE2,
    JAVANESE_RANGE,
    KHMER_RANGE1, KHMER_RANGE2,
    LAO_RANGE,
    PHAGSPA_RANGE,
    TAITHAM_RANGE,
    THAI_RANGE,
    TIBETAN_RANGE,
];
/// Returns true if any character of `str` belongs to a script that does not
/// separate words with spaces, in which case the caller should tokenize the
/// text per-character instead of per-whitespace-token.
pub fn is_from_language_that_doesnt_use_word_separators(str: &str) -> bool {
    str.chars()
        .any(|c| NO_WORD_BOUNDARIES.iter().any(|range| range.contains(&c)))
}

View file

@ -0,0 +1,91 @@
/*
* searchservice main.rs
* - entrypoint for the asklyphe search service
*
* Copyright (C) 2025 Real Microsoft, LLC
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
pub mod algorithm;
pub mod hacks;
mod process;
use std::cmp::max;
use std::future::Future;
use std::io::Read;
use std::string::ToString;
use std::sync::Arc;
use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
use std::time::Duration;
use async_nats::jetstream;
use async_nats::jetstream::consumer::PullConsumer;
use async_nats::jetstream::stream::RetentionPolicy;
use futures::StreamExt;
use log::{error, info, warn};
use once_cell::sync::Lazy;
use tokio::sync::Mutex;
use tokio::task::JoinHandle;
use asklyphe_common::{db, foundationdb};
use asklyphe_common::foundationdb::Database;
use asklyphe_common::nats::comms;
use asklyphe_common::nats::comms::{Service, ServiceResponse};
use asklyphe_common::nats::searchservice::SearchSrvcResponse;
// NATS connection settings, read lazily from the environment; each panics
// with its message when the variable is unset.
pub static NATS_URL: Lazy<String> = Lazy::new(|| std::env::var("NATS_URL").expect("NO NATS DEFINED"));
pub static NATS_CERT: Lazy<String> = Lazy::new(|| std::env::var("NATS_CERT").expect("NO NATS_CERT DEFINED"));
pub static NATS_KEY: Lazy<String> = Lazy::new(|| std::env::var("NATS_KEY").expect("NO NATS_KEY DEFINED"));
// Total requests handled; read in main() for the every-100-requests log line.
pub static PROCESSES_HANDLED: AtomicU64 = AtomicU64::new(0);
// Unix timestamp of the most recently received message — NOTE(review): only
// the store side is visible here; confirm a liveness reader exists.
pub static LAST_MESSAGE: AtomicI64 = AtomicI64::new(0);
#[tokio::main]
// Entrypoint: boot foundationdb, connect to NATS with client-cert auth, then
// serve searchservice queries until the subscription loop ends.
async fn main() {
// SAFETY: foundationdb must be booted once before any Database use, and the
// returned guard must outlive all db handles; it is dropped at the very end.
let _guard = unsafe { foundationdb::boot() };
env_logger::init();
info!("searchservice began at {}", chrono::Utc::now().to_string());
// Mutual-TLS NATS connection; a failure here is fatal for the service.
let nats = async_nats::ConnectOptions::new()
.add_client_certificate(NATS_CERT.as_str().into(), NATS_KEY.as_str().into())
.connect(NATS_URL.as_str())
.await;
if let Err(e) = nats {
error!("FATAL ERROR, COULDN'T CONNECT TO NATS: {}", e);
return;
}
let nats = nats.unwrap();
let nats = jetstream::new(nats);
info!("searchbot spawned");
// Per-message handler: each request opens a fresh foundationdb handle and is
// routed through process::process; non-searchservice queries are rejected.
async fn on_recv(query: comms::Query) -> ServiceResponse {
let db = Database::default().expect("couldn't connect to foundation db!");
// Record receipt time so a stalled service can be detected via LAST_MESSAGE.
let now = chrono::Utc::now().timestamp();
LAST_MESSAGE.store(now, Ordering::Relaxed);
let response = match query {
comms::Query::SearchService(query) => {
process::process(query, db).await
}
_ => {
SearchSrvcResponse::InvalidRequest
}
};
// Throughput logging every 100 handled requests (counter incremented below,
// so the first request logs "handled 0").
if PROCESSES_HANDLED.load(Ordering::Relaxed) % 100 == 0 {
info!("handled {} requests!", PROCESSES_HANDLED.load(Ordering::Relaxed));
}
PROCESSES_HANDLED.fetch_add(1, Ordering::Relaxed);
ServiceResponse::SearchService(response)
}
// Blocks for the lifetime of the subscription (normally forever).
if let Err(e) = comms::subscribe_service(Service::SearchService, &nats, Arc::new(on_recv)).await {
error!("failed to subscribe to searchservice nats! reason {:?}", e);
}
// we will respawn tasks if they crash
drop(_guard);
}

View file

@ -0,0 +1,118 @@
/*
* searchservice process.rs
* - route incoming nats requests to their specific functions
*
* Copyright (C) 2025 Real Microsoft, LLC
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::sync::Arc;
use async_nats::{jetstream, Subject};
use log::{debug, error, warn};
use tokio::sync::Mutex;
use futures::StreamExt;
use asklyphe_common::db::tables::{INFO_TABLE, WEBSITE_TABLE};
use asklyphe_common::foundationdb::{Database, KeySelector, RangeOption};
use asklyphe_common::foundationdb::options::{StreamingMode, TransactionOption};
use asklyphe_common::foundationdb::tuple::{pack, Subspace};
use asklyphe_common::nats::searchservice;
use asklyphe_common::nats::searchservice::{SearchSrvcQuery, SearchSrvcRequest, SearchSrvcResponse, SearchRequest, SiteCountResponse, SearchResponse};
use crate::{algorithm, hacks};
/// Dispatch an incoming searchservice query to its handler.
///
/// The handler runs in a spawned task so that a panic inside an algorithm is
/// surfaced as a `JoinError` and reported as `OtherError`, instead of taking
/// the whole service down.
pub async fn process(query: SearchSrvcQuery, db: Database) -> SearchSrvcResponse {
    let outcome = tokio::spawn(async move {
        match query.request {
            SearchSrvcRequest::SearchRequest(req) => search_request(req, &db).await,
            SearchSrvcRequest::SiteCountRequest => count_websites(&db).await,
        }
    })
    .await;
    match outcome {
        Ok(response) => response,
        Err(join_error) => SearchSrvcResponse::OtherError(join_error.to_string()),
    }
}
/// Tokenize the query text and run the search algorithm.
///
/// Whitespace-separated tokens containing characters from a script written
/// without word separators (see hacks.rs) are split into one token per
/// character, and `no_separator_flag` is passed along so scoring can adapt.
/// A `None` from the algorithm becomes an empty `SearchResponse`.
pub async fn search_request(req: SearchRequest, db: &Database) -> SearchSrvcResponse {
    let mut words: Vec<String> = Vec::new();
    let mut no_separator_flag = false;
    for token in req.query.split_whitespace() {
        if hacks::is_from_language_that_doesnt_use_word_separators(token) {
            no_separator_flag = true;
            words.extend(token.chars().map(|c| c.to_string()));
        } else {
            words.push(token.to_string());
        }
    }
    if let Some(results) = algorithm::search(db, words, no_separator_flag).await {
        SearchSrvcResponse::SearchResponse(results)
    } else {
        SearchSrvcResponse::SearchResponse(SearchResponse {
            results: vec![],
            blocked: vec![],
            pagerank_time_seconds: 0.0,
            total_query_seconds: 0.0,
            max_relevance: 0.0,
            exact_phrase_found: false,
        })
    }
}
// Estimate the number of crawled websites by paging over the WEBSITE_TABLE
// subspace and counting keys. The final count is divided by 8 — presumably 8
// keys are stored per website; verify against the crawler's write schema.
pub async fn count_websites(db: &Database) -> SearchSrvcResponse {
let mut counter: u64 = 0;
let subspace = Subspace::from(WEBSITE_TABLE);
// Paging cursor: last_key advances after each page so the next transaction
// resumes where the previous one stopped; final_key is the subspace end.
let mut last_key = subspace.range().0;
let final_key = subspace.range().1;
// Failsafe cap on page count so a paging bug can't loop forever.
for _failsafe in 0..10000 {
let trx = db.create_trx();
if let Err(e) = trx {
error!("DATABASE ERROR page_links_exiting_count {e}");
} else {
let trx = trx.unwrap();
// link -> from -> *
let mut range = RangeOption::from((last_key.clone(), final_key.clone()));
range.mode = StreamingMode::Iterator;
range.limit = Some(10096);
let mut stream = trx.get_ranges_keyvalues(range, true);
let mut this_time = 0;
while let Some(kv) = stream.next().await {
if let Ok(kv) = kv {
counter += 1;
this_time += 1;
last_key = kv.key().to_vec();
} else if let Err(e) = kv {
eprintln!("err while counting {e}");
}
}
// A short page is treated as the end of the subspace. NOTE(review): the
// threshold is <= 10 rather than 0, and resuming from an inclusive
// last_key re-counts one key per page — the result is an estimate,
// consistent with the /8 normalization above.
if this_time <= 10 {
return SearchSrvcResponse::SiteCountResponse(SiteCountResponse {
count: counter / 8,
});
}
}
}
// Only reached if the failsafe cap is exhausted without a short page.
SearchSrvcResponse::OtherError("couldn't retrieve count ):".to_string())
}