initial work on rewriting stuff; awaiting changes
All checks were successful
/ build-all-services (push) Successful in 9m21s
All checks were successful
/ build-all-services (push) Successful in 9m21s
crawlers need to be storing which links link where for pageranking to work; looks like we have to start over on crawling! :D
This commit is contained in:
parent
56d07057c9
commit
0a7db89373
13 changed files with 1595 additions and 140 deletions
19
Cargo.lock
generated
19
Cargo.lock
generated
|
@ -4276,6 +4276,25 @@ version = "4.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
|
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "searchservice"
|
||||||
|
version = "0.2.0"
|
||||||
|
dependencies = [
|
||||||
|
"asklyphe-common",
|
||||||
|
"async-nats",
|
||||||
|
"async-recursion",
|
||||||
|
"chrono",
|
||||||
|
"env_logger 0.10.2",
|
||||||
|
"futures",
|
||||||
|
"log",
|
||||||
|
"once_cell",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"rmp-serde",
|
||||||
|
"serde",
|
||||||
|
"tokio",
|
||||||
|
"ulid",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "security-framework"
|
name = "security-framework"
|
||||||
version = "2.11.1"
|
version = "2.11.1"
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
[workspace]
|
[workspace]
|
||||||
members = ["asklyphe-common", "asklyphe-frontend", "asklyphe-auth-frontend", "unit_converter", "authservice", "authservice/migration", "authservice/entity", "bingservice", "googleservice", "vorebot", "lyphedb", "lyphedb/ldbtesttool"]
|
members = ["asklyphe-common", "asklyphe-frontend", "asklyphe-auth-frontend", "unit_converter", "authservice", "authservice/migration", "authservice/entity", "bingservice", "googleservice", "vorebot", "lyphedb", "lyphedb/ldbtesttool", "searchservice"]
|
||||||
|
|
|
@ -176,3 +176,35 @@ pub async fn get_website(db: &DBConn, url: &str) -> Result<WebsiteData, ()> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn count_websites(db: &DBConn) -> Result<u64, ()> {
|
||||||
|
let key = construct_path(&[SITESTORE]).as_bytes().to_vec();
|
||||||
|
|
||||||
|
let cmd = LDBNatsMessage::Command(LypheDBCommand::CountKeys(KeyDirectory { key }));
|
||||||
|
|
||||||
|
match db.query(cmd).await {
|
||||||
|
LDBNatsMessage::Count(c) => {
|
||||||
|
Ok(c)
|
||||||
|
}
|
||||||
|
LDBNatsMessage::Entries(_) => {
|
||||||
|
warn!("lyphedb responded with \"entries\" to count_websites, treating as error");
|
||||||
|
Err(())
|
||||||
|
}
|
||||||
|
LDBNatsMessage::Success => {
|
||||||
|
warn!("lyphedb responded with \"success\" to count_websites, treating as error");
|
||||||
|
Err(())
|
||||||
|
}
|
||||||
|
LDBNatsMessage::BadRequest => {
|
||||||
|
error!("bad request for count_websites");
|
||||||
|
Err(())
|
||||||
|
}
|
||||||
|
LDBNatsMessage::NotFound => {
|
||||||
|
warn!("not found for count_websites");
|
||||||
|
Err(())
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
warn!("lyphedb sent weird message as response, treating as error");
|
||||||
|
Err(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "searchservice"
|
name = "searchservice"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
license = "AGPL-3"
|
license = "AGPL-3"
|
||||||
license-file = "LICENSE"
|
license-file = "LICENSE"
|
||||||
|
@ -8,7 +8,7 @@ license-file = "LICENSE"
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
asklyphe-common = { path = "../asklyphe-common", features = ["foundationdb"] }
|
asklyphe-common = { path = "../asklyphe-common" }
|
||||||
tokio = { version = "1.0", features = ["full"] }
|
tokio = { version = "1.0", features = ["full"] }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
rmp-serde = "1.1.2"
|
rmp-serde = "1.1.2"
|
||||||
|
|
|
@ -12,16 +12,13 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
use std::collections::{BTreeMap, BTreeSet, VecDeque};
|
use std::collections::{BTreeMap, BTreeSet, VecDeque};
|
||||||
use std::ops::Mul;
|
use std::hash::{DefaultHasher, Hasher};
|
||||||
use async_recursion::async_recursion;
|
use async_recursion::async_recursion;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
use log::{debug, info};
|
use log::*;
|
||||||
use rand::Rng;
|
use asklyphe_common::ldb::DBConn;
|
||||||
use asklyphe_common::db;
|
|
||||||
use asklyphe_common::db::{STRHASH};
|
|
||||||
use asklyphe_common::foundationdb::Database;
|
|
||||||
use asklyphe_common::nats::searchservice;
|
use asklyphe_common::nats::searchservice;
|
||||||
|
|
||||||
pub struct SearchResult {
|
pub struct SearchResult {
|
||||||
|
@ -29,56 +26,20 @@ pub struct SearchResult {
|
||||||
pub title: Option<String>,
|
pub title: Option<String>,
|
||||||
pub description: Option<String>,
|
pub description: Option<String>,
|
||||||
pub url_contains_result: Option<usize>,
|
pub url_contains_result: Option<usize>,
|
||||||
pub word_occurs: f64,
|
|
||||||
pub authorword_occurs: f64,
|
|
||||||
pub descriptionword_occurs: f64,
|
|
||||||
pub keyword_occurs: f64,
|
|
||||||
pub sitename_occurs: f64,
|
|
||||||
pub linkword_occurs: f64,
|
|
||||||
pub pagerank: f64,
|
pub pagerank: f64,
|
||||||
pub relavence: f64,
|
pub relavence: f64,
|
||||||
pub words_contained: BTreeSet<usize>,
|
|
||||||
pub total_query_words: f64,
|
|
||||||
pub words: BTreeMap<usize, usize>,
|
|
||||||
pub closest_match: f64,
|
|
||||||
pub phrase_match_count: f64,
|
|
||||||
pub match_acc: f64,
|
|
||||||
pub match_c: f64,
|
|
||||||
pub highest_match: f64,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SearchResult {
|
type STRHASH = u64;
|
||||||
pub fn relavence(&self, no_separator_flag: bool) -> f64 {
|
|
||||||
if !no_separator_flag {
|
|
||||||
((self.word_occurs * 1.3) + (self.descriptionword_occurs * 1.2) + (self.keyword_occurs * 0.5) + (self.authorword_occurs * 1.2) + (self.sitename_occurs * 1.3) + (self.linkword_occurs * 2.9) + (self.pagerank.powi(4)).max(0.001))
|
|
||||||
.max(0.01)
|
|
||||||
*
|
|
||||||
if self.total_query_words > 1.0 {
|
|
||||||
((1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
|
|
||||||
+ if !self.words_contained.is_empty() { (self.total_query_words / self.words_contained.len() as f64).max(0.002) } else { 1.0 }).max(0.001)
|
|
||||||
* self.phrase_match_count.max(0.02).powi(5)
|
|
||||||
} else {
|
|
||||||
1.0
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if self.total_query_words > 1.0 {
|
|
||||||
(1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
|
|
||||||
* self.phrase_match_count.max(0.02).powi(8)
|
|
||||||
} else {
|
|
||||||
1.0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub static PRECALCED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
|
pub static PRECALCED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
|
||||||
pub static CACHED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
|
pub static CACHED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
|
||||||
pub static VISITED_PAGERANKS: Lazy<Mutex<BTreeSet<STRHASH>>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
|
|
||||||
pub static VISITED_PAGERANKS2: Lazy<Mutex<BTreeSet<STRHASH>>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
|
pub fn hash(data: &str) -> STRHASH {
|
||||||
pub static HASH_CACHE: Lazy<HashCache<String, STRHASH>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
|
let mut hash = DefaultHasher::new();
|
||||||
pub static UNHASH_CACHE: Lazy<HashCache<STRHASH, String>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
|
hash.write(data.as_bytes());
|
||||||
pub static TITLE_CACHE: Lazy<HashCache<STRHASH, Option<String>>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
|
hash.finish()
|
||||||
pub static DESC_CACHE: Lazy<HashCache<STRHASH, Option<String>>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
|
}
|
||||||
|
|
||||||
pub struct HashCache<K: Ord, V: Clone> {
|
pub struct HashCache<K: Ord, V: Clone> {
|
||||||
inner: Arc<Mutex<BTreeMap<K, V>>>,
|
inner: Arc<Mutex<BTreeMap<K, V>>>,
|
||||||
|
@ -122,44 +83,12 @@ impl<K: Ord, V: Clone> HashCache<K, V> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn cached_hash(db: &Database, key: String) -> STRHASH {
|
|
||||||
HASH_CACHE.entry(key.clone()).or_insert(db::foa_strhash(db, &key)).await
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn cached_unhash(db: &Database, key: STRHASH) -> String {
|
|
||||||
UNHASH_CACHE.entry(key).or_insert(db::unhash(db, key)).await
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn cached_title(db: &Database, key: STRHASH) -> Option<String> {
|
|
||||||
if let Some(title) = TITLE_CACHE.entry(key).or_insert(db::info_title(db, key)).await {
|
|
||||||
Some(title)
|
|
||||||
} else {
|
|
||||||
TITLE_CACHE.unconditional(key).or_insert(db::info_title(db, key)).await
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn cached_desc(db: &Database, key: STRHASH) -> Option<String> {
|
|
||||||
if let Some(desc) = DESC_CACHE.entry(key).or_insert(db::info_description(db, key)).await {
|
|
||||||
Some(desc)
|
|
||||||
} else {
|
|
||||||
DESC_CACHE.unconditional(key).or_insert(db::info_description(db, key)).await
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_recursion]
|
#[async_recursion]
|
||||||
pub async fn pagerank(db: &Database, url: STRHASH) -> f64 {
|
pub async fn pagerank(db: &DBConn, url: &str) -> f64 {
|
||||||
if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
|
let uhash = hash(url);
|
||||||
|
if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&uhash) {
|
||||||
return *precalc;
|
return *precalc;
|
||||||
}
|
}
|
||||||
if let Some(precalc_db) = db::page_pagerank(db, url).await {
|
|
||||||
//debug!("url {} in db {}", url, precalc_db);
|
|
||||||
if precalc_db == 0.0 {
|
|
||||||
//debug!("but 0 ):");
|
|
||||||
} else {
|
|
||||||
CACHED_PAGERANKS.lock().unwrap().insert(url, precalc_db);
|
|
||||||
return precalc_db;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let mut accum = 0.0;
|
let mut accum = 0.0;
|
||||||
let incoming = {
|
let incoming = {
|
||||||
db::page_links_entering(db, url).await
|
db::page_links_entering(db, url).await
|
||||||
|
|
|
@ -27,11 +27,10 @@ use async_nats::jetstream::consumer::PullConsumer;
|
||||||
use async_nats::jetstream::stream::RetentionPolicy;
|
use async_nats::jetstream::stream::RetentionPolicy;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use log::{error, info, warn};
|
use log::{error, info, warn};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::{Lazy, OnceCell};
|
||||||
use tokio::sync::Mutex;
|
use tokio::sync::Mutex;
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
use asklyphe_common::{db, foundationdb};
|
use asklyphe_common::ldb::DBConn;
|
||||||
use asklyphe_common::foundationdb::Database;
|
|
||||||
use asklyphe_common::nats::comms;
|
use asklyphe_common::nats::comms;
|
||||||
use asklyphe_common::nats::comms::{Service, ServiceResponse};
|
use asklyphe_common::nats::comms::{Service, ServiceResponse};
|
||||||
use asklyphe_common::nats::searchservice::SearchSrvcResponse;
|
use asklyphe_common::nats::searchservice::SearchSrvcResponse;
|
||||||
|
@ -39,13 +38,16 @@ use asklyphe_common::nats::searchservice::SearchSrvcResponse;
|
||||||
pub static NATS_URL: Lazy<String> = Lazy::new(|| std::env::var("NATS_URL").expect("NO NATS DEFINED"));
|
pub static NATS_URL: Lazy<String> = Lazy::new(|| std::env::var("NATS_URL").expect("NO NATS DEFINED"));
|
||||||
pub static NATS_CERT: Lazy<String> = Lazy::new(|| std::env::var("NATS_CERT").expect("NO NATS_CERT DEFINED"));
|
pub static NATS_CERT: Lazy<String> = Lazy::new(|| std::env::var("NATS_CERT").expect("NO NATS_CERT DEFINED"));
|
||||||
pub static NATS_KEY: Lazy<String> = Lazy::new(|| std::env::var("NATS_KEY").expect("NO NATS_KEY DEFINED"));
|
pub static NATS_KEY: Lazy<String> = Lazy::new(|| std::env::var("NATS_KEY").expect("NO NATS_KEY DEFINED"));
|
||||||
|
pub static DB_NAME: Lazy<String> =
|
||||||
|
Lazy::new(|| std::env::var("DB_NAME").expect("PLEASE ADD DB_NAME"));
|
||||||
|
|
||||||
pub static PROCESSES_HANDLED: AtomicU64 = AtomicU64::new(0);
|
pub static PROCESSES_HANDLED: AtomicU64 = AtomicU64::new(0);
|
||||||
pub static LAST_MESSAGE: AtomicI64 = AtomicI64::new(0);
|
pub static LAST_MESSAGE: AtomicI64 = AtomicI64::new(0);
|
||||||
|
|
||||||
|
static DB_CONN: OnceCell<DBConn> = OnceCell::new();
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
let _guard = unsafe { foundationdb::boot() };
|
|
||||||
env_logger::init();
|
env_logger::init();
|
||||||
info!("searchservice began at {}", chrono::Utc::now().to_string());
|
info!("searchservice began at {}", chrono::Utc::now().to_string());
|
||||||
let nats = async_nats::ConnectOptions::new()
|
let nats = async_nats::ConnectOptions::new()
|
||||||
|
@ -57,12 +59,14 @@ async fn main() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
let nats = nats.unwrap();
|
let nats = nats.unwrap();
|
||||||
|
let dbconn = DBConn::new(nats.clone(), DB_NAME.to_string());
|
||||||
|
let _ = DB_CONN.set(dbconn);
|
||||||
let nats = jetstream::new(nats);
|
let nats = jetstream::new(nats);
|
||||||
|
|
||||||
info!("searchbot spawned");
|
info!("searchbot spawned");
|
||||||
|
|
||||||
async fn on_recv(query: comms::Query) -> ServiceResponse {
|
async fn on_recv(query: comms::Query) -> ServiceResponse {
|
||||||
let db = Database::default().expect("couldn't connect to foundation db!");
|
let db = DB_CONN.get().unwrap().clone();
|
||||||
let now = chrono::Utc::now().timestamp();
|
let now = chrono::Utc::now().timestamp();
|
||||||
LAST_MESSAGE.store(now, Ordering::Relaxed);
|
LAST_MESSAGE.store(now, Ordering::Relaxed);
|
||||||
|
|
||||||
|
@ -85,7 +89,4 @@ async fn main() {
|
||||||
if let Err(e) = comms::subscribe_service(Service::SearchService, &nats, Arc::new(on_recv)).await {
|
if let Err(e) = comms::subscribe_service(Service::SearchService, &nats, Arc::new(on_recv)).await {
|
||||||
error!("failed to subscribe to searchservice nats! reason {:?}", e);
|
error!("failed to subscribe to searchservice nats! reason {:?}", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
// we will respawn tasks if they crash
|
|
||||||
drop(_guard);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,15 +16,12 @@ use async_nats::{jetstream, Subject};
|
||||||
use log::{debug, error, warn};
|
use log::{debug, error, warn};
|
||||||
use tokio::sync::Mutex;
|
use tokio::sync::Mutex;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use asklyphe_common::db::tables::{INFO_TABLE, WEBSITE_TABLE};
|
use asklyphe_common::ldb::{sitestore, DBConn};
|
||||||
use asklyphe_common::foundationdb::{Database, KeySelector, RangeOption};
|
|
||||||
use asklyphe_common::foundationdb::options::{StreamingMode, TransactionOption};
|
|
||||||
use asklyphe_common::foundationdb::tuple::{pack, Subspace};
|
|
||||||
use asklyphe_common::nats::searchservice;
|
use asklyphe_common::nats::searchservice;
|
||||||
use asklyphe_common::nats::searchservice::{SearchSrvcQuery, SearchSrvcRequest, SearchSrvcResponse, SearchRequest, SiteCountResponse, SearchResponse};
|
use asklyphe_common::nats::searchservice::{SearchSrvcQuery, SearchSrvcRequest, SearchSrvcResponse, SearchRequest, SiteCountResponse, SearchResponse};
|
||||||
use crate::{algorithm, hacks};
|
use crate::{algorithm, hacks};
|
||||||
|
|
||||||
pub async fn process(query: SearchSrvcQuery, db: Database) -> SearchSrvcResponse {
|
pub async fn process(query: SearchSrvcQuery, db: DBConn) -> SearchSrvcResponse {
|
||||||
// a future is used so that the whole program doesn't die if an algorithm panics
|
// a future is used so that the whole program doesn't die if an algorithm panics
|
||||||
let response = tokio::spawn(async move {
|
let response = tokio::spawn(async move {
|
||||||
match query.request {
|
match query.request {
|
||||||
|
@ -45,7 +42,7 @@ pub async fn process(query: SearchSrvcQuery, db: Database) -> SearchSrvcResponse
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn search_request(req: SearchRequest, db: &Database) -> SearchSrvcResponse {
|
pub async fn search_request(req: SearchRequest, db: &DBConn) -> SearchSrvcResponse {
|
||||||
let words_initial: Vec<String> = req.query.split_whitespace().map(|s| s.to_string()).collect();
|
let words_initial: Vec<String> = req.query.split_whitespace().map(|s| s.to_string()).collect();
|
||||||
let mut words = vec![];
|
let mut words = vec![];
|
||||||
let mut no_separator_flag = false;
|
let mut no_separator_flag = false;
|
||||||
|
@ -75,44 +72,12 @@ pub async fn search_request(req: SearchRequest, db: &Database) -> SearchSrvcResp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn count_websites(db: &Database) -> SearchSrvcResponse {
|
pub async fn count_websites(db: &DBConn) -> SearchSrvcResponse {
|
||||||
let mut counter: u64 = 0;
|
if let Ok(count) = sitestore::count_websites(db).await {
|
||||||
let subspace = Subspace::from(WEBSITE_TABLE);
|
SearchSrvcResponse::SiteCountResponse(SiteCountResponse {
|
||||||
let mut last_key = subspace.range().0;
|
count,
|
||||||
let final_key = subspace.range().1;
|
})
|
||||||
|
|
||||||
for _failsafe in 0..10000 {
|
|
||||||
let trx = db.create_trx();
|
|
||||||
if let Err(e) = trx {
|
|
||||||
error!("DATABASE ERROR page_links_exiting_count {e}");
|
|
||||||
} else {
|
} else {
|
||||||
let trx = trx.unwrap();
|
|
||||||
// link -> from -> *
|
|
||||||
let mut range = RangeOption::from((last_key.clone(), final_key.clone()));
|
|
||||||
range.mode = StreamingMode::Iterator;
|
|
||||||
range.limit = Some(10096);
|
|
||||||
|
|
||||||
let mut stream = trx.get_ranges_keyvalues(range, true);
|
|
||||||
|
|
||||||
let mut this_time = 0;
|
|
||||||
|
|
||||||
while let Some(kv) = stream.next().await {
|
|
||||||
if let Ok(kv) = kv {
|
|
||||||
counter += 1;
|
|
||||||
this_time += 1;
|
|
||||||
last_key = kv.key().to_vec();
|
|
||||||
} else if let Err(e) = kv {
|
|
||||||
eprintln!("err while counting {e}");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if this_time <= 10 {
|
|
||||||
return SearchSrvcResponse::SiteCountResponse(SiteCountResponse {
|
|
||||||
count: counter / 8,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
SearchSrvcResponse::OtherError("couldn't retrieve count ):".to_string())
|
SearchSrvcResponse::OtherError("couldn't retrieve count ):".to_string())
|
||||||
}
|
}
|
||||||
|
}
|
2
searchservice_old/.gitignore
vendored
Normal file
2
searchservice_old/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
/target
|
||||||
|
.idea
|
23
searchservice_old/Cargo.toml
Normal file
23
searchservice_old/Cargo.toml
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
[package]
|
||||||
|
name = "searchservice"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
license = "AGPL-3"
|
||||||
|
license-file = "LICENSE"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
asklyphe-common = { path = "../asklyphe-common", features = ["foundationdb"] }
|
||||||
|
tokio = { version = "1.0", features = ["full"] }
|
||||||
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
|
rmp-serde = "1.1.2"
|
||||||
|
async-nats = "0.38.0"
|
||||||
|
chrono = "0.4.26"
|
||||||
|
once_cell = "1.18.0"
|
||||||
|
ulid = "1.0.0"
|
||||||
|
rand = "0.8.5"
|
||||||
|
async-recursion = "1.0.5"
|
||||||
|
futures = "0.3.30"
|
||||||
|
log = "0.4.20"
|
||||||
|
env_logger = "0.10.2"
|
1213
searchservice_old/src/algorithm.rs
Normal file
1213
searchservice_old/src/algorithm.rs
Normal file
File diff suppressed because it is too large
Load diff
62
searchservice_old/src/hacks.rs
Normal file
62
searchservice_old/src/hacks.rs
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
/*
|
||||||
|
* searchservice hacks.rs
|
||||||
|
* - awful awful solutions to our issues
|
||||||
|
*
|
||||||
|
* Copyright (C) 2025 Real Microsoft, LLC
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
use std::ops::{ RangeInclusive};
|
||||||
|
|
||||||
|
// Unicode code-point ranges of scripts that are written without spaces
// between words. Each constant is an inclusive range of `char`s.

const BURMESE_RANGE: RangeInclusive<char> = '\u{1000}'..='\u{104f}';
const CHINESE_RANGE1: RangeInclusive<char> = '\u{4e00}'..='\u{9fff}';
const CHINESE_RANGE2: RangeInclusive<char> = '\u{3400}'..='\u{4dbf}';
const CHINESE_RANGE3: RangeInclusive<char> = '\u{20000}'..='\u{2a6df}';
const CHINESE_RANGE4: RangeInclusive<char> = '\u{2A700}'..='\u{2B73F}';
const CHINESE_RANGE5: RangeInclusive<char> = '\u{2B740}'..='\u{2B81F}';
const CHINESE_RANGE6: RangeInclusive<char> = '\u{2B820}'..='\u{2CEAF}';
const CHINESE_RANGE7: RangeInclusive<char> = '\u{2CEB0}'..='\u{2EBEF}';
const CHINESE_RANGE8: RangeInclusive<char> = '\u{30000}'..='\u{3134F}';
const CHINESE_RANGE9: RangeInclusive<char> = '\u{31350}'..='\u{323AF}';
const CHINESE_RANGE10: RangeInclusive<char> = '\u{2EBF0}'..='\u{2EE5F}';
const CHINESE_RANGE11: RangeInclusive<char> = '\u{F900}'..='\u{FAFF}';
/// HIRAGANA
const JAPANESE_RANGE1: RangeInclusive<char> = '\u{3040}'..='\u{309F}';
/// KATAKANA
const JAPANESE_RANGE2: RangeInclusive<char> = '\u{30A0}'..='\u{30FF}';
const JAVANESE_RANGE: RangeInclusive<char> = '\u{A980}'..='\u{A9DF}';
const KHMER_RANGE1: RangeInclusive<char> = '\u{1780}'..='\u{17FF}';
const KHMER_RANGE2: RangeInclusive<char> = '\u{19E0}'..='\u{19FF}';
const LAO_RANGE: RangeInclusive<char> = '\u{0E80}'..='\u{0EFF}';
const PHAGSPA_RANGE: RangeInclusive<char> = '\u{A840}'..='\u{A87F}';
const TAITHAM_RANGE: RangeInclusive<char> = '\u{1A20}'..='\u{1AAF}';
// The Thai block ends at U+0E7F. The previous upper bound U+E07F (digits
// transposed) stretched this range across Hangul, general punctuation,
// currency symbols and many other unrelated blocks.
const THAI_RANGE: RangeInclusive<char> = '\u{0E00}'..='\u{0E7F}';
const TIBETAN_RANGE: RangeInclusive<char> = '\u{0F00}'..='\u{0FFF}';

/// Every range above, collected so a character can be tested against all of
/// them in one pass.
const NO_WORD_BOUNDRIES: &[RangeInclusive<char>] = &[
    BURMESE_RANGE,
    CHINESE_RANGE1, CHINESE_RANGE2, CHINESE_RANGE3, CHINESE_RANGE4, CHINESE_RANGE5, CHINESE_RANGE6, CHINESE_RANGE7, CHINESE_RANGE8, CHINESE_RANGE9, CHINESE_RANGE10, CHINESE_RANGE11,
    JAPANESE_RANGE1, JAPANESE_RANGE2,
    JAVANESE_RANGE,
    KHMER_RANGE1, KHMER_RANGE2,
    LAO_RANGE,
    PHAGSPA_RANGE,
    TAITHAM_RANGE,
    THAI_RANGE,
    TIBETAN_RANGE,
];
|
||||||
|
|
||||||
|
pub fn is_from_language_that_doesnt_use_word_separators(str: &str) -> bool {
|
||||||
|
for c in str.chars() {
|
||||||
|
for range in NO_WORD_BOUNDRIES {
|
||||||
|
if range.contains(&c) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
91
searchservice_old/src/main.rs
Normal file
91
searchservice_old/src/main.rs
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
/*
|
||||||
|
* searchservice main.rs
|
||||||
|
* - entrypoint for the asklyphe search service
|
||||||
|
*
|
||||||
|
* Copyright (C) 2025 Real Microsoft, LLC
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
pub mod algorithm;
|
||||||
|
pub mod hacks;
|
||||||
|
mod process;
|
||||||
|
|
||||||
|
use std::cmp::max;
|
||||||
|
use std::future::Future;
|
||||||
|
use std::io::Read;
|
||||||
|
use std::string::ToString;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
|
||||||
|
use std::time::Duration;
|
||||||
|
use async_nats::jetstream;
|
||||||
|
use async_nats::jetstream::consumer::PullConsumer;
|
||||||
|
use async_nats::jetstream::stream::RetentionPolicy;
|
||||||
|
use futures::StreamExt;
|
||||||
|
use log::{error, info, warn};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
use tokio::task::JoinHandle;
|
||||||
|
use asklyphe_common::{db, foundationdb};
|
||||||
|
use asklyphe_common::foundationdb::Database;
|
||||||
|
use asklyphe_common::nats::comms;
|
||||||
|
use asklyphe_common::nats::comms::{Service, ServiceResponse};
|
||||||
|
use asklyphe_common::nats::searchservice::SearchSrvcResponse;
|
||||||
|
|
||||||
|
/// NATS server address, read from the `NATS_URL` environment variable on first use.
/// Panics at that point if the variable cannot be read.
pub static NATS_URL: Lazy<String> = Lazy::new(|| std::env::var("NATS_URL").expect("NO NATS DEFINED"));
|
||||||
|
/// Client certificate handed to the NATS connection, read from `NATS_CERT` on first use.
/// Panics at that point if the variable cannot be read.
pub static NATS_CERT: Lazy<String> = Lazy::new(|| std::env::var("NATS_CERT").expect("NO NATS_CERT DEFINED"));
|
||||||
|
/// Client key handed to the NATS connection, read from `NATS_KEY` on first use.
/// Panics at that point if the variable cannot be read.
pub static NATS_KEY: Lazy<String> = Lazy::new(|| std::env::var("NATS_KEY").expect("NO NATS_KEY DEFINED"));
|
||||||
|
|
||||||
|
/// Running count of requests this process has answered; starts at zero.
pub static PROCESSES_HANDLED: AtomicU64 = AtomicU64::new(0);
|
||||||
|
/// Unix timestamp of the most recently received request; zero until the first one arrives.
pub static LAST_MESSAGE: AtomicI64 = AtomicI64::new(0);
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let _guard = unsafe { foundationdb::boot() };
|
||||||
|
env_logger::init();
|
||||||
|
info!("searchservice began at {}", chrono::Utc::now().to_string());
|
||||||
|
let nats = async_nats::ConnectOptions::new()
|
||||||
|
.add_client_certificate(NATS_CERT.as_str().into(), NATS_KEY.as_str().into())
|
||||||
|
.connect(NATS_URL.as_str())
|
||||||
|
.await;
|
||||||
|
if let Err(e) = nats {
|
||||||
|
error!("FATAL ERROR, COULDN'T CONNECT TO NATS: {}", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let nats = nats.unwrap();
|
||||||
|
let nats = jetstream::new(nats);
|
||||||
|
|
||||||
|
info!("searchbot spawned");
|
||||||
|
|
||||||
|
async fn on_recv(query: comms::Query) -> ServiceResponse {
|
||||||
|
let db = Database::default().expect("couldn't connect to foundation db!");
|
||||||
|
let now = chrono::Utc::now().timestamp();
|
||||||
|
LAST_MESSAGE.store(now, Ordering::Relaxed);
|
||||||
|
|
||||||
|
let response = match query {
|
||||||
|
comms::Query::SearchService(query) => {
|
||||||
|
process::process(query, db).await
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
SearchSrvcResponse::InvalidRequest
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if PROCESSES_HANDLED.load(Ordering::Relaxed) % 100 == 0 {
|
||||||
|
info!("handled {} requests!", PROCESSES_HANDLED.load(Ordering::Relaxed));
|
||||||
|
}
|
||||||
|
PROCESSES_HANDLED.fetch_add(1, Ordering::Relaxed);
|
||||||
|
ServiceResponse::SearchService(response)
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Err(e) = comms::subscribe_service(Service::SearchService, &nats, Arc::new(on_recv)).await {
|
||||||
|
error!("failed to subscribe to searchservice nats! reason {:?}", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
// we will respawn tasks if they crash
|
||||||
|
drop(_guard);
|
||||||
|
}
|
118
searchservice_old/src/process.rs
Normal file
118
searchservice_old/src/process.rs
Normal file
|
@ -0,0 +1,118 @@
|
||||||
|
/*
|
||||||
|
* searchservice process.rs
|
||||||
|
* - route incoming nats requests to their specific functions
|
||||||
|
*
|
||||||
|
* Copyright (C) 2025 Real Microsoft, LLC
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
use async_nats::{jetstream, Subject};
|
||||||
|
use log::{debug, error, warn};
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
use futures::StreamExt;
|
||||||
|
use asklyphe_common::db::tables::{INFO_TABLE, WEBSITE_TABLE};
|
||||||
|
use asklyphe_common::foundationdb::{Database, KeySelector, RangeOption};
|
||||||
|
use asklyphe_common::foundationdb::options::{StreamingMode, TransactionOption};
|
||||||
|
use asklyphe_common::foundationdb::tuple::{pack, Subspace};
|
||||||
|
use asklyphe_common::nats::searchservice;
|
||||||
|
use asklyphe_common::nats::searchservice::{SearchSrvcQuery, SearchSrvcRequest, SearchSrvcResponse, SearchRequest, SiteCountResponse, SearchResponse};
|
||||||
|
use crate::{algorithm, hacks};
|
||||||
|
|
||||||
|
pub async fn process(query: SearchSrvcQuery, db: Database) -> SearchSrvcResponse {
|
||||||
|
// a future is used so that the whole program doesn't die if an algorithm panics
|
||||||
|
let response = tokio::spawn(async move {
|
||||||
|
match query.request {
|
||||||
|
SearchSrvcRequest::SearchRequest(req) => {
|
||||||
|
search_request(req, &db).await
|
||||||
|
}
|
||||||
|
SearchSrvcRequest::SiteCountRequest => {
|
||||||
|
count_websites(&db).await
|
||||||
|
}
|
||||||
|
}}).await;
|
||||||
|
|
||||||
|
if let Ok(response) = response {
|
||||||
|
response
|
||||||
|
} else if let Err(e) = response {
|
||||||
|
SearchSrvcResponse::OtherError(e.to_string())
|
||||||
|
} else {
|
||||||
|
unreachable!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn search_request(req: SearchRequest, db: &Database) -> SearchSrvcResponse {
|
||||||
|
let words_initial: Vec<String> = req.query.split_whitespace().map(|s| s.to_string()).collect();
|
||||||
|
let mut words = vec![];
|
||||||
|
let mut no_separator_flag = false;
|
||||||
|
for word in words_initial {
|
||||||
|
if hacks::is_from_language_that_doesnt_use_word_separators(&word) {
|
||||||
|
words.extend(word.chars().map(|c| c.to_string()).collect::<Vec<String>>());
|
||||||
|
no_separator_flag = true;
|
||||||
|
} else {
|
||||||
|
words.push(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match algorithm::search(&db, words, no_separator_flag).await {
|
||||||
|
Some(results) => {
|
||||||
|
SearchSrvcResponse::SearchResponse(results)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
SearchSrvcResponse::SearchResponse(SearchResponse {
|
||||||
|
results: vec![],
|
||||||
|
blocked: vec![],
|
||||||
|
pagerank_time_seconds: 0.0,
|
||||||
|
total_query_seconds: 0.0,
|
||||||
|
max_relevance: 0.0,
|
||||||
|
exact_phrase_found: false,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn count_websites(db: &Database) -> SearchSrvcResponse {
|
||||||
|
let mut counter: u64 = 0;
|
||||||
|
let subspace = Subspace::from(WEBSITE_TABLE);
|
||||||
|
let mut last_key = subspace.range().0;
|
||||||
|
let final_key = subspace.range().1;
|
||||||
|
|
||||||
|
for _failsafe in 0..10000 {
|
||||||
|
let trx = db.create_trx();
|
||||||
|
if let Err(e) = trx {
|
||||||
|
error!("DATABASE ERROR page_links_exiting_count {e}");
|
||||||
|
} else {
|
||||||
|
let trx = trx.unwrap();
|
||||||
|
// link -> from -> *
|
||||||
|
let mut range = RangeOption::from((last_key.clone(), final_key.clone()));
|
||||||
|
range.mode = StreamingMode::Iterator;
|
||||||
|
range.limit = Some(10096);
|
||||||
|
|
||||||
|
let mut stream = trx.get_ranges_keyvalues(range, true);
|
||||||
|
|
||||||
|
let mut this_time = 0;
|
||||||
|
|
||||||
|
while let Some(kv) = stream.next().await {
|
||||||
|
if let Ok(kv) = kv {
|
||||||
|
counter += 1;
|
||||||
|
this_time += 1;
|
||||||
|
last_key = kv.key().to_vec();
|
||||||
|
} else if let Err(e) = kv {
|
||||||
|
eprintln!("err while counting {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if this_time <= 10 {
|
||||||
|
return SearchSrvcResponse::SiteCountResponse(SiteCountResponse {
|
||||||
|
count: counter / 8,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
SearchSrvcResponse::OtherError("couldn't retrieve count ):".to_string())
|
||||||
|
}
|
Loading…
Add table
Reference in a new issue