/*
* searchservice algorithm.rs
* - how search works
*
* Copyright (C) 2025 Real Microsoft, LLC
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::ops::Mul;
use async_recursion::async_recursion;
use once_cell::sync::Lazy;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
use log::{debug, info};
use rand::Rng;
use asklyphe_common::db;
use asklyphe_common::db::{STRHASH};
use asklyphe_common::foundationdb::Database;
use asklyphe_common::nats::searchservice;
/// One search hit together with the numeric signals that
/// `SearchResult::relavence` combines into a ranking score.
///
/// NOTE(review): several field types below appear without generic parameters
/// (bare `Option`, `BTreeSet`, `BTreeMap`); the parameters were presumably lost
/// before this text reached review — confirm against the canonical source.
pub struct SearchResult {
/// Presumably the address of the matched page; not read by `relavence()`.
pub url: String,
/// Optional title; not read by `relavence()`.
pub title: Option,
/// Optional description; not read by `relavence()`.
pub description: Option,
/// Not read by `relavence()`; meaning cannot be determined from this file section — TODO confirm.
pub url_contains_result: Option,
/// Weighted by 1.3 in `relavence()`.
pub word_occurs: f64,
/// Weighted by 1.2 in `relavence()`.
pub authorword_occurs: f64,
/// Weighted by 1.2 in `relavence()`.
pub descriptionword_occurs: f64,
/// Weighted by 0.5 in `relavence()`.
pub keyword_occurs: f64,
/// Weighted by 1.3 in `relavence()`.
pub sitename_occurs: f64,
/// Weighted by 2.9 in `relavence()`.
pub linkword_occurs: f64,
/// Raised to the 4th power and floored at 0.001 before being added to the weighted sum.
pub pagerank: f64,
/// Not read by the `relavence()` method (note the shared, misspelled name);
/// presumably a stored score — TODO confirm where it is written.
pub relavence: f64,
/// Its element count divides `total_query_words` in `relavence()`; an empty set
/// makes that term fall back to 1.0.
pub words_contained: BTreeSet,
/// The multi-word adjustments in `relavence()` apply only when this is > 1.0.
pub total_query_words: f64,
/// Not read by `relavence()`; contents not visible here — TODO confirm.
pub words: BTreeMap,
/// Not read by `relavence()`.
pub closest_match: f64,
/// Floored at 0.02, then raised to the 5th power (8th when `no_separator_flag`
/// is set) as a multiplier in `relavence()`.
pub phrase_match_count: f64,
/// Numerator of the `match_acc / match_c` average used by `relavence()`.
pub match_acc: f64,
/// Denominator of that average; the average is only used when this is > 0.0.
pub match_c: f64,
/// Divides the `match_acc / match_c` average; floored at 0.01 to avoid dividing by zero.
pub highest_match: f64,
}
impl SearchResult {
pub fn relavence(&self, no_separator_flag: bool) -> f64 {
if !no_separator_flag {
((self.word_occurs * 1.3) + (self.descriptionword_occurs * 1.2) + (self.keyword_occurs * 0.5) + (self.authorword_occurs * 1.2) + (self.sitename_occurs * 1.3) + (self.linkword_occurs * 2.9) + (self.pagerank.powi(4)).max(0.001))
.max(0.01)
*
if self.total_query_words > 1.0 {
((1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
+ if !self.words_contained.is_empty() { (self.total_query_words / self.words_contained.len() as f64).max(0.002) } else { 1.0 }).max(0.001)
* self.phrase_match_count.max(0.02).powi(5)
} else {
1.0
}
} else {
if self.total_query_words > 1.0 {
(1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
* self.phrase_match_count.max(0.02).powi(8)
} else {
1.0
}
}
}
}
// Process-wide caches. Each static is wrapped in `Lazy`, so its initialiser
// closure runs on first access rather than at program start.
//
// NOTE(review): the type annotations below are missing their generic parameter
// lists (`Lazy>>`), presumably lost before this text reached review, so the
// key/value types cannot be read from here — confirm against the canonical source.

/// Mutex-guarded `BTreeMap`, initially empty.
pub static PRECALCED_PAGERANKS: Lazy>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
/// Mutex-guarded `BTreeMap`, initially empty.
pub static CACHED_PAGERANKS: Lazy>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
/// Mutex-guarded `BTreeSet`, initially empty.
pub static VISITED_PAGERANKS: Lazy>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
/// Mutex-guarded `BTreeSet`, initially empty.
pub static VISITED_PAGERANKS2: Lazy>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
/// `HashCache` built over an empty `Arc<Mutex<BTreeMap>>`.
pub static HASH_CACHE: Lazy> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
/// `HashCache` built over an empty `Arc<Mutex<BTreeMap>>`; presumably the reverse
/// direction of `HASH_CACHE` — TODO confirm.
pub static UNHASH_CACHE: Lazy> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
/// `HashCache` built over an empty `Arc<Mutex<BTreeMap>>`; presumably caches page titles — TODO confirm.
pub static TITLE_CACHE: Lazy>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
/// `HashCache` built over an empty `Arc<Mutex<BTreeMap>>`; presumably caches page descriptions — TODO confirm.
pub static DESC_CACHE: Lazy>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
/// Cache handle whose only field, `inner`, is a private `Arc`-wrapped value.
///
/// NOTE(review): the generic parameters of this struct and of `inner` are
/// missing from this text. Judging by the statics that construct it with
/// `Arc::new(Mutex::new(BTreeMap::new()))`, `inner` is presumably an
/// `Arc<Mutex<BTreeMap<K, V>>>` — confirm against the canonical source.
pub struct HashCache {
// Private handle to the underlying storage; the `Arc` wrapper suggests it is
// meant to be cloned cheaply (assumption, TODO confirm).
inner: Arc>>,
}
/// Two-state value: either an already available `V`, or the data needed to
/// perform an insertion. Presumably the outcome of a `HashCache` lookup —
/// TODO confirm against the code that constructs it.
///
/// NOTE(review): the enum's generic parameter list and the full type of the
/// `NeedsInsert` payload are missing from this text; confirm against the
/// canonical source.
pub enum EntryBuilder {
/// Carries a value of type `V`.
Found(V),
/// Carries a tuple of a key `K` and an `Arc`-wrapped handle (presumably the
/// cache's backing map, so the value can be inserted later — TODO confirm).
NeedsInsert((K, Arc>>)),
}
impl EntryBuilder {
pub async fn or_insert(self, value: impl futures::Future