/*
* searchservice algorithm.rs
* - how search works
*
* Copyright (C) 2025 Real Microsoft, LLC
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::ops::Mul;
use async_recursion::async_recursion;
use once_cell::sync::Lazy;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
use log::{debug, info};
use rand::Rng;
use asklyphe_common::db;
use asklyphe_common::db::{STRHASH};
use asklyphe_common::foundationdb::Database;
use asklyphe_common::nats::searchservice;
pub struct SearchResult {
pub url: String,
pub title: Option<String>,
pub description: Option<String>,
pub url_contains_result: Option<usize>,
pub word_occurs: f64,
pub authorword_occurs: f64,
pub descriptionword_occurs: f64,
pub keyword_occurs: f64,
pub sitename_occurs: f64,
pub linkword_occurs: f64,
pub pagerank: f64,
pub relavence: f64,
pub words_contained: BTreeSet<usize>,
pub total_query_words: f64,
pub words: BTreeMap<usize, usize>,
pub closest_match: f64,
pub phrase_match_count: f64,
pub match_acc: f64,
pub match_c: f64,
pub highest_match: f64,
}
impl SearchResult {
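/// Collapses the individual signals into one relevance score.
/// A rough reading of the expression below (a summary, the weights are not
/// documented anywhere else in this file):
///   base  = 1.3*word + 1.2*description + 0.5*keyword + 1.2*author
///           + 1.3*sitename + 2.9*linkword + pagerank^4
/// For multi-word queries, base is multiplied by an ordering/coverage factor
/// driven by match_acc/match_c and words_contained, then boosted by
/// phrase_match_count^5; with the no-separator flag only the ordering and
/// phrase factors are used (with phrase_match_count^8).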
pub fn relavence(&self, no_separator_flag: bool) -> f64 {
if !no_separator_flag {
((self.word_occurs * 1.3) + (self.descriptionword_occurs * 1.2) + (self.keyword_occurs * 0.5) + (self.authorword_occurs * 1.2) + (self.sitename_occurs * 1.3) + (self.linkword_occurs * 2.9) + (self.pagerank.powi(4)).max(0.001))
.max(0.01)
*
if self.total_query_words > 1.0 {
((1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
+ if !self.words_contained.is_empty() { (self.total_query_words / self.words_contained.len() as f64).max(0.002) } else { 1.0 }).max(0.001)
* self.phrase_match_count.max(0.02).powi(5)
} else {
1.0
}
} else {
if self.total_query_words > 1.0 {
(1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
* self.phrase_match_count.max(0.02).powi(8)
} else {
1.0
}
}
}
}
pub static PRECALCED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
pub static CACHED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
pub static VISITED_PAGERANKS: Lazy<Mutex<BTreeSet<STRHASH>>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
pub static VISITED_PAGERANKS2: Lazy<Mutex<BTreeSet<STRHASH>>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
pub static HASH_CACHE: Lazy<HashCache<String, STRHASH>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
pub static UNHASH_CACHE: Lazy<HashCache<STRHASH, String>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
pub static TITLE_CACHE: Lazy<HashCache<STRHASH, Option<String>>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
pub static DESC_CACHE: Lazy<HashCache<STRHASH, Option<String>>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
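/// Small async-aware memoization helper shared by the caches above: `entry()`
/// either returns the cached value or an [`EntryBuilder`] that runs the given
/// future and stores its output when `or_insert()` is awaited;
/// `unconditional()` always recomputes and overwrites. A minimal usage sketch
/// (hypothetical key and future, not taken from this codebase):
///
/// ```ignore
/// static EXAMPLE_CACHE: Lazy<HashCache<String, u64>> =
///     Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
/// // the first call computes and stores 42, later calls return it from the map
/// let v = EXAMPLE_CACHE.entry("answer".to_string()).or_insert(async { 42u64 }).await;
/// ```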
pub struct HashCache<K: Ord, V: Clone> {
inner: Arc<Mutex<BTreeMap<K, V>>>,
}
pub enum EntryBuilder<K: Ord, V: Clone> {
Found(V),
NeedsInsert((K, Arc<Mutex<BTreeMap<K, V>>>)),
}
impl<K: Ord, V: Clone> EntryBuilder<K, V> {
pub async fn or_insert(self, value: impl futures::Future<Output=V>) -> V {
match self {
EntryBuilder::Found(v) => { v }
EntryBuilder::NeedsInsert((key, inner)) => {
let value = (value).await;
inner.lock().unwrap().insert(key, value.clone());
value
}
}
}
}
impl<K: Ord, V: Clone> HashCache<K, V> {
pub fn new(inner: Arc<Mutex<BTreeMap<K, V>>>) -> Self {
Self {
inner,
}
}
pub fn entry(&self, key: K) -> EntryBuilder<K, V> {
if let Some(value) = self.inner.lock().unwrap().get(&key).cloned() {
EntryBuilder::Found(value)
} else {
EntryBuilder::NeedsInsert((key, self.inner.clone()))
}
}
pub fn unconditional(&self, key: K) -> EntryBuilder<K, V> {
EntryBuilder::NeedsInsert((key, self.inner.clone()))
}
}
pub async fn cached_hash(db: &Database, key: String) -> STRHASH {
HASH_CACHE.entry(key.clone()).or_insert(db::foa_strhash(db, &key)).await
}
pub async fn cached_unhash(db: &Database, key: STRHASH) -> String {
UNHASH_CACHE.entry(key).or_insert(db::unhash(db, key)).await
}
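// A memoized `None` is deliberately not trusted: whenever the cached value is
// `None`, the title/description is fetched again (and the cache overwritten),
// so pages that gain metadata after being indexed are eventually picked up.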
pub async fn cached_title(db: &Database, key: STRHASH) -> Option<String> {
if let Some(title) = TITLE_CACHE.entry(key).or_insert(db::info_title(db, key)).await {
Some(title)
} else {
TITLE_CACHE.unconditional(key).or_insert(db::info_title(db, key)).await
}
}
pub async fn cached_desc(db: &Database, key: STRHASH) -> Option<String> {
if let Some(desc) = DESC_CACHE.entry(key).or_insert(db::info_description(db, key)).await {
Some(desc)
} else {
DESC_CACHE.unconditional(key).or_insert(db::info_description(db, key)).await
}
}
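// Recursive PageRank over the pages linking into `url`, memoized in
// PRECALCED_PAGERANKS/CACHED_PAGERANKS and written back to the database.
// The update follows the familiar pr = (1 - d) + d * Σ pr(in) / outlinks(in),
// but note that `d` is first rewritten as (1.0 - damping), so with the default
// damping of 0.85 this evaluates to roughly 0.85 + 0.15 * Σ pr(in)/outlinks(in).
// VISITED_PAGERANKS2 breaks cycles: an incoming link that has already been
// visited but has no finished value yet is skipped rather than recursed into.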
#[async_recursion]
pub async fn pagerank(db: &Database, url: STRHASH) -> f64 {
if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
return *precalc;
}
if let Some(precalc_db) = db::page_pagerank(db, url).await {
//debug!("url {} in db {}", url, precalc_db);
if precalc_db == 0.0 {
//debug!("but 0 ):");
} else {
CACHED_PAGERANKS.lock().unwrap().insert(url, precalc_db);
return precalc_db;
}
}
let mut accum = 0.0;
let incoming = {
db::page_links_entering(db, url).await
};
let d = {
db::page_damping(db, url).await.unwrap_or(0.85)
};
let d = (1.0 - d).max(0.0);
for url in incoming {
if PRECALCED_PAGERANKS.lock().unwrap().get(&url).is_none() && VISITED_PAGERANKS2.lock().unwrap().contains(&url) {
continue;
}
let c = {
db::page_links_exiting_count(db, url).await
};
if c == 0 { continue; }
if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
if *precalc != 0.0 {
accum += *precalc / c as f64;
continue;
}
}
VISITED_PAGERANKS2.lock().unwrap().insert(url);
let pr = pagerank(db, url).await;
accum += pr / c as f64;
}
let pr = (1.0 - d) + (d * accum);
db::document_set_pagerank(db, url, pr, chrono::Utc::now().timestamp()).await;
PRECALCED_PAGERANKS.lock().unwrap().insert(url, pr);
CACHED_PAGERANKS.lock().unwrap().insert(url, pr);
pr
}
pub const MAX_PAGERANK_APPROX_DEPTH: u64 = 1;
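// Depth-limited, query-time approximation of the above: beyond
// MAX_PAGERANK_APPROX_DEPTH a flat 0.8 is assumed, values already stored in
// the database are reused, and for pages whose pagerank is missing (or stored
// as 0.0) a background task is spawned to compute the real value.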
#[async_recursion]
pub async fn pagerank_approx(db: &Database, url: STRHASH, depth: u64) -> f64 {
if depth > MAX_PAGERANK_APPROX_DEPTH {
return 0.8;
}
if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
return *precalc;
} else if let Some(precalc) = CACHED_PAGERANKS.lock().unwrap().get(&url) {
return *precalc;
}
let mut not_in_db = false;
let mut in_db_but_zero = false;
if let Some(precalc_db) = db::page_pagerank(db, url).await {
//debug!("url {} in db {}", url, precalc_db);
if precalc_db == 0.0 {
//debug!("but 0 ):");
// uncomment when we want to eventually try to recalc 0.0 prs
//not_in_db = true;
//in_db_but_zero = true;
if depth == 0 {
tokio::spawn(async move {
info!("task spawned to calc real pagerank...");
let db = Database::default().expect("couldn't connect to foundation db!");
let pr = pagerank(&db, url).await;
info!("finished calculating {} real pagerank: {}", url, pr);
});
}
}// else {
CACHED_PAGERANKS.lock().unwrap().insert(url, precalc_db);
return precalc_db;
//}
} else {
not_in_db = true;
}
// spawn task to eventually calculate real pagerank
if depth == 0 && not_in_db {
tokio::spawn(async move {
//info!("task spawned to calc real pagerank...");
let db = Database::default().expect("couldn't connect to foundation db!");
pagerank(&db, url).await;
//info!("finished calculating {} real pagerank: {}", url, pr);
});
}
if in_db_but_zero {
CACHED_PAGERANKS.lock().unwrap().insert(url, 0.0);
return 0.0;
}
let mut accum = 0.0;
let incoming = {
db::page_links_entering(db, url).await
};
let d = {
db::page_damping(db, url).await.unwrap_or(0.85)
};
let d = (1.0 - d).max(0.0);
for url in incoming {
if PRECALCED_PAGERANKS.lock().unwrap().get(&url).is_none() && CACHED_PAGERANKS.lock().unwrap().get(&url).is_none() && VISITED_PAGERANKS.lock().unwrap().contains(&url) {
continue;
}
let c = {
db::page_links_exiting_count(db, url).await
};
if c == 0 { continue; }
if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
accum += *precalc / c as f64;
continue;
}
if let Some(precalc) = CACHED_PAGERANKS.lock().unwrap().get(&url) {
accum += *precalc / c as f64;
continue;
}
VISITED_PAGERANKS.lock().unwrap().insert(url);
let pr = pagerank_approx(db, url, depth + 1).await;
CACHED_PAGERANKS.lock().unwrap().insert(url, pr);
accum += pr / c as f64;
}
let pr = (1.0 - d) + (d * accum);
CACHED_PAGERANKS.lock().unwrap().insert(url, pr);
pr
}
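// Rewards adjacent query words that appear close together and in order: if the
// second word occurs before the first, the (negative) gap itself is returned
// as a penalty; otherwise the reward shrinks linearly as the gap widens.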
fn multiword_penalty(dist_a: f64, dist_b: f64) -> f64 {
if (dist_b - dist_a).is_sign_negative() {
// second comes before first, not good!
(dist_b - dist_a)
} else {
1.0 - (dist_b - dist_a)
}
}
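// Query entry point: hashes the query words, pulls the per-word occurrence
// lists (body text, link text, meta description and keywords), drops urls that
// don't cover every meaningful query word, scores the survivors using word
// proximity and approximate pageranks, and returns them sorted by relevance.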
pub async fn search(db: &Database, args: Vec<String>, no_separator_flag: bool) -> Option<searchservice::SearchResponse> {
let mut results: BTreeMap<STRHASH, SearchResult> = BTreeMap::new();
let start_t = chrono::Utc::now();
if args.is_empty() {
return None;
}
let args_count = args.len();
let arg_words = args.clone();
//let word = format!("_{}_", word);
let first_query = args.first().cloned();
let mut hash = vec![];
for (i, word) in args.into_iter().enumerate() {
hash.push((i, cached_hash(db, word.clone()).await));
//hash.push((i, cached_hash(db, format!("{}.", word)).await));
//hash.push((i, cached_hash(db, format!(".{}", word)).await));
//hash.push((i, cached_hash(db, format!(".{}.", word)).await));
//hash.push((i, cached_hash(db, format!("{}s", word)).await));
}
//let hash: Vec<(usize, STRHASH)> = hash.into_iter().filter_map(|v| v.1.map(|b| (v.0, b))).collect();
if hash.is_empty() {
println!("none in database");
return None;
}
let first_query = first_query.unwrap();
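// Common filler words are treated as "secondary": they still contribute to
// scoring, but a result is never discarded just because it is missing one of
// them.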
let secondary_words = [
db::hash("how"),
db::hash("is"),
db::hash("are"),
db::hash("the"),
db::hash("a"),
db::hash("when"),
db::hash("what"),
db::hash("why"),
db::hash("to"),
db::hash("where"),
db::hash("from"),
db::hash("best"),
db::hash("for"),
db::hash("like"),
];
let mut secondary_indices = BTreeSet::new();
let mut word_occurs = BTreeMap::new();
for hash in &hash {
if secondary_words.contains(&hash.1) {
secondary_indices.insert(hash.0);
}
word_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
db::word_occurs(db, hash.1).await);
}
//let mut authorword_occurs = BTreeMap::new();
let mut descriptionword_occurs = BTreeMap::new();
let mut keyword_occurs = BTreeMap::new();
//let mut sitenameword_occurs = BTreeMap::new();
for hash in &hash {
//authorword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
// db::metaword_occurs(db, db::hash("author"), hash.1).await);
descriptionword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
db::metaword_occurs(db, db::hash("description"), hash.1).await);
keyword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
db::metaword_occurs(db, db::hash("keywords"), hash.1).await);
//sitenameword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
// db::metaword_occurs(db, db::hash("site_name"), hash.1).await);
}
let mut linkword_occurs = BTreeMap::new();
for hash in &hash {
linkword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
db::linkword_occurs(db, hash.1).await);
}
let mut urls = vec![];
for (_, vals) in &word_occurs {
for (url, _) in vals {
urls.push(*url);
}
}
for (_, vals) in &linkword_occurs {
for (url, _) in vals {
urls.push(*url);
}
}
for (_, vals) in &descriptionword_occurs {
for (url, _) in vals {
urls.push(*url);
}
}
for (_, vals) in &keyword_occurs {
for (url, _) in vals {
urls.push(*url);
}
}
let mut useless_urls = vec![];
// we want to remove urls that aren't contained in every word index (i.e. urls that don't contain every word of the search query)
// however, we don't want to remove the url if it's not contained in a secondary index (i.e. if someone searches "best x",
// we don't want to remove results that don't contain the word "best")
// we also don't want to remove results if the page itself doesn't have that word, but the linkwords / metawords do
for url in &urls {
// for every word index...
for (index, vals) in &word_occurs {
// don't worry about secondary indices
if secondary_indices.contains(index) {
continue;
}
// collect urls
let urls: Vec<STRHASH> = vals.iter().map(|(url, _)| *url).collect();
// is this url not in the words bucket for this index?
if !urls.contains(url) {
// does another bucket contain it for this index?
let mut found_elsewhere = 0;
if linkword_occurs.get(index).unwrap().iter()
.map(|(url, _)| *url).collect::<Vec<STRHASH>>().contains(url) {
found_elsewhere += 1;
}
if descriptionword_occurs.get(index).unwrap().iter()
.map(|(url, _)| *url).collect::<Vec<STRHASH>>().contains(url) {
found_elsewhere += 1;
}
if keyword_occurs.get(index).unwrap().iter()
.map(|(url, _)| *url).collect::<Vec<STRHASH>>().contains(url) {
found_elsewhere += 1;
}
if found_elsewhere < 2 {
// found in fewer than two of the other buckets, so treat this url as not matching every meaningful word in the query
useless_urls.push(*url);
}
}
}
}
//for (_, vals) in &authorword_occurs {
// for (url, _) in vals {
// urls.push(*url);
// }
//}
//for (_, vals) in &sitenameword_occurs {
// for (url, _) in vals {
// urls.push(*url);
// }
//}
//for url in &urls {
// for (_, vals) in &linkword_occurs {
// let urls: Vec<STRHASH> = vals.iter().map(|(u, _)| *u).collect();
// if !urls.contains(url) {
// useless_urls.push(*url);
// }
// }
//}
urls.sort();
urls.dedup();
urls.retain(|u| !useless_urls.contains(u));
for vals in word_occurs.values_mut() {
vals.retain(|(u, _)| !useless_urls.contains(u));
}
for vals in linkword_occurs.values_mut() {
vals.retain(|(u, _)| !useless_urls.contains(u));
}
for vals in descriptionword_occurs.values_mut() {
vals.retain(|(u, _)| !useless_urls.contains(u));
}
for vals in keyword_occurs.values_mut() {
vals.retain(|(u, _)| !useless_urls.contains(u));
}
let start = chrono::Utc::now();
let word_count = Arc::new(Mutex::new(BTreeMap::new()));
let allowed = Arc::new(AtomicUsize::new(0));
// shared across the url tasks so the running-average pagerank check and the
// debug output below see every task's contribution
let pr_acc = Arc::new(Mutex::new(0.0f64));
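// Cut-offs for the candidate list below: past ALLOWED_BEFORE_PRUNE deranked
// sites start being dropped early, past ALLOWED_BEFORE_MEGAPRUNE a candidate
// must contain a query word or beat four times the running average pagerank,
// and past ALLOWED_BEFORE_GIGAPRUNE candidates are discarded without any
// database lookups at all.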
const ALLOWED_BEFORE_PRUNE: usize = 512;
const ALLOWED_BEFORE_MEGAPRUNE: usize = 777;
const ALLOWED_BEFORE_GIGAPRUNE: usize = 2048;
//const ICKY_WORDS: &[&str] = &[
// "distrowatch.com", // distrowatch is taking up too many results at the moment, remove this later
// "mariowiki.com", // mariowiki is taking up too many results at the moment, remove this later
// "wired.com", // we have too many wired articles
// "wired.cz", // we are very targeted at an english audience, they probably don't want czech wired articles at the moment
// "neocities.org/browse?", // people probably don't want to visit neocities tag lists
// "https://distrowatch.com/?language=", // people probably aren't looking for the distrowatch homepage in a random language
// "https://distrowatch.com/weekly.php/weekly.php?issue=", // a lot of results are unrelated distrowatch weekly posts
// "terms", // people probably aren't looking for tos pages
// "statement", // people probably aren't looking for tos pages
// "3cx", // nonenglish voip company, takes up unrelated search queries
// "1377x", // phishing site pretending to be 1337x, temporary fix until we can implement something like site blocking
// "//kickasstorrents.", // kickasstorrents has been down for years, only remaining sites are likely phishing scams
// "//kickasstorrent.", // kickasstorrents has been down for years, only remaining sites are likely phishing scams
// "//katcr.to", // fake kickasstorrents site
// "//kat.am", // fake kickasstorrents site
// "//kikass.to", // fake kickasstorrents site
// "//thepiratebays.com", // fake thepiratebay site
// ".fandom.com", // fuck fandom.com (todo: remove this since ultimately, it should be the user's choice to block fandom)
//];
// todo: since our list is so small this is okay for now, but we should cache this in the future
let deranks = Arc::new(db::get_deranked_websites(db).await);
let initial_pruned = Arc::new(Mutex::new(vec![]));
// url, reason
let blocked: Arc<Mutex<Vec<(String, String)>>> = Arc::new(Mutex::new(vec![]));
debug!("checking {} urls", urls.len());
let mut url_tasks = vec![];
for (i, url) in urls.into_iter().enumerate() {
let allowed = allowed.clone();
let blocked = blocked.clone();
let initial_pruned = initial_pruned.clone();
let word_count = word_count.clone();
let pr_acc = pr_acc.clone();
let arg_words = arg_words.clone();
let deranks = deranks.clone();
url_tasks.push(tokio::spawn(async move {
if i > ALLOWED_BEFORE_GIGAPRUNE {
initial_pruned.lock().unwrap().push(url);
return;
}
let db = Database::default().expect("FAILED TO CREATE NEW FDB HANDLE");
let surl = {
cached_unhash(&db, url).await
}.to_lowercase();
let mut contains_query_word = false;
for w in &arg_words {
if surl.contains(w) {
contains_query_word = true;
break;
}
}
let mut prepruned = false;
for (_, derank) in deranks.iter() {
if surl.contains(&derank.urlmatch) || surl.contains(&derank.urlmatch.replace("//", ".")) {
if let Some(and) = &derank.and {
if !surl.contains(and) {
continue;
}
}
if let Some(unless) = &derank.unless {
if surl.contains(unless) {
continue;
}
}
if !contains_query_word &&
(i > ALLOWED_BEFORE_MEGAPRUNE || (i > ALLOWED_BEFORE_PRUNE && derank.amount < 0.85)) {
initial_pruned.lock().unwrap().push(url);
prepruned = true;
}
if derank.amount == 0.0 {
initial_pruned.lock().unwrap().push(url);
blocked.lock().unwrap().push((surl.clone(), derank.comment.clone()));
prepruned = true;
}
}
}
if prepruned {
return;
}
let pr = pagerank_approx(&db, url, 0).await;
if i > ALLOWED_BEFORE_PRUNE {
let mut contains_query_word = false;
for w in &arg_words {
if surl.contains(w) {
contains_query_word = true;
break;
}
}
if contains_query_word || (pr > (((*pr_acc.lock().unwrap() / i as f64).max(0.01)) * if i > ALLOWED_BEFORE_MEGAPRUNE { 4.0 } else { 1.0 }) || (i < ALLOWED_BEFORE_MEGAPRUNE)) {
let wc = db::page_word_count(&db, url).await;
word_count.lock().unwrap().insert(url, wc);
*pr_acc.lock().unwrap() += pr;
allowed.fetch_add(1, Ordering::Relaxed);
} else {
initial_pruned.lock().unwrap().push(url);
}
} else {
let wc = db::page_word_count(&db, url).await;
word_count.lock().unwrap().insert(url, wc);
*pr_acc.lock().unwrap() += pr;
allowed.fetch_add(1, Ordering::Relaxed);
}
//let url = {
//};
//debug!("{} pr: {}", url, pr);
//debug!("{} wc: {}", url, wc);
// precache values
cached_unhash(&db, url).await;
cached_title(&db, url).await;
cached_desc(&db, url).await;
}));
}
for url_task in url_tasks {
url_task.await.expect("url task failure");
}
let initial_pruned = initial_pruned.lock().unwrap().clone();
let word_count = word_count.lock().unwrap().clone();
debug!("pruned {} results ({pr_acc}, {})", initial_pruned.len(), allowed.load(Ordering::Relaxed));
for vals in word_occurs.values_mut() {
vals.retain(|(u, _)| !initial_pruned.contains(u));
}
for vals in linkword_occurs.values_mut() {
vals.retain(|(u, _)| !initial_pruned.contains(u));
}
for vals in descriptionword_occurs.values_mut() {
vals.retain(|(u, _)| !initial_pruned.contains(u));
}
for vals in keyword_occurs.values_mut() {
vals.retain(|(u, _)| !initial_pruned.contains(u));
}
let pagerank_secs = chrono::Utc::now().signed_duration_since(start).num_milliseconds() as f64 / 1000.0;
info!("pageranks in {} secs", pagerank_secs);
//word_occurs.sort_by(|a, b| {
// let av = *PRECALCED_PAGERANKS.lock().unwrap().get(&b.1.0).unwrap();
// av.total_cmp(PRECALCED_PAGERANKS.lock().unwrap().get(&a.1.0).unwrap())
//});
//metaword_occurs.sort_by(|a, b| {
// let av = *PRECALCED_PAGERANKS.lock().unwrap().get(&b.1.0).unwrap();
// av.total_cmp(PRECALCED_PAGERANKS.lock().unwrap().get(&a.1.0).unwrap())
//});
//linkword_occurs.sort_by(|a, b| {
// let av = *PRECALCED_PAGERANKS.lock().unwrap().get(&b.1.0).unwrap();
// av.total_cmp(PRECALCED_PAGERANKS.lock().unwrap().get(&a.1.0).unwrap())
//});
pub const MULTIWORD_PENALTY: f64 = 0.0001;
pub const PHRASE_PERCENTAGE: f64 = 0.7;
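// Each occurrence list entry appears to be a big-endian u64 occurrence count
// followed by per-occurrence position bytes scaled to 0..=255 (two bytes per
// occurrence for body words, one for link/meta words, judging by the offsets
// used below; this layout is inferred from the parsing code, not documented
// here). Adjacent query words are compared pairwise so that pages where the
// words appear close together and in order score higher, and near-zero gaps
// are counted as phrase matches.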
for (position, vals) in word_occurs.iter() {
if let Some(vals2) = word_occurs.get(&(*position + 1)) {
for (url_a, occurs_a) in vals {
let url = {
cached_unhash(db, *url_a).await
};
let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
let mut acc = 0.0;
let mut distacc = 0.0;
let mut distc = 0;
let mut smallest = f64::MAX;
let mut largest = 0.0;
let mut phrase_matches = 0;
for i in 0..occurs {
let offset = (i * 2) + 8;
let dist = occurs_a[offset as usize] as f64 / 255.0;
for (url_b, occurs_b) in vals2 {
if url_a != url_b {
continue;
}
let occurs2 = u64::from_be_bytes(occurs_b.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
for j in 0..occurs2 {
let offset2 = (j * 2) + 8;
let dist2 = occurs_b[offset2 as usize] as f64 / 255.0;
distacc += (dist2 - dist);
if (dist2 - dist) >= 0.0 && (dist2 - dist) < smallest {
smallest = (dist2 - dist);
}
if (dist2 - dist) >= 0.0 && (dist2 - dist) > largest {
largest = (dist2 - dist);
}
if (dist2 - dist) >= 0.0 && (dist2 - dist) <= (2.1 / (*word_count.get(url_a).unwrap_or(&1) as f64).max(1.0)) {
phrase_matches += 1;
}
distc += 1;
acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0) * multiword_penalty(dist, dist2);
}
}
}
let res = results.entry(*url_a).or_insert(SearchResult {
url: url.clone(),
title: cached_title(db, *url_a).await,
description: cached_desc(db, *url_a).await,
url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
word_occurs: acc,
authorword_occurs: 0.0,
descriptionword_occurs: 0.0,
keyword_occurs: 0.0,
sitename_occurs: 0.0,
linkword_occurs: 0.0,
pagerank: 0.0,
relavence: 0.0,
words_contained: BTreeSet::new(),
total_query_words: args_count as f64,
words: Default::default(),
closest_match: f64::MAX,
phrase_match_count: 0.0,
match_acc: 0.0,
match_c: 0.0,
highest_match: 0.0,
});
res.word_occurs = acc;
res.words_contained.insert(*position);
res.match_acc += distacc;
res.match_c += distc as f64;
if smallest < res.closest_match {
res.closest_match = smallest;
}
if largest < res.highest_match {
res.highest_match = largest;
}
res.phrase_match_count += phrase_matches as f64;
*res.words.entry(*position).or_insert(0) += 1;
}
} else {
for (urlv, occursv) in vals {
let url = {
cached_unhash(db, *urlv).await
};
let occurs = u64::from_be_bytes(occursv.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
let mut acc = 0.0;
let mut distc = 0;
for i in 0..occurs {
let offset = (i * 2) + 8;
let dist = occursv[offset as usize] as f64 / 255.0;
//distc += 1;
acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0);
}
let res = results.entry(*urlv).or_insert(SearchResult {
url: url.clone(),
title: cached_title(db, *urlv).await,
description: cached_desc(db, *urlv).await,
url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
word_occurs: acc,
authorword_occurs: 0.0,
descriptionword_occurs: 0.0,
keyword_occurs: 0.0,
sitename_occurs: 0.0,
linkword_occurs: 0.0,
pagerank: 0.0,
relavence: 0.0,
words_contained: BTreeSet::new(),
total_query_words: args_count as f64,
words: Default::default(),
closest_match: 0.0,
phrase_match_count: 0.0,
match_acc: 0.0,
match_c: 0.0,
highest_match: 0.0,
});
res.word_occurs = acc;
res.words_contained.insert(*position);
*res.words.entry(*position).or_insert(0) += 1;
}
}
}
for (position, vals) in linkword_occurs.iter() {
if let Some(vals2) = linkword_occurs.get(&(*position + 1)) {
for (url_a, occurs_a) in vals {
let url = {
cached_unhash(db, *url_a).await
};
let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
let mut acc = 0.0;
let mut distacc = 0.0;
let mut distc = 0;
let mut smallest = f64::MAX;
let mut largest = 0.0;
let mut phrase_matches = 0;
for i in 0..occurs {
let offset = (i) + 8;
let dist = occurs_a[offset as usize] as f64 / 255.0;
for (url_b, occurs_b) in vals2 {
if url_a != url_b {
distc -= 4;
continue;
}
let occurs2 = u64::from_be_bytes(occurs_b.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
for j in 0..occurs2 {
let offset2 = (j) + 8;
let dist2 = occurs_b[offset2 as usize] as f64 / 255.0;
distacc += (dist2 - dist) * 12.0;
distc += 1;
if (dist2 - dist) < smallest {
smallest = (dist2 - dist);
}
if (dist2 - dist) >= 0.0 && (dist2 - dist) > largest {
largest = (dist2 - dist);
}
if (dist2 - dist) >= 0.0 && (dist2 - dist) <= (2.1 / (*word_count.get(url_a).unwrap_or(&1) as f64).max(20.0)) {
phrase_matches += 1;
}
acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0) * multiword_penalty(dist, dist2);
}
}
}
let res = results.entry(*url_a).or_insert(SearchResult {
url: url.clone(),
title: cached_title(db, *url_a).await,
description: cached_desc(db, *url_a).await,
url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
word_occurs: 0.0,
authorword_occurs: 0.0,
descriptionword_occurs: 0.0,
keyword_occurs: 0.0,
sitename_occurs: 0.0,
linkword_occurs: acc,
pagerank: 0.0,
relavence: 0.0,
words_contained: BTreeSet::new(),
total_query_words: args_count as f64,
words: Default::default(),
closest_match: f64::MAX,
phrase_match_count: 0.0,
match_acc: 0.0,
match_c: 0.0,
highest_match: 0.0,
});
res.linkword_occurs = acc;
res.match_acc += distacc;
res.match_c += distc as f64;
if smallest < res.closest_match {
res.closest_match = smallest;
}
res.phrase_match_count += phrase_matches as f64;
*res.words.entry(*position).or_insert(0) += 1;
}
} else {
for (url_a, occurs_a) in vals {
let url = {
cached_unhash(db, *url_a).await
};
let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
let mut acc = 0.0;
let mut distc = 0;
for i in 0..occurs {
let offset = (i) + 8;
let dist = occurs_a[offset as usize] as f64 / 255.0;
//distc += 1;
acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0);
}
let res = results.entry(*url_a).or_insert(SearchResult {
url: url.clone(),
title: cached_title(db, *url_a).await,
description: cached_desc(db, *url_a).await,
url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
word_occurs: 0.0,
authorword_occurs: 0.0,
descriptionword_occurs: 0.0,
keyword_occurs: 0.0,
sitename_occurs: 0.0,
linkword_occurs: acc,
pagerank: 0.0,
relavence: 0.0,
words_contained: BTreeSet::new(),
total_query_words: args_count as f64,
words: Default::default(),
closest_match: 0.0,
phrase_match_count: 0.0,
match_acc: 0.0,
match_c: 0.0,
highest_match: 0.0,
});
res.linkword_occurs = acc;
res.words_contained.insert(*position);
*res.words.entry(*position).or_insert(0) += 1;
}
}
}
for (position, vals) in descriptionword_occurs.iter() {
if let Some(vals2) = descriptionword_occurs.get(&(*position + 1)) {
for (url_a, occurs_a) in vals {
let url = {
cached_unhash(db, *url_a).await
};
let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
let mut acc = 0.0;
let mut distacc = 0.0;
let mut distc = 0;
let mut smallest = f64::MAX;
let mut largest = 0.0;
let mut phrase_matches = 0;
for i in 0..occurs {
let offset = (i) + 8;
let dist = occurs_a[offset as usize] as f64 / 255.0;
for (url_b, occurs_b) in vals2 {
if url_a != url_b {
distc -= 4;
continue;
}
let occurs2 = u64::from_be_bytes(occurs_b.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
for j in 0..occurs2 {
let offset2 = (j) + 8;
let dist2 = occurs_b[offset2 as usize] as f64 / 255.0;
distacc += (dist2 - dist) * 12.0;
distc += 1;
if (dist2 - dist) < smallest {
smallest = (dist2 - dist);
}
if (dist2 - dist) >= 0.0 && (dist2 - dist) > largest {
largest = (dist2 - dist);
}
if (dist2 - dist) >= 0.0 && (dist2 - dist) <= (2.1 / (*word_count.get(url_a).unwrap_or(&1) as f64).max(20.0)) {
phrase_matches += 1;
}
acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0) * multiword_penalty(dist, dist2);
}
}
}
let res = results.entry(*url_a).or_insert(SearchResult {
url: url.clone(),
title: cached_title(db, *url_a).await,
description: cached_desc(db, *url_a).await,
url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
word_occurs: 0.0,
authorword_occurs: 0.0,
descriptionword_occurs: acc,
keyword_occurs: 0.0,
sitename_occurs: 0.0,
linkword_occurs: 0.0,
pagerank: 0.0,
relavence: 0.0,
words_contained: BTreeSet::new(),
total_query_words: args_count as f64,
words: Default::default(),
closest_match: f64::MAX,
phrase_match_count: 0.0,
match_acc: 0.0,
match_c: 0.0,
highest_match: 0.0,
});
res.descriptionword_occurs = acc;
res.match_acc += distacc;
res.match_c += distc as f64;
if smallest < res.closest_match {
res.closest_match = smallest;
}
res.phrase_match_count += phrase_matches as f64;
*res.words.entry(*position).or_insert(0) += 1;
}
} else {
for (url_a, occurs_a) in vals {
let url = {
cached_unhash(db, *url_a).await
};
let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
let mut acc = 0.0;
let mut distc = 0;
for i in 0..occurs {
let offset = (i) + 8;
let dist = occurs_a[offset as usize] as f64 / 255.0;
//distc += 1;
acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0);
}
let res = results.entry(*url_a).or_insert(SearchResult {
url: url.clone(),
title: cached_title(db, *url_a).await,
description: cached_desc(db, *url_a).await,
url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
word_occurs: 0.0,
authorword_occurs: 0.0,
descriptionword_occurs: acc,
keyword_occurs: 0.0,
sitename_occurs: 0.0,
linkword_occurs: 0.0,
pagerank: 0.0,
relavence: 0.0,
words_contained: BTreeSet::new(),
total_query_words: args_count as f64,
words: Default::default(),
closest_match: 0.0,
phrase_match_count: 0.0,
match_acc: 0.0,
match_c: 0.0,
highest_match: 0.0,
});
res.descriptionword_occurs = acc;
res.words_contained.insert(*position);
*res.words.entry(*position).or_insert(0) += 1;
}
}
}
for (position, vals) in keyword_occurs.iter() {
if let Some(vals2) = keyword_occurs.get(&(*position + 1)) {
for (url_a, occurs_a) in vals {
let url = {
cached_unhash(db, *url_a).await
};
let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
let mut acc = 0.0;
let mut distacc = 0.0;
let mut distc = 0;
let mut smallest = f64::MAX;
let mut largest = 0.0;
let mut phrase_matches = 0;
for i in 0..occurs {
let offset = (i) + 8;
let dist = occurs_a[offset as usize] as f64 / 255.0;
for (url_b, occurs_b) in vals2 {
if url_a != url_b {
distc -= 4;
continue;
}
let occurs2 = u64::from_be_bytes(occurs_b.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
for j in 0..occurs2 {
let offset2 = (j) + 8;
let dist2 = occurs_b[offset2 as usize] as f64 / 255.0;
distacc += (dist2 - dist) * 12.0;
distc += 1;
if (dist2 - dist) < smallest {
smallest = (dist2 - dist);
}
if (dist2 - dist) >= 0.0 && (dist2 - dist) > largest {
largest = (dist2 - dist);
}
if (dist2 - dist) >= 0.0 && (dist2 - dist) <= (2.1 / (*word_count.get(url_a).unwrap_or(&1) as f64).max(20.0)) {
phrase_matches += 1;
}
acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0) * multiword_penalty(dist, dist2);
}
}
}
let res = results.entry(*url_a).or_insert(SearchResult {
url: url.clone(),
title: cached_title(db, *url_a).await,
description: cached_desc(db, *url_a).await,
url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
word_occurs: 0.0,
authorword_occurs: 0.0,
descriptionword_occurs: 0.0,
keyword_occurs: acc,
sitename_occurs: 0.0,
linkword_occurs: 0.0,
pagerank: 0.0,
relavence: 0.0,
words_contained: BTreeSet::new(),
total_query_words: args_count as f64,
words: Default::default(),
closest_match: f64::MAX,
phrase_match_count: 0.0,
match_acc: 0.0,
match_c: 0.0,
highest_match: 0.0,
});
res.keyword_occurs = acc;
res.match_acc += distacc;
res.match_c += distc as f64;
if smallest < res.closest_match {
res.closest_match = smallest;
}
res.phrase_match_count += phrase_matches as f64;
*res.words.entry(*position).or_insert(0) += 1;
}
} else {
for (url_a, occurs_a) in vals {
let url = {
cached_unhash(db, *url_a).await
};
let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
let mut acc = 0.0;
let mut distc = 0;
for i in 0..occurs {
let offset = (i) + 8;
let dist = occurs_a[offset as usize] as f64 / 255.0;
//distc += 1;
acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0);
}
let res = results.entry(*url_a).or_insert(SearchResult {
url: url.clone(),
title: cached_title(db, *url_a).await,
description: cached_desc(db, *url_a).await,
url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
word_occurs: 0.0,
authorword_occurs: 0.0,
descriptionword_occurs: 0.0,
keyword_occurs: acc,
sitename_occurs: 0.0,
linkword_occurs: 0.0,
pagerank: 0.0,
relavence: 0.0,
words_contained: BTreeSet::new(),
total_query_words: args_count as f64,
words: Default::default(),
closest_match: 0.0,
phrase_match_count: 0.0,
match_acc: 0.0,
match_c: 0.0,
highest_match: 0.0,
});
res.keyword_occurs = acc;
res.words_contained.insert(*position);
*res.words.entry(*position).or_insert(0) += 1;
}
}
}
let mut max_relavence = 0.0;
let mut longest_url = 1;
let mut max_words = 1;
let mut phrase_found = false;
// todo: move to database
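// First pass: attach pageranks, note whether the full phrase was matched
// anywhere, and record the longest url and the best query-word coverage for
// the normalisation pass below.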
results.iter_mut().for_each(|v| {
v.1.pagerank = PRECALCED_PAGERANKS.lock().unwrap().get(v.0).copied().unwrap_or_else(||
*CACHED_PAGERANKS.lock().unwrap().get(v.0).unwrap()
);
//debug!("{},{}", v.1.phrase_match_count, args_count);
if v.1.phrase_match_count + 1.0 >= args_count as f64 {
phrase_found = true;
}
if v.1.url.len() > longest_url {
longest_url = v.1.url.len();
}
let mut wc = 1;
for word in &arg_words {
if v.1.url.to_lowercase().contains(word) {
wc += 1;
}
}
if wc > max_words {
max_words = wc;
}
v.1.relavence = v.1.relavence(no_separator_flag);
//debug!("{} -> {}/{}", v.1.url, v.1.words_contained, v.1.total_query_words);
if v.1.relavence > max_relavence {
max_relavence = v.1.relavence;
}
});
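// Second pass: dampen very long urls, strongly boost urls that literally
// contain query words, and apply the configured derank multipliers.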
results.iter_mut().for_each(|v| {
let url_multiplier = ((v.1.url.len() as f64 * 2.3) / (longest_url as f64));
//debug!("url {} multiplier {}", v.1.url, url_multiplier);
let mut word_multiplier = 1;
for word in &arg_words {
if v.1.url.to_lowercase().contains(word) {
word_multiplier += 1;
}
}
let word_multiplier = (word_multiplier as f64 * 4.0) / max_words as f64;
let mut icky_found = 1.0;
for (_, derank) in deranks.iter() {
if v.1.url.contains(&derank.urlmatch) {
if let Some(and) = &derank.and {
if !v.1.url.contains(and) {
continue;
}
}
if let Some(unless) = &derank.unless {
if v.1.url.contains(unless) {
continue;
}
}
icky_found *= derank.amount;
}
}
v.1.relavence *= word_multiplier.powi(8).min((max_words as f64).powi(8));
v.1.relavence /= url_multiplier.max(1.0).powi(4);
v.1.relavence *= icky_found;
});
//results.iter_mut().for_each(|v| v.relevance /= max_occurs as f64);
let mut results: Vec<SearchResult> = results.into_iter().map(|v| v.1).collect();
max_relavence = 0.0;
results.sort_by(|a, b| {
if a.relavence > max_relavence {
max_relavence = a.relavence;
}
if b.relavence > max_relavence {
max_relavence = b.relavence;
}
b.relavence.total_cmp(&a.relavence)
});
let mut results_final = vec![];
for result in results {
results_final.push(searchservice::SearchResult {
url: result.url,
relevance: result.relavence,
title: result.title,
description: result.description,
});
}
let time = chrono::Utc::now().signed_duration_since(start_t).num_milliseconds() as f64 / 1000.0;
let blocked = blocked.lock().unwrap();
Some(searchservice::SearchResponse {
results: results_final,
blocked: blocked.clone(),
pagerank_time_seconds: pagerank_secs,
total_query_seconds: time,
max_relevance: max_relavence,
exact_phrase_found: phrase_found,
})
}