/*
 * searchservice algorithm.rs
 * - how search works
 *
 * Copyright (C) 2025 Real Microsoft, LLC
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::ops::Mul;
use async_recursion::async_recursion;
use once_cell::sync::Lazy;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
use log::{debug, info};
use rand::Rng;
use asklyphe_common::db;
use asklyphe_common::db::STRHASH;
use asklyphe_common::foundationdb::Database;
use asklyphe_common::nats::searchservice;

/// Per-url scoring state accumulated while a query is being processed.
pub struct SearchResult {
    pub url: String,
    pub title: Option<String>,
    pub description: Option<String>,
    pub url_contains_result: Option<usize>,
    pub word_occurs: f64,
    pub authorword_occurs: f64,
    pub descriptionword_occurs: f64,
    pub keyword_occurs: f64,
    pub sitename_occurs: f64,
    pub linkword_occurs: f64,
    pub pagerank: f64,
    pub relavence: f64,
    pub words_contained: BTreeSet<usize>,
    pub total_query_words: f64,
    pub words: BTreeMap<usize, usize>,
    pub closest_match: f64,
    pub phrase_match_count: f64,
    pub match_acc: f64,
    pub match_c: f64,
    pub highest_match: f64,
}

impl SearchResult {
    /// Combines the per-field occurrence scores, pagerank, and multi-word match
    /// statistics into a single relevance value for this result.
    pub fn relavence(&self, no_separator_flag: bool) -> f64 {
        if !no_separator_flag {
            ((self.word_occurs * 1.3) + (self.descriptionword_occurs * 1.2) + (self.keyword_occurs * 0.5) + (self.authorword_occurs * 1.2) + (self.sitename_occurs * 1.3) + (self.linkword_occurs * 2.9) + (self.pagerank.powi(4)).max(0.001))
                .max(0.01)
                *
                if self.total_query_words > 1.0 {
                    ((1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
                        + if !self.words_contained.is_empty() { (self.total_query_words / self.words_contained.len() as f64).max(0.002) } else { 1.0 }).max(0.001)
                        * self.phrase_match_count.max(0.02).powi(5)
                } else {
                    1.0
                }
        } else {
            if self.total_query_words > 1.0 {
                (1.0001 - if self.match_c > 0.0 { (self.match_acc / self.match_c) / self.highest_match.max(0.01) } else { 0.1 })
                    * self.phrase_match_count.max(0.02).powi(8)
            } else {
                1.0
            }
        }
    }
}

pub static PRECALCED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
pub static CACHED_PAGERANKS: Lazy<Mutex<BTreeMap<STRHASH, f64>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
pub static VISITED_PAGERANKS: Lazy<Mutex<BTreeSet<STRHASH>>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
pub static VISITED_PAGERANKS2: Lazy<Mutex<BTreeSet<STRHASH>>> = Lazy::new(|| Mutex::new(BTreeSet::new()));
pub static HASH_CACHE: Lazy<HashCache<String, STRHASH>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
pub static UNHASH_CACHE: Lazy<HashCache<STRHASH, String>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
pub static TITLE_CACHE: Lazy<HashCache<STRHASH, Option<String>>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));
pub static DESC_CACHE: Lazy<HashCache<STRHASH, Option<String>>> = Lazy::new(|| HashCache::new(Arc::new(Mutex::new(BTreeMap::new()))));

/// Small in-memory cache over a shared BTreeMap, used to avoid repeating database lookups.
pub struct HashCache<K: Ord, V: Clone> {
    inner: Arc<Mutex<BTreeMap<K, V>>>,
}

pub enum EntryBuilder<K: Ord, V: Clone> {
    Found(V),
    NeedsInsert((K, Arc<Mutex<BTreeMap<K, V>>>)),
}

impl<K: Ord, V: Clone> EntryBuilder<K, V> {
    pub async fn or_insert(self, value: impl futures::Future<Output=V>) -> V {
        match self {
            EntryBuilder::Found(v) => { v }
            EntryBuilder::NeedsInsert((key, inner)) => {
                let value = (value).await;
                inner.lock().unwrap().insert(key, value.clone());
                value
            }
        }
    }
}

impl<K: Ord, V: Clone> HashCache<K, V> {
    pub fn new(inner: Arc<Mutex<BTreeMap<K, V>>>) -> Self {
        Self {
            inner,
        }
    }

    pub fn entry(&self, key: K) -> EntryBuilder<K, V> {
        if let Some(value) = self.inner.lock().unwrap().get(&key).cloned() {
            EntryBuilder::Found(value)
        } else {
            EntryBuilder::NeedsInsert((key, self.inner.clone()))
        }
    }

    pub fn unconditional(&self, key: K) -> EntryBuilder<K, V> {
        EntryBuilder::NeedsInsert((key, self.inner.clone()))
    }
}

pub async fn cached_hash(db: &Database, key: String) -> STRHASH {
    HASH_CACHE.entry(key.clone()).or_insert(db::foa_strhash(db, &key)).await
}

pub async fn cached_unhash(db: &Database, key: STRHASH) -> String {
    UNHASH_CACHE.entry(key).or_insert(db::unhash(db, key)).await
}

pub async fn cached_title(db: &Database, key: STRHASH) -> Option<String> {
    if let Some(title) = TITLE_CACHE.entry(key).or_insert(db::info_title(db, key)).await {
        Some(title)
    } else {
        TITLE_CACHE.unconditional(key).or_insert(db::info_title(db, key)).await
    }
}

pub async fn cached_desc(db: &Database, key: STRHASH) -> Option<String> {
    if let Some(desc) = DESC_CACHE.entry(key).or_insert(db::info_description(db, key)).await {
        Some(desc)
    } else {
        DESC_CACHE.unconditional(key).or_insert(db::info_description(db, key)).await
    }
}

/// Recursive pagerank over a page's incoming links, memoized in
/// PRECALCED_PAGERANKS / CACHED_PAGERANKS and persisted back to the database.
#[async_recursion]
pub async fn pagerank(db: &Database, url: STRHASH) -> f64 {
    if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
        return *precalc;
    }
    if let Some(precalc_db) = db::page_pagerank(db, url).await {
        //debug!("url {} in db {}", url, precalc_db);
        if precalc_db == 0.0 {
            //debug!("but 0 ):");
        } else {
            CACHED_PAGERANKS.lock().unwrap().insert(url, precalc_db);
            return precalc_db;
        }
    }
    let mut accum = 0.0;
    let incoming = {
        db::page_links_entering(db, url).await
    };
    let d = {
        db::page_damping(db, url).await.unwrap_or(0.85)
    };
    // from here on, d is the complement of the stored damping factor
    let d = (1.0 - d).max(0.0);
    for url in incoming {
        if PRECALCED_PAGERANKS.lock().unwrap().get(&url).is_none() && VISITED_PAGERANKS2.lock().unwrap().contains(&url) {
            continue;
        }
        let c = {
            db::page_links_exiting_count(db, url).await
        };
        if c == 0 { continue; }
        if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
            if *precalc != 0.0 {
                accum += *precalc / c as f64;
                continue;
            }
        }
        VISITED_PAGERANKS2.lock().unwrap().insert(url);
        let pr = pagerank(db, url).await;
        accum += pr / c as f64;
    }

    let pr = (1.0 - d) + (d * accum);
    db::document_set_pagerank(db, url, pr, chrono::Utc::now().timestamp()).await;
    PRECALCED_PAGERANKS.lock().unwrap().insert(url, pr);
    CACHED_PAGERANKS.lock().unwrap().insert(url, pr);
    pr
}

pub const MAX_PAGERANK_APPROX_DEPTH: u64 = 1;

/// Cheap pagerank approximation used at query time: it only follows incoming
/// links up to MAX_PAGERANK_APPROX_DEPTH, and spawns a background task to
/// compute the real pagerank when a page is missing from the database or
/// stored as 0.0.
#[async_recursion]
pub async fn pagerank_approx(db: &Database, url: STRHASH, depth: u64) -> f64 {
    if depth > MAX_PAGERANK_APPROX_DEPTH {
        return 0.8;
    }
    if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
        return *precalc;
    } else if let Some(precalc) = CACHED_PAGERANKS.lock().unwrap().get(&url) {
        return *precalc;
    }

    let mut not_in_db = false;
    let mut in_db_but_zero = false;

    if let Some(precalc_db) = db::page_pagerank(db, url).await {
        //debug!("url {} in db {}", url, precalc_db);
        if precalc_db == 0.0 {
            //debug!("but 0 ):");

            // uncomment when we want to eventually try to recalc 0.0 prs
            //not_in_db = true;
            //in_db_but_zero = true;
            if depth == 0 {
                tokio::spawn(async move {
                    info!("task spawned to calc real pagerank...");
                    let db = Database::default().expect("couldn't connect to foundation db!");
                    let pr = pagerank(&db, url).await;
                    info!("finished calculating {} real pagerank: {}", url, pr);
                });
            }
        }// else {
        CACHED_PAGERANKS.lock().unwrap().insert(url, precalc_db);
        return precalc_db;
        //}
    } else {
        not_in_db = true;
    }

    // spawn task to eventually calculate real pagerank
    if depth == 0 && not_in_db {
        tokio::spawn(async move {
            //info!("task spawned to calc real pagerank...");
            let db = Database::default().expect("couldn't connect to foundation db!");
            pagerank(&db, url).await;
            //info!("finished calculating {} real pagerank: {}", url, pr);
        });
    }

    if in_db_but_zero {
        CACHED_PAGERANKS.lock().unwrap().insert(url, 0.0);
        return 0.0;
    }

    let mut accum = 0.0;
    let incoming = {
        db::page_links_entering(db, url).await
    };
    let d = {
        db::page_damping(db, url).await.unwrap_or(0.85)
    };
    let d = (1.0 - d).max(0.0);
    for url in incoming {
        if PRECALCED_PAGERANKS.lock().unwrap().get(&url).is_none() && CACHED_PAGERANKS.lock().unwrap().get(&url).is_none() && VISITED_PAGERANKS.lock().unwrap().contains(&url) {
            continue;
        }
        let c = {
            db::page_links_exiting_count(db, url).await
        };
        if c == 0 { continue; }
        if let Some(precalc) = PRECALCED_PAGERANKS.lock().unwrap().get(&url) {
            accum += *precalc / c as f64;
            continue;
        }
        if let Some(precalc) = CACHED_PAGERANKS.lock().unwrap().get(&url) {
            accum += *precalc / c as f64;
            continue;
        }
        VISITED_PAGERANKS.lock().unwrap().insert(url);
        let pr = pagerank_approx(db, url, depth + 1).await;
        CACHED_PAGERANKS.lock().unwrap().insert(url, pr);
        accum += pr / c as f64;
    }

    let pr = (1.0 - d) + (d * accum);
    CACHED_PAGERANKS.lock().unwrap().insert(url, pr);
    pr
}

fn multiword_penalty(dist_a: f64, dist_b: f64) -> f64 {
    if (dist_b - dist_a).is_sign_negative() {
        // second comes before first, not good!
        dist_b - dist_a
    } else {
        1.0 - (dist_b - dist_a)
    }
}
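
// Illustrative values (not part of the original source): with word positions
// normalized to [0.0, 1.0], multiword_penalty(0.2, 0.5) ~= 0.7 (the second word
// follows the first and is close by), while multiword_penalty(0.5, 0.2) ~= -0.3
// (the second word appears before the first, so the pair is penalized).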

/// Runs a query against the word, linkword, and metaword indexes, scores each
/// candidate url, and returns ranked results along with timing and blocked-url info.
pub async fn search(db: &Database, args: Vec<String>, no_separator_flag: bool) -> Option<searchservice::SearchResponse> {
    let mut results: BTreeMap<STRHASH, SearchResult> = BTreeMap::new();
    let start_t = chrono::Utc::now();

    if args.is_empty() {
        return None;
    }

    let args_count = args.len();

    let arg_words = args.clone();

    //let word = format!("_{}_", word);

    let first_query = args.first().cloned();

    let mut hash = vec![];
    for (i, word) in args.into_iter().enumerate() {
        hash.push((i, cached_hash(db, word.clone()).await));
        //hash.push((i, cached_hash(db, format!("{}.", word)).await));
        //hash.push((i, cached_hash(db, format!(".{}", word)).await));
        //hash.push((i, cached_hash(db, format!(".{}.", word)).await));
        //hash.push((i, cached_hash(db, format!("{}s", word)).await));
    }

    //let hash: Vec<(usize, STRHASH)> = hash.into_iter().filter_map(|v| v.1.map(|b| (v.0, b))).collect();

    if hash.is_empty() {
        println!("none in database");
        return None;
    }

    let first_query = first_query.unwrap();

    let secondary_words = [
        db::hash("how"),
        db::hash("is"),
        db::hash("are"),
        db::hash("the"),
        db::hash("a"),
        db::hash("when"),
        db::hash("what"),
        db::hash("why"),
        db::hash("to"),
        db::hash("where"),
        db::hash("from"),
        db::hash("best"),
        db::hash("for"),
        db::hash("like"),
    ];

    let mut secondary_indices = BTreeSet::new();

    let mut word_occurs = BTreeMap::new();
    for hash in &hash {
        if secondary_words.contains(&hash.1) {
            secondary_indices.insert(hash.0);
        }
        word_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
            db::word_occurs(db, hash.1).await);
    }
    //let mut authorword_occurs = BTreeMap::new();
    let mut descriptionword_occurs = BTreeMap::new();
    let mut keyword_occurs = BTreeMap::new();
    //let mut sitenameword_occurs = BTreeMap::new();
    for hash in &hash {
        //authorword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
        //    db::metaword_occurs(db, db::hash("author"), hash.1).await);
        descriptionword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
            db::metaword_occurs(db, db::hash("description"), hash.1).await);
        keyword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
            db::metaword_occurs(db, db::hash("keywords"), hash.1).await);
        //sitenameword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
        //    db::metaword_occurs(db, db::hash("site_name"), hash.1).await);
    }
    let mut linkword_occurs = BTreeMap::new();
    for hash in &hash {
        linkword_occurs.entry(hash.0).or_insert(VecDeque::new()).extend(
            db::linkword_occurs(db, hash.1).await);
    }

    let mut urls = vec![];

    for (_, vals) in &word_occurs {
        for (url, _) in vals {
            urls.push(*url);
        }
    }
    for (_, vals) in &linkword_occurs {
        for (url, _) in vals {
            urls.push(*url);
        }
    }
    for (_, vals) in &descriptionword_occurs {
        for (url, _) in vals {
            urls.push(*url);
        }
    }
    for (_, vals) in &keyword_occurs {
        for (url, _) in vals {
            urls.push(*url);
        }
    }

    let mut useless_urls = vec![];

    // we want to remove urls that aren't contained in every word index (i.e. urls that don't contain every word of the search query)
    // however, we don't want to remove the url if it's only missing from a secondary index (i.e. if someone searches "best x",
    // we don't want to remove results that don't contain the word "best")
    // we also don't want to remove results if the page doesn't have that word, but the linkwords / metawords do (see the example below)
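    // For example (illustrative, not from the original source): for the query
    // "best rust tutorial", "best" is a secondary word, so a page matching only
    // "rust" and "tutorial" is kept; a page missing "tutorial" from its body is
    // kept only if at least two of the linkword / description / keyword buckets
    // contain it for that query position.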
    for url in &urls {
        // for every word index...
        for (index, vals) in &word_occurs {
            // don't worry about secondary indices
            if secondary_indices.contains(index) {
                continue;
            }
            // collect urls
            let urls: Vec<STRHASH> = vals.iter().map(|(url, _)| *url).collect();
            // is this url not in the words bucket for this index?
            if !urls.contains(url) {
                // does another bucket contain it for this index?
                let mut found_elsewhere = 0;
                if linkword_occurs.get(index).unwrap().iter()
                    .map(|(url, _)| *url).collect::<Vec<STRHASH>>().contains(url) {
                    found_elsewhere += 1;
                }
                if descriptionword_occurs.get(index).unwrap().iter()
                    .map(|(url, _)| *url).collect::<Vec<STRHASH>>().contains(url) {
                    found_elsewhere += 1;
                }
                if keyword_occurs.get(index).unwrap().iter()
                    .map(|(url, _)| *url).collect::<Vec<STRHASH>>().contains(url) {
                    found_elsewhere += 1;
                }

                if found_elsewhere < 2 {
                    // not found in enough of the other buckets, so this url doesn't match every meaningful word in the query
                    useless_urls.push(*url);
                }
            }
        }
    }

    //for (_, vals) in &authorword_occurs {
    //    for (url, _) in vals {
    //        urls.push(*url);
    //    }
    //}
    //for (_, vals) in &sitenameword_occurs {
    //    for (url, _) in vals {
    //        urls.push(*url);
    //    }
    //}
    //for url in &urls {
    //    for (_, vals) in &linkword_occurs {
    //        let urls: Vec<STRHASH> = vals.iter().map(|(u, _)| *u).collect();
    //        if !urls.contains(url) {
    //            useless_urls.push(*url);
    //        }
    //    }
    //}

    urls.sort();
    urls.dedup();
    urls.retain(|u| !useless_urls.contains(u));

    for vals in word_occurs.values_mut() {
        vals.retain(|(u, _)| !useless_urls.contains(u));
    }
    for vals in linkword_occurs.values_mut() {
        vals.retain(|(u, _)| !useless_urls.contains(u));
    }
    for vals in descriptionword_occurs.values_mut() {
        vals.retain(|(u, _)| !useless_urls.contains(u));
    }
    for vals in keyword_occurs.values_mut() {
        vals.retain(|(u, _)| !useless_urls.contains(u));
    }

    let start = chrono::Utc::now();

    let word_count = Arc::new(Mutex::new(BTreeMap::new()));

    let allowed = Arc::new(AtomicUsize::new(0));
    // accumulated pagerank across the url tasks; shared so that the pruning
    // heuristic can compare each page against the running average
    let pr_acc = Arc::new(Mutex::new(0.0f64));

    const ALLOWED_BEFORE_PRUNE: usize = 512;
    const ALLOWED_BEFORE_MEGAPRUNE: usize = 777;
    const ALLOWED_BEFORE_GIGAPRUNE: usize = 2048;
    //const ICKY_WORDS: &[&str] = &[
    //    "distrowatch.com", // distrowatch is taking up too many results at the moment, remove this later
    //    "mariowiki.com", // mariowiki is taking up too many results at the moment, remove this later
    //    "wired.com", // we have too many wired articles
    //    "wired.cz", // we are very targeted at an english audience, they probably don't want czech wired articles at the moment
    //    "neocities.org/browse?", // people probably don't want to visit neocities tag lists
    //    "https://distrowatch.com/?language=", // people probably aren't looking for the distrowatch homepage in a random language
    //    "https://distrowatch.com/weekly.php/weekly.php?issue=", // a lot of results are unrelated distrowatch weekly posts
    //    "terms", // people probably aren't looking for tos pages
    //    "statement", // people probably aren't looking for tos pages
    //    "3cx", // nonenglish voip company, takes up unrelated search queries
    //    "1377x", // phishing site pretending to be 1337x, temporary fix until we can implement something like site blocking
    //    "//kickasstorrents.", // kickasstorrents has been down for years, only remaining sites are likely phishing scams
    //    "//kickasstorrent.", // kickasstorrents has been down for years, only remaining sites are likely phishing scams
    //    "//katcr.to", // fake kickasstorrents site
    //    "//kat.am", // fake kickasstorrents site
    //    "//kikass.to", // fake kickasstorrents site
    //    "//thepiratebays.com", // fake thepiratebay site
    //    ".fandom.com", // fuck fandom.com (todo: remove this since ultimately, it should be the user's choice to block fandom)
    //];

    // todo: since our list is so small this is okay for now, but we should cache this in the future
    let deranks = Arc::new(db::get_deranked_websites(db).await);

    let initial_pruned = Arc::new(Mutex::new(vec![]));
    // (url, reason)
    let blocked: Arc<Mutex<Vec<(String, String)>>> = Arc::new(Mutex::new(vec![]));

    debug!("checking {} urls", urls.len());

    let mut url_tasks = vec![];
    for (i, url) in urls.into_iter().enumerate() {
        let allowed = allowed.clone();
        let blocked = blocked.clone();
        let initial_pruned = initial_pruned.clone();
        let word_count = word_count.clone();
        let pr_acc = pr_acc.clone();
        let arg_words = arg_words.clone();
        let deranks = deranks.clone();
        url_tasks.push(tokio::spawn(async move {
            if i > ALLOWED_BEFORE_GIGAPRUNE {
                initial_pruned.lock().unwrap().push(url);
                return;
            }
            let db = Database::default().expect("FAILED TO CREATE NEW FDB HANDLE");
            let surl = {
                cached_unhash(&db, url).await
            }.to_lowercase();
            let mut contains_query_word = false;
            for w in &arg_words {
                if surl.contains(w) {
                    contains_query_word = true;
                    break;
                }
            }
            let mut prepruned = false;
            for (_, derank) in deranks.iter() {
                if surl.contains(&derank.urlmatch) || surl.contains(&derank.urlmatch.replace("//", ".")) {
                    if let Some(and) = &derank.and {
                        if !surl.contains(and) {
                            continue;
                        }
                    }
                    if let Some(unless) = &derank.unless {
                        if surl.contains(unless) {
                            continue;
                        }
                    }
                    if !contains_query_word &&
                        (i > ALLOWED_BEFORE_MEGAPRUNE || (i > ALLOWED_BEFORE_PRUNE && derank.amount < 0.85)) {
                        initial_pruned.lock().unwrap().push(url);
                        prepruned = true;
                    }
                    if derank.amount == 0.0 {
                        initial_pruned.lock().unwrap().push(url);
                        blocked.lock().unwrap().push((surl.clone(), derank.comment.clone()));
                        prepruned = true;
                    }
                }
            }
            if prepruned {
                return;
            }
            let pr = pagerank_approx(&db, url, 0).await;
            if i > ALLOWED_BEFORE_PRUNE {
                let mut contains_query_word = false;
                for w in &arg_words {
                    if surl.contains(w) {
                        contains_query_word = true;
                        break;
                    }
                }
                let avg_pr = *pr_acc.lock().unwrap() / i as f64;
                if contains_query_word || (pr > (avg_pr.max(0.01) * if i > ALLOWED_BEFORE_MEGAPRUNE { 4.0 } else { 1.0 }) || (i < ALLOWED_BEFORE_MEGAPRUNE)) {
                    let wc = db::page_word_count(&db, url).await;
                    word_count.lock().unwrap().insert(url, wc);
                    *pr_acc.lock().unwrap() += pr;
                    allowed.fetch_add(1, Ordering::Relaxed);
                } else {
                    initial_pruned.lock().unwrap().push(url);
                }
            } else {
                let wc = db::page_word_count(&db, url).await;
                word_count.lock().unwrap().insert(url, wc);
                *pr_acc.lock().unwrap() += pr;
                allowed.fetch_add(1, Ordering::Relaxed);
            }
            //let url = {
            //};
            //debug!("{} pr: {}", url, pr);
            //debug!("{} wc: {}", url, wc);

            // precache values
            cached_unhash(&db, url).await;
            cached_title(&db, url).await;
            cached_desc(&db, url).await;
        }));
    }

    for url_task in url_tasks {
        url_task.await.expect("url task failure");
    }

    let initial_pruned = initial_pruned.lock().unwrap().clone();
    let word_count = word_count.lock().unwrap().clone();

    debug!("pruned {} results ({}, {})", initial_pruned.len(), *pr_acc.lock().unwrap(), allowed.load(Ordering::Relaxed));

    for vals in word_occurs.values_mut() {
        vals.retain(|(u, _)| !initial_pruned.contains(u));
    }
    for vals in linkword_occurs.values_mut() {
        vals.retain(|(u, _)| !initial_pruned.contains(u));
    }
    for vals in descriptionword_occurs.values_mut() {
        vals.retain(|(u, _)| !initial_pruned.contains(u));
    }
    for vals in keyword_occurs.values_mut() {
        vals.retain(|(u, _)| !initial_pruned.contains(u));
    }

    let pagerank_secs = chrono::Utc::now().signed_duration_since(start).num_milliseconds() as f64 / 1000.0;

    info!("pageranks in {} secs", pagerank_secs);

    //word_occurs.sort_by(|a, b| {
    //    let av = *PRECALCED_PAGERANKS.lock().unwrap().get(&b.1.0).unwrap();
    //    av.total_cmp(PRECALCED_PAGERANKS.lock().unwrap().get(&a.1.0).unwrap())
    //});
    //metaword_occurs.sort_by(|a, b| {
    //    let av = *PRECALCED_PAGERANKS.lock().unwrap().get(&b.1.0).unwrap();
    //    av.total_cmp(PRECALCED_PAGERANKS.lock().unwrap().get(&a.1.0).unwrap())
    //});
    //linkword_occurs.sort_by(|a, b| {
    //    let av = *PRECALCED_PAGERANKS.lock().unwrap().get(&b.1.0).unwrap();
    //    av.total_cmp(PRECALCED_PAGERANKS.lock().unwrap().get(&a.1.0).unwrap())
    //});

    pub const MULTIWORD_PENALTY: f64 = 0.0001;
    pub const PHRASE_PERCENTAGE: f64 = 0.7;

    for (position, vals) in word_occurs.iter() {
        if let Some(vals2) = word_occurs.get(&(*position + 1)) {
            for (url_a, occurs_a) in vals {
                let url = {
                    cached_unhash(db, *url_a).await
                };
                let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                let mut acc = 0.0;
                let mut distacc = 0.0;
                let mut distc = 0;
                let mut smallest = f64::MAX;
                let mut largest = 0.0;
                let mut phrase_matches = 0;
                for i in 0..occurs {
                    let offset = (i * 2) + 8;
                    let dist = occurs_a[offset as usize] as f64 / 255.0;
                    for (url_b, occurs_b) in vals2 {
                        if url_a != url_b {
                            continue;
                        }
                        let occurs2 = u64::from_be_bytes(occurs_b.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                        for j in 0..occurs2 {
                            let offset2 = (j * 2) + 8;
                            let dist2 = occurs_b[offset2 as usize] as f64 / 255.0;
                            distacc += dist2 - dist;
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) < smallest {
                                smallest = dist2 - dist;
                            }
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) > largest {
                                largest = dist2 - dist;
                            }
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) <= (2.1 / (*word_count.get(url_a).unwrap_or(&1) as f64).max(1.0)) {
                                phrase_matches += 1;
                            }
                            distc += 1;
                            acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0) * multiword_penalty(dist, dist2);
                        }
                    }
                }
                let res = results.entry(*url_a).or_insert(SearchResult {
                    url: url.clone(),
                    title: cached_title(db, *url_a).await,
                    description: cached_desc(db, *url_a).await,
                    url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
                    word_occurs: acc,
                    authorword_occurs: 0.0,
                    descriptionword_occurs: 0.0,
                    keyword_occurs: 0.0,
                    sitename_occurs: 0.0,
                    linkword_occurs: 0.0,
                    pagerank: 0.0,
                    relavence: 0.0,
                    words_contained: BTreeSet::new(),
                    total_query_words: args_count as f64,
                    words: Default::default(),
                    closest_match: f64::MAX,
                    phrase_match_count: 0.0,
                    match_acc: 0.0,
                    match_c: 0.0,
                    highest_match: 0.0,
                });
                res.word_occurs = acc;
                res.words_contained.insert(*position);
                res.match_acc += distacc;
                res.match_c += distc as f64;
                if smallest < res.closest_match {
                    res.closest_match = smallest;
                }
                // track the largest forward gap seen for this url
                if largest > res.highest_match {
                    res.highest_match = largest;
                }
                res.phrase_match_count += phrase_matches as f64;
                *res.words.entry(*position).or_insert(0) += 1;
            }
        } else {
            for (urlv, occursv) in vals {
                let url = {
                    cached_unhash(db, *urlv).await
                };
                let occurs = u64::from_be_bytes(occursv.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                let mut acc = 0.0;
                let mut distc = 0;
                for i in 0..occurs {
                    let offset = (i * 2) + 8;
                    let dist = occursv[offset as usize] as f64 / 255.0;
                    //distc += 1;
                    acc += (-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0;
                }
                let res = results.entry(*urlv).or_insert(SearchResult {
                    url: url.clone(),
                    title: cached_title(db, *urlv).await,
                    description: cached_desc(db, *urlv).await,
                    url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
                    word_occurs: acc,
                    authorword_occurs: 0.0,
                    descriptionword_occurs: 0.0,
                    keyword_occurs: 0.0,
                    sitename_occurs: 0.0,
                    linkword_occurs: 0.0,
                    pagerank: 0.0,
                    relavence: 0.0,
                    words_contained: BTreeSet::new(),
                    total_query_words: args_count as f64,
                    words: Default::default(),
                    closest_match: 0.0,
                    phrase_match_count: 0.0,
                    match_acc: 0.0,
                    match_c: 0.0,
                    highest_match: 0.0,
                });
                res.word_occurs = acc;
                res.words_contained.insert(*position);
                *res.words.entry(*position).or_insert(0) += 1;
            }
        }
    }

    for (position, vals) in linkword_occurs.iter() {
        if let Some(vals2) = linkword_occurs.get(&(*position + 1)) {
            for (url_a, occurs_a) in vals {
                let url = {
                    cached_unhash(db, *url_a).await
                };
                let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                let mut acc = 0.0;
                let mut distacc = 0.0;
                let mut distc = 0;
                let mut smallest = f64::MAX;
                let mut largest = 0.0;
                let mut phrase_matches = 0;
                for i in 0..occurs {
                    let offset = i + 8;
                    let dist = occurs_a[offset as usize] as f64 / 255.0;
                    for (url_b, occurs_b) in vals2 {
                        if url_a != url_b {
                            distc -= 4;
                            continue;
                        }
                        let occurs2 = u64::from_be_bytes(occurs_b.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                        for j in 0..occurs2 {
                            let offset2 = j + 8;
                            let dist2 = occurs_b[offset2 as usize] as f64 / 255.0;
                            distacc += (dist2 - dist) * 12.0;
                            distc += 1;
                            if (dist2 - dist) < smallest {
                                smallest = dist2 - dist;
                            }
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) > largest {
                                largest = dist2 - dist;
                            }
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) <= (2.1 / (*word_count.get(url_a).unwrap_or(&1) as f64).max(20.0)) {
                                phrase_matches += 1;
                            }
                            acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0) * multiword_penalty(dist, dist2);
                        }
                    }
                }
                let res = results.entry(*url_a).or_insert(SearchResult {
                    url: url.clone(),
                    title: cached_title(db, *url_a).await,
                    description: cached_desc(db, *url_a).await,
                    url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
                    word_occurs: 0.0,
                    authorword_occurs: 0.0,
                    descriptionword_occurs: 0.0,
                    keyword_occurs: 0.0,
                    sitename_occurs: 0.0,
                    linkword_occurs: acc,
                    pagerank: 0.0,
                    relavence: 0.0,
                    words_contained: BTreeSet::new(),
                    total_query_words: args_count as f64,
                    words: Default::default(),
                    closest_match: f64::MAX,
                    phrase_match_count: 0.0,
                    match_acc: 0.0,
                    match_c: 0.0,
                    highest_match: 0.0,
                });
                res.linkword_occurs = acc;
                res.match_acc += distacc;
                res.match_c += distc as f64;
                if smallest < res.closest_match {
                    res.closest_match = smallest;
                }
                res.phrase_match_count += phrase_matches as f64;
                *res.words.entry(*position).or_insert(0) += 1;
            }
        } else {
            for (url_a, occurs_a) in vals {
                let url = {
                    cached_unhash(db, *url_a).await
                };
                let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                let mut acc = 0.0;
                let mut distc = 0;
                for i in 0..occurs {
                    let offset = i + 8;
                    let dist = occurs_a[offset as usize] as f64 / 255.0;
                    //distc += 1;
                    acc += (-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0;
                }
                let res = results.entry(*url_a).or_insert(SearchResult {
                    url: url.clone(),
                    title: cached_title(db, *url_a).await,
                    description: cached_desc(db, *url_a).await,
                    url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
                    word_occurs: 0.0,
                    authorword_occurs: 0.0,
                    descriptionword_occurs: 0.0,
                    keyword_occurs: 0.0,
                    sitename_occurs: 0.0,
                    linkword_occurs: acc,
                    pagerank: 0.0,
                    relavence: 0.0,
                    words_contained: BTreeSet::new(),
                    total_query_words: args_count as f64,
                    words: Default::default(),
                    closest_match: 0.0,
                    phrase_match_count: 0.0,
                    match_acc: 0.0,
                    match_c: 0.0,
                    highest_match: 0.0,
                });
                res.linkword_occurs = acc;
                res.words_contained.insert(*position);
                *res.words.entry(*position).or_insert(0) += 1;
            }
        }
    }

    for (position, vals) in descriptionword_occurs.iter() {
        if let Some(vals2) = descriptionword_occurs.get(&(*position + 1)) {
            for (url_a, occurs_a) in vals {
                let url = {
                    cached_unhash(db, *url_a).await
                };
                let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                let mut acc = 0.0;
                let mut distacc = 0.0;
                let mut distc = 0;
                let mut smallest = f64::MAX;
                let mut largest = 0.0;
                let mut phrase_matches = 0;
                for i in 0..occurs {
                    let offset = i + 8;
                    let dist = occurs_a[offset as usize] as f64 / 255.0;
                    for (url_b, occurs_b) in vals2 {
                        if url_a != url_b {
                            distc -= 4;
                            continue;
                        }
                        let occurs2 = u64::from_be_bytes(occurs_b.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                        for j in 0..occurs2 {
                            let offset2 = j + 8;
                            let dist2 = occurs_b[offset2 as usize] as f64 / 255.0;
                            distacc += (dist2 - dist) * 12.0;
                            distc += 1;
                            if (dist2 - dist) < smallest {
                                smallest = dist2 - dist;
                            }
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) > largest {
                                largest = dist2 - dist;
                            }
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) <= (2.1 / (*word_count.get(url_a).unwrap_or(&1) as f64).max(20.0)) {
                                phrase_matches += 1;
                            }
                            acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0) * multiword_penalty(dist, dist2);
                        }
                    }
                }
                let res = results.entry(*url_a).or_insert(SearchResult {
                    url: url.clone(),
                    title: cached_title(db, *url_a).await,
                    description: cached_desc(db, *url_a).await,
                    url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
                    word_occurs: 0.0,
                    authorword_occurs: 0.0,
                    descriptionword_occurs: acc,
                    keyword_occurs: 0.0,
                    sitename_occurs: 0.0,
                    linkword_occurs: 0.0,
                    pagerank: 0.0,
                    relavence: 0.0,
                    words_contained: BTreeSet::new(),
                    total_query_words: args_count as f64,
                    words: Default::default(),
                    closest_match: f64::MAX,
                    phrase_match_count: 0.0,
                    match_acc: 0.0,
                    match_c: 0.0,
                    highest_match: 0.0,
                });
                res.descriptionword_occurs = acc;
                res.match_acc += distacc;
                res.match_c += distc as f64;
                if smallest < res.closest_match {
                    res.closest_match = smallest;
                }
                res.phrase_match_count += phrase_matches as f64;
                *res.words.entry(*position).or_insert(0) += 1;
            }
        } else {
            for (url_a, occurs_a) in vals {
                let url = {
                    cached_unhash(db, *url_a).await
                };
                let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                let mut acc = 0.0;
                let mut distc = 0;
                for i in 0..occurs {
                    let offset = i + 8;
                    let dist = occurs_a[offset as usize] as f64 / 255.0;
                    //distc += 1;
                    acc += (-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0;
                }
                let res = results.entry(*url_a).or_insert(SearchResult {
                    url: url.clone(),
                    title: cached_title(db, *url_a).await,
                    description: cached_desc(db, *url_a).await,
                    url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
                    word_occurs: 0.0,
                    authorword_occurs: 0.0,
                    descriptionword_occurs: acc,
                    keyword_occurs: 0.0,
                    sitename_occurs: 0.0,
                    linkword_occurs: 0.0,
                    pagerank: 0.0,
                    relavence: 0.0,
                    words_contained: BTreeSet::new(),
                    total_query_words: args_count as f64,
                    words: Default::default(),
                    closest_match: 0.0,
                    phrase_match_count: 0.0,
                    match_acc: 0.0,
                    match_c: 0.0,
                    highest_match: 0.0,
                });
                res.descriptionword_occurs = acc;
                res.words_contained.insert(*position);
                *res.words.entry(*position).or_insert(0) += 1;
            }
        }
    }

    for (position, vals) in keyword_occurs.iter() {
        if let Some(vals2) = keyword_occurs.get(&(*position + 1)) {
            for (url_a, occurs_a) in vals {
                let url = {
                    cached_unhash(db, *url_a).await
                };
                let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                let mut acc = 0.0;
                let mut distacc = 0.0;
                let mut distc = 0;
                let mut smallest = f64::MAX;
                let mut largest = 0.0;
                let mut phrase_matches = 0;
                for i in 0..occurs {
                    let offset = i + 8;
                    let dist = occurs_a[offset as usize] as f64 / 255.0;
                    for (url_b, occurs_b) in vals2 {
                        if url_a != url_b {
                            distc -= 4;
                            continue;
                        }
                        let occurs2 = u64::from_be_bytes(occurs_b.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                        for j in 0..occurs2 {
                            let offset2 = j + 8;
                            let dist2 = occurs_b[offset2 as usize] as f64 / 255.0;
                            distacc += (dist2 - dist) * 12.0;
                            distc += 1;
                            if (dist2 - dist) < smallest {
                                smallest = dist2 - dist;
                            }
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) > largest {
                                largest = dist2 - dist;
                            }
                            if (dist2 - dist) >= 0.0 && (dist2 - dist) <= (2.1 / (*word_count.get(url_a).unwrap_or(&1) as f64).max(20.0)) {
                                phrase_matches += 1;
                            }
                            acc += ((-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0) * multiword_penalty(dist, dist2);
                        }
                    }
                }
                let res = results.entry(*url_a).or_insert(SearchResult {
                    url: url.clone(),
                    title: cached_title(db, *url_a).await,
                    description: cached_desc(db, *url_a).await,
                    url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
                    word_occurs: 0.0,
                    authorword_occurs: 0.0,
                    descriptionword_occurs: 0.0,
                    keyword_occurs: acc,
                    sitename_occurs: 0.0,
                    linkword_occurs: 0.0,
                    pagerank: 0.0,
                    relavence: 0.0,
                    words_contained: BTreeSet::new(),
                    total_query_words: args_count as f64,
                    words: Default::default(),
                    closest_match: f64::MAX,
                    phrase_match_count: 0.0,
                    match_acc: 0.0,
                    match_c: 0.0,
                    highest_match: 0.0,
                });
                res.keyword_occurs = acc;
                res.match_acc += distacc;
                res.match_c += distc as f64;
                if smallest < res.closest_match {
                    res.closest_match = smallest;
                }
                res.phrase_match_count += phrase_matches as f64;
                *res.words.entry(*position).or_insert(0) += 1;
            }
        } else {
            for (url_a, occurs_a) in vals {
                let url = {
                    cached_unhash(db, *url_a).await
                };
                let occurs = u64::from_be_bytes(occurs_a.split_at(std::mem::size_of::<u64>()).0.try_into().unwrap());
                let mut acc = 0.0;
                let mut distc = 0;
                for i in 0..occurs {
                    let offset = i + 8;
                    let dist = occurs_a[offset as usize] as f64 / 255.0;
                    //distc += 1;
                    acc += (-(2.0 * dist + 1.0).log(1.2) / 30.0) + 1.0;
                }
                let res = results.entry(*url_a).or_insert(SearchResult {
                    url: url.clone(),
                    title: cached_title(db, *url_a).await,
                    description: cached_desc(db, *url_a).await,
                    url_contains_result: if url.contains(&first_query) { Some(url.len()) } else { None },
                    word_occurs: 0.0,
                    authorword_occurs: 0.0,
                    descriptionword_occurs: 0.0,
                    keyword_occurs: acc,
                    sitename_occurs: 0.0,
                    linkword_occurs: 0.0,
                    pagerank: 0.0,
                    relavence: 0.0,
                    words_contained: BTreeSet::new(),
                    total_query_words: args_count as f64,
                    words: Default::default(),
                    closest_match: 0.0,
                    phrase_match_count: 0.0,
                    match_acc: 0.0,
                    match_c: 0.0,
                    highest_match: 0.0,
                });
                res.keyword_occurs = acc;
                res.words_contained.insert(*position);
                *res.words.entry(*position).or_insert(0) += 1;
            }
        }
    }

    let mut max_relavence = 0.0;
    let mut longest_url = 1;
    let mut max_words = 1;

    let mut phrase_found = false;

    // todo: move to database

    results.iter_mut().for_each(|v| {
        v.1.pagerank = *PRECALCED_PAGERANKS.lock().unwrap().get(v.0).unwrap_or(
            CACHED_PAGERANKS.lock().unwrap().get(v.0).unwrap()
        );

        //debug!("{},{}", v.1.phrase_match_count, args_count);
        if v.1.phrase_match_count + 1.0 >= args_count as f64 {
            phrase_found = true;
        }
        if v.1.url.len() > longest_url {
            longest_url = v.1.url.len();
        }
        let mut wc = 1;
        for word in &arg_words {
            if v.1.url.to_lowercase().contains(word) {
                wc += 1;
            }
        }
        if wc > max_words {
            max_words = wc;
        }

        v.1.relavence = v.1.relavence(no_separator_flag);

        //debug!("{} -> {}/{}", v.1.url, v.1.words_contained, v.1.total_query_words);
        if v.1.relavence > max_relavence {
            max_relavence = v.1.relavence;
        }
    });

    results.iter_mut().for_each(|v| {
        let url_multiplier = (v.1.url.len() as f64 * 2.3) / (longest_url as f64);
        //debug!("url {} multiplier {}", v.1.url, url_multiplier);
        let mut word_multiplier = 1;
        for word in &arg_words {
            if v.1.url.to_lowercase().contains(word) {
                word_multiplier += 1;
            }
        }
        let word_multiplier = (word_multiplier as f64 * 4.0) / max_words as f64;

        let mut icky_found = 1.0;
        for (_, derank) in deranks.iter() {
            if v.1.url.contains(&derank.urlmatch) {
                if let Some(and) = &derank.and {
                    if !v.1.url.contains(and) {
                        continue;
                    }
                }
                if let Some(unless) = &derank.unless {
                    if v.1.url.contains(unless) {
                        continue;
                    }
                }
                icky_found *= derank.amount;
            }
        }

        v.1.relavence *= word_multiplier.powi(8).min((max_words as f64).powi(8));
        v.1.relavence /= url_multiplier.max(1.0).powi(4);
        v.1.relavence *= icky_found;
    });

    //results.iter_mut().for_each(|v| v.relevance /= max_occurs as f64);

    let mut results: Vec<SearchResult> = results.into_iter().map(|v| v.1).collect();

    max_relavence = 0.0;
    results.sort_by(|a, b| {
        if a.relavence > max_relavence {
            max_relavence = a.relavence;
        }
        if b.relavence > max_relavence {
            max_relavence = b.relavence;
        }
        b.relavence.total_cmp(&a.relavence)
    });

    let mut results_final = vec![];
    for result in results {
        results_final.push(searchservice::SearchResult {
            url: result.url,
            relevance: result.relavence,
            title: result.title,
            description: result.description,
        });
    }

    let time = chrono::Utc::now().signed_duration_since(start_t).num_milliseconds() as f64 / 1000.0;

    let blocked = blocked.lock().unwrap();

    Some(searchservice::SearchResponse {
        results: results_final,
        blocked: blocked.clone(),
        pagerank_time_seconds: pagerank_secs,
        total_query_seconds: time,
        max_relevance: max_relavence,
        exact_phrase_found: phrase_found,
    })
}
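
// Illustrative unit tests added during editing; they only exercise pure helpers
// (multiword_penalty and the single-word relavence path) and assume nothing
// beyond the code above.
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::{BTreeMap, BTreeSet};

    #[test]
    fn multiword_penalty_rewards_ordered_nearby_words() {
        // words in order and close together: penalty approaches 1.0 - gap
        assert!((multiword_penalty(0.2, 0.5) - 0.7).abs() < 1e-9);
        // second word before the first: negative contribution
        assert!(multiword_penalty(0.5, 0.2) < 0.0);
    }

    #[test]
    fn relavence_single_word_query_uses_only_field_scores() {
        let result = SearchResult {
            url: "https://example.com/".to_string(),
            title: None,
            description: None,
            url_contains_result: None,
            word_occurs: 1.0,
            authorword_occurs: 0.0,
            descriptionword_occurs: 0.0,
            keyword_occurs: 0.0,
            sitename_occurs: 0.0,
            linkword_occurs: 0.0,
            pagerank: 0.0,
            relavence: 0.0,
            words_contained: BTreeSet::new(),
            total_query_words: 1.0,
            words: BTreeMap::new(),
            closest_match: 0.0,
            phrase_match_count: 0.0,
            match_acc: 0.0,
            match_c: 0.0,
            highest_match: 0.0,
        };
        // with a single query word the multi-word factor is 1.0, so the score is
        // just the weighted field sum: 1.0 * 1.3 + 0.001 (pagerank floor) = 1.301
        assert!((result.relavence(false) - 1.301).abs() < 1e-9);
    }
}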