diff --git a/asklyphe-frontend/src/routes/search.rs b/asklyphe-frontend/src/routes/search.rs index c979062..48ed081 100644 --- a/asklyphe-frontend/src/routes/search.rs +++ b/asklyphe-frontend/src/routes/search.rs @@ -19,6 +19,7 @@ use crate::unit_converter::UnitConversion; use crate::wikipedia::WikipediaSummary; use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR}; use crate::spellcheck; +use crate::spellcheck::SpellCheckResults; use askama::Template; use asklyphe_common::nats; use asklyphe_common::nats::bingservice::{ @@ -69,6 +70,7 @@ pub struct Complications { disabled: bool, wikipedia: Option, unit_converter: Option, + spellcheck: Option, } pub async fn search( @@ -173,6 +175,7 @@ pub async fn search_js( complications.unit_converter = unit_comp; let corrections = spellcheck::check(&query); + complications.spellcheck = corrections; } else { complications.disabled = true; query = query.replace("-complications", ""); diff --git a/asklyphe-frontend/src/spellcheck.rs b/asklyphe-frontend/src/spellcheck.rs index 187cc32..06c8c48 100644 --- a/asklyphe-frontend/src/spellcheck.rs +++ b/asklyphe-frontend/src/spellcheck.rs @@ -1,36 +1,104 @@ use tracing::{debug, error}; use std::{cmp, mem}; -// use tokio::sync::{Mutex, RwLock}; +// TODO: cache distances of strings/substrings +// TODO: use binary search to find direct matches, and if that fails, calculate and cache the result in BTreeMap +// TODO: limit by number of words and word length, not max chars, and use code more like this for better readability & async: +/* + let words = prepare(query).split_whitespace() + .filter(|qword| qword.len() > 0) + .map(|qword| qword.to_lowercase()); + for word in words { // it might need to be while let Some(word) = words.next() + tokio::spawn(levenshtein_distance(...)) + } + */ -include!("./google-10000-english-no-swears.txt"); +include!("./words.txt"); // max distance before no alternatives are considered const MAX_DISTANCE: usize = 6; +// max input text size before spellcheck is not run. on my laptop 13,000 chars of input takes around 4 seconds so this should be fine +// update: got a larger word database and it doesn't take 4 seconds anymore lmao +const MAX_QUERY_SIZE: usize = 1024; -pub fn check(query: &String) -> Vec { +pub type SpellCheckResults = Vec; + +#[derive(Debug)] +pub struct SpellCheckResult { + pub orig: String, + pub correction: String, +} + +pub fn check(query: &String) -> Option { error!("Query: {}", query); - let distances = query.split(" ") + let query: &str = { + if query.len() > MAX_QUERY_SIZE { + error!("Query is too large to be spell checked, only checking first {} chars", MAX_QUERY_SIZE); + query.get(0..MAX_QUERY_SIZE).unwrap() + // return None; + } else { + query + } + }; + + let distances = prepare(query).split_whitespace() + .filter(|qword| qword.len() > 0) .map(|qword| qword.to_lowercase()) .map( - |qword| - KNOWN_WORDS.iter() - .map( - |kword| - (qword.clone(), kword, levenshtein_distance(&qword, kword))) - .map(|val| (val.0, val.1, val.2)) - .min_by(|a, b| a.2.cmp(&b.2)).unwrap() - ).filter(|(_, _, d)| *d > 0) - .for_each(|word| { - debug!("instead of '{}' did you mean '{}'? (distance of {})", word.0, word.1, word.2); - }); - // .collect::>(); + |qword| { + let mut exact_match = false; + KNOWN_WORDS.iter() + .map(|kword| kword.to_lowercase()) + .map( + |kword| + (qword.clone(), kword.clone(), levenshtein_distance(&qword, &kword))) + // totally isn't jank at all and is the best solution totally + .take_while(|val| if exact_match {false} else if val.2 == 0 {exact_match = true; true} else {true}) + // .map(|val| {error!("Val: {:?}", val); val}) + .min_by(|a, b| a.2.cmp(&b.2)).unwrap() + })/*.filter_map(|word| word)*/.filter(|word| word.2 > 0 && word.2 <= MAX_DISTANCE)/*.filter(|(_, _, dist)| *dist > 0 && *dist <= MAX_DISTANCE)*/.map(|word| SpellCheckResult{orig: word.0, correction: word.1.to_owned().to_owned()})/*.filter(|(_, _, d)| *d > 0)*/ + .map(|word| { + debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction); + word + }) + .collect::>(); + /*for word in &distances { + debug!("instead of '{}' did you mean '{}'? (distance of )", word.0, word.1/*, word.2*/); + }*/ - // distances - vec![] + if distances.len() > 0 { + Some(distances) + } else { + None + } + // vec![] +} + +// TODO: handle symbols better, probably with a regex +fn prepare(s: &str) -> String { + s.replace("\"", "") + .replace(",", " ") + .replace(":", " ") + .replace(".", " ") + .replace("/", " ") + .replace("&", " ") + .replace("!", " ") + .replace("?", " ") + /*.replace("'", "")*/ + .replace("0", "") + .replace("1", "") + .replace("2", "") + .replace("3", "") + .replace("4", "") + .replace("5", "") + .replace("6", "") + .replace("7", "") + .replace("8", "") + .replace("9", "") } // cost of 2 for add/remove, cost of 1 for replace fn levenshtein_distance(a: &str, other: &str) -> usize { + // debug!("Self: '{}', Other: '{}'", a, other); let mut dist = vec![0usize; other.len() + 1]; let mut dist_prev = vec![0usize; other.len() + 1]; @@ -50,11 +118,6 @@ fn levenshtein_distance(a: &str, other: &str) -> usize { cmp::min(dist_prev.get(j).unwrap(), dist_prev.get(j - 1).unwrap())); } } - // let temp = dist_prev; - // dist_prev = dist.clone(); - // dist = temp; - // dist_prev = dist; - // dist = vec![0usize; max_len]; mem::swap(&mut dist, &mut dist_prev); } dist_prev[other.len()]