From b4112c311ce9f54a8982bcdfa95105a67bf23910 Mon Sep 17 00:00:00 2001 From: Book-reader Date: Fri, 9 May 2025 09:53:28 +1200 Subject: [PATCH] initial spellcheck implementation --- asklyphe-frontend/src/main.rs | 1 + asklyphe-frontend/src/routes/search.rs | 7 +- asklyphe-frontend/src/spellcheck.rs | 113 +++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 asklyphe-frontend/src/spellcheck.rs diff --git a/asklyphe-frontend/src/main.rs b/asklyphe-frontend/src/main.rs index 2b87121..6e5b782 100644 --- a/asklyphe-frontend/src/main.rs +++ b/asklyphe-frontend/src/main.rs @@ -14,6 +14,7 @@ pub mod searchbot; pub mod wikipedia; pub mod unit_converter; +pub mod spellcheck; pub mod routes; use std::{env, process}; diff --git a/asklyphe-frontend/src/routes/search.rs b/asklyphe-frontend/src/routes/search.rs index f1d43b7..c979062 100644 --- a/asklyphe-frontend/src/routes/search.rs +++ b/asklyphe-frontend/src/routes/search.rs @@ -18,6 +18,7 @@ use crate::unit_converter; use crate::unit_converter::UnitConversion; use crate::wikipedia::WikipediaSummary; use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR}; +use crate::spellcheck; use askama::Template; use asklyphe_common::nats; use asklyphe_common::nats::bingservice::{ @@ -158,18 +159,20 @@ pub async fn search_js( let mut complications = Complications::default(); // todo: better way of specifying that user doesn't want complications if !query.contains("-complications") { - let mut wikiquery = query.clone().to_lowercase(); + /*let mut wikiquery = query.clone().to_lowercase(); wikiquery.retain(|c| c.is_alphanumeric() || c.is_ascii_whitespace()); wikiquery = wikiquery.replace(' ', "%20"); // todo: proper url escaping let wikipedia_comp = tokio::spawn(async move { wikipedia::get_wikipedia_page(&wikiquery, 20).await }); - complications.wikipedia = wikipedia_comp.await.unwrap_or_default(); + complications.wikipedia = wikipedia_comp.await.unwrap_or_default();*/ let mut 
unit_query = query.clone().to_lowercase(); unit_query = unit_query.replace("metre", "meter"); let unit_comp = unit_converter::convert_unit(&unit_query); complications.unit_converter = unit_comp; + + let corrections = spellcheck::check(&query); } else { complications.disabled = true; query = query.replace("-complications", ""); diff --git a/asklyphe-frontend/src/spellcheck.rs b/asklyphe-frontend/src/spellcheck.rs new file mode 100644 index 0000000..187cc32 --- /dev/null +++ b/asklyphe-frontend/src/spellcheck.rs @@ -0,0 +1,113 @@ +use tracing::{debug, error}; +use std::{cmp, mem}; +// use tokio::sync::{Mutex, RwLock}; + +include!("./google-10000-english-no-swears.txt"); + +// max distance before no alternatives are considered +const MAX_DISTANCE: usize = 6; + +pub fn check(query: &String) -> Vec { + error!("Query: {}", query); + let distances = query.split(" ") + .map(|qword| qword.to_lowercase()) + .map( + |qword| + KNOWN_WORDS.iter() + .map( + |kword| + (qword.clone(), kword, levenshtein_distance(&qword, kword))) + .map(|val| (val.0, val.1, val.2)) + .min_by(|a, b| a.2.cmp(&b.2)).unwrap() + ).filter(|(_, _, d)| *d > 0) + .for_each(|word| { + debug!("instead of '{}' did you mean '{}'? 
/// Computes the Levenshtein edit distance between `a` and `other`.
///
/// Insertions, deletions and substitutions each cost 1. The strings are
/// compared `char` by `char` (Unicode scalar values), so input containing
/// multi-byte UTF-8 characters is handled without panicking.
///
/// Only two rows of the dynamic-programming table are kept at any time, so
/// the extra memory used is proportional to the length of `other`.
fn levenshtein_distance(a: &str, other: &str) -> usize {
    let other_chars: Vec<char> = other.chars().collect();
    let width = other_chars.len();

    // Distance from the empty prefix of `a` to each prefix of `other`:
    // j insertions.
    let mut prev_row: Vec<usize> = (0..=width).collect();
    let mut row = vec![0usize; width + 1];

    for (i, a_char) in a.chars().enumerate() {
        // Distance from the first i + 1 chars of `a` to the empty prefix of
        // `other`: i + 1 deletions.
        row[0] = i + 1;

        for (j, &other_char) in other_chars.iter().enumerate() {
            row[j + 1] = if a_char == other_char {
                // Matching characters cost nothing extra.
                prev_row[j]
            } else {
                // Cheapest of insertion, deletion and substitution.
                1 + row[j].min(prev_row[j + 1]).min(prev_row[j])
            };
        }

        // The row just filled becomes the "previous" row of the next pass.
        std::mem::swap(&mut row, &mut prev_row);
    }

    prev_row[width]
}