weight levenshtein distance calculation to prefer replacement over addition/removal

whoops
add instructions
2025-05-10 02:28:21 +12:00 · 2025-05-10 02:06:53 +12:00 · 2025-05-10 01:55:53 +12:00 · 2025-05-10 01:46:36 +12:00 · 2025-05-09 23:32:41 +12:00 · 2025-05-09 23:32:15 +12:00
3 changed files with 267 additions and 2 deletions
--- a/asklyphe-frontend/src/main.rs
+++ b/asklyphe-frontend/src/main.rs
@ -14,6 +14,7 @@
 pub mod searchbot;
 pub mod wikipedia;
 pub mod unit_converter;
+pub mod spellcheck;
 pub mod routes;

 use std::{env, process};
--- a/asklyphe-frontend/src/routes/search.rs
+++ b/asklyphe-frontend/src/routes/search.rs
@ -18,6 +18,8 @@ use crate::unit_converter;
 use crate::unit_converter::UnitConversion;
 use crate::wikipedia::WikipediaSummary;
 use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR};
+use crate::spellcheck;
+use crate::spellcheck::SpellCheckResults;
 use askama::Template;
 use asklyphe_common::nats;
 use asklyphe_common::nats::bingservice::{
@ -68,6 +70,7 @@ pub struct Complications {
    disabled: bool,
    wikipedia: Option<WikipediaSummary>,
    unit_converter: Option<UnitConversion>,
+    spellcheck: Option<SpellCheckResults>,
 }

 pub async fn search(
@ -158,18 +161,21 @@ pub async fn search_js(
        let mut complications = Complications::default();
        // todo: better way of specifying that user doesn't want complications
        if !query.contains("-complications") {
-            let mut wikiquery = query.clone().to_lowercase();
+            /*let mut wikiquery = query.clone().to_lowercase();
            wikiquery.retain(|c| c.is_alphanumeric() || c.is_ascii_whitespace());
            wikiquery = wikiquery.replace(' ', "%20");
            // todo: proper url escaping
            let wikipedia_comp =
                tokio::spawn(async move { wikipedia::get_wikipedia_page(&wikiquery, 20).await });
-            complications.wikipedia = wikipedia_comp.await.unwrap_or_default();
+            complications.wikipedia = wikipedia_comp.await.unwrap_or_default();*/

            let mut unit_query = query.clone().to_lowercase();
            unit_query = unit_query.replace("metre", "meter");
            let unit_comp = unit_converter::convert_unit(&unit_query);
            complications.unit_converter = unit_comp;
+
+            let corrections = spellcheck::check(&query);
+            complications.spellcheck = corrections;
        } else {
            complications.disabled = true;
            query = query.replace("-complications", "");
--- a/asklyphe-frontend/src/spellcheck.rs
+++ b/asklyphe-frontend/src/spellcheck.rs
@ -0,0 +1,258 @@
+use once_cell::sync::Lazy;
+use tracing::{debug, error};
+use std::{cmp, mem};
+use std::collections::BTreeMap;
+use std::sync::Mutex;
+
+// how to generate words.txt:
+// clone https://github.com/en-wl/wordlist && cd wordlist
+// make
+// ./scowl wl --deaccent > words0.txt
+// filtered with this python script:
+// -----------------------------------
+// with open("words0.txt", "r") as f:
+// 	out = []
+// 	for line in f:
+// 		line = line.lower()
+// 		if not line in out:
+// 			out.append(line)
+// 	out.sort()
+// 	with open("words.txt", "w") as out_file:
+// 		for line in out:
+// 			out_file.write(f'{line}')
+// ------------------------------------
+// then use regex or similar to enclose every line in quotes and add comma, then add 'static KNOWN_WORDS: &[&str] = &[' to the start and '];' to the end
+include!("./words.txt");
+
+// a cache of misspelled words and the closest match in the database
+// a value of None means the word was looked up before and nothing within MAX_DISTANCE was found
+// NOTE(review): nothing in this file ever evicts entries, so this grows with every distinct unknown word - confirm that is acceptable
+static MATCH_CACHE: Lazy<Mutex<BTreeMap<String, Option<&str>>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));
+
+// max distance before no alternatives are considered
+const MAX_DISTANCE: usize = 6;
+// max number of words in a query before check() reports it as too many words to spell check
+// (this is a word count, not a character count)
+// timing history: on my laptop 13,000 chars of input takes around 4 seconds so this should be fine
+// update: got a larger word database and it doesn't take 4 seconds anymore lmao
+// update 2: added binary search & caching and now 50000 chars takes ~2-4 seconds
+const MAX_QUERY_WORDS: usize = 512;
+// Not really a huge issue, just used to hopefully reduce the allocations made in levenshtein_distance & provide minor performance improvements
+// not needed for now
+// const MAX_WORD_SIZE: usize = 64;
+
+pub type SpellCheckResults = Vec<SpellCheckResult>;
+
+/// A single spelling suggestion: a query word that was not found in the word list,
+/// paired with the closest known word.
+#[derive(Debug)]
+pub struct SpellCheckResult {
+	/// the unknown word, taken from the query after it has been passed through `prepare`
+	pub orig: String,
+	/// the suggested replacement, taken from the static word list
+	pub correction: &'static str,
+}
+
+/// Spell checks `query` against the known word list.
+///
+/// The query is normalised with `prepare`, split on whitespace, and every word that is
+/// not an exact entry of `KNOWN_WORDS` is paired with its closest known word.
+///
+/// Returns `None` when there is nothing to suggest, or when the query contains more than
+/// `MAX_QUERY_WORDS` words (such queries are too expensive to check).
+/// Each distinct misspelled word is reported at most once.
+pub fn check(query: &String) -> Option<SpellCheckResults> {
+	debug!("Query: {}", query);
+
+	// TODO: look into how 'wc -w' counts words and copy how it splits things
+	let query_flattened = prepare(query);
+	// split_whitespace never yields empty strings, so no extra filtering is needed
+	let words = query_flattened.split_whitespace().collect::<Vec<_>>();
+
+	debug!("Words in query: {}", words.len());
+
+	if words.len() > MAX_QUERY_WORDS {
+		error!("{} is too many words in query to spell check", words.len());
+		return None;
+	}
+
+	let mut distances: SpellCheckResults = vec![];
+	for qword in words {
+		if KNOWN_WORDS.binary_search(&qword).is_ok() {
+			continue;
+		}
+		// don't report the same misspelled word twice
+		if distances.iter().any(|result| result.orig == qword) {
+			continue;
+		}
+
+		// Only hold the lock for the lookup itself: the scan below is slow and must not block other requests.
+		// A poisoned lock is recovered, the cache only ever holds complete entries.
+		let cached = MATCH_CACHE
+			.lock()
+			.unwrap_or_else(|poisoned| poisoned.into_inner())
+			.get(qword)
+			.copied();
+
+		let suggestion = match cached {
+			Some(suggestion) => suggestion,
+			None => {
+				let suggestion = closest_known_word(qword);
+				// even if there is no close enough match, cache it anyway so that it doesn't have to be looked up every time
+				MATCH_CACHE
+					.lock()
+					.unwrap_or_else(|poisoned| poisoned.into_inner())
+					.insert(qword.to_owned(), suggestion);
+				suggestion
+			}
+		};
+
+		// We don't need to tell the user if there is no suggestion for an unknown word
+		if let Some(correction) = suggestion {
+			distances.push(SpellCheckResult { orig: qword.to_owned(), correction });
+		}
+	}
+
+	debug!("Spell check results:");
+	for word in &distances {
+		debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
+	}
+
+	if distances.is_empty() {
+		None
+	} else {
+		Some(distances)
+	}
+}
+
+// Scans the whole word list for the entry closest to `qword`.
+// Returns None if the list is empty or the best candidate is further away than MAX_DISTANCE.
+fn closest_known_word(qword: &str) -> Option<&'static str> {
+	let closest = KNOWN_WORDS
+		.iter()
+		.map(|&kword| (kword, levenshtein_distance(qword, kword)))
+		.min_by_key(|&(_, distance)| distance);
+
+	match closest {
+		Some((_, 0)) => {
+			// the caller already ruled out exact matches, so this means the binary search missed one;
+			// report it instead of panicking inside a request
+			error!("Found exact match not caught by binary search, is the word database properly sorted?");
+			None
+		}
+		Some((kword, distance)) if distance <= MAX_DISTANCE => Some(kword),
+		_ => None,
+	}
+}
+
+// TODO: handle symbols better, probably with a regex
+/// Normalises a raw query for spell checking: double quotes and ASCII digits are dropped,
+/// the separators `, : . / & ! ?` become spaces, and the result is lowercased.
+fn prepare(s: &str) -> String {
+	let mut cleaned = String::with_capacity(s.len());
+	for ch in s.chars() {
+		match ch {
+			// removed outright
+			'"' | '0'..='9' => {}
+			// treated as word separators
+			',' | ':' | '.' | '/' | '&' | '!' | '?' => cleaned.push(' '),
+			// apostrophes and everything else are kept as they are
+			other => cleaned.push(other),
+		}
+	}
+	// lowercase the whole string at once so context-dependent lowercasing is unchanged
+	cleaned.to_lowercase()
+}
+
+/// Weighted Levenshtein distance between `a` and `other`.
+///
+/// Adding or removing a character costs 2 and replacing one costs 1, so that a
+/// same-length correction is preferred over a shorter/longer one
+/// (e.g. "honex" is closer to "honey" than to "hone").
+///
+/// Strings are compared character by character, so non-ASCII input is handled
+/// without slicing in the middle of a multi-byte character.
+fn levenshtein_distance(a: &str, other: &str) -> usize {
+	const ADD_REMOVE_COST: usize = 2;
+	const REPLACE_COST: usize = 1;
+
+	let other_chars: Vec<char> = other.chars().collect();
+	let width = other_chars.len();
+
+	// distance from the empty prefix of `a` to every prefix of `other`: that many additions
+	let mut dist_prev: Vec<usize> = (0..=width).map(|j| j * ADD_REMOVE_COST).collect();
+	let mut dist = vec![0usize; width + 1];
+
+	for (i, a_char) in a.chars().enumerate() {
+		// distance from the first i + 1 chars of `a` to the empty prefix of `other`: that many removals
+		dist[0] = (i + 1) * ADD_REMOVE_COST;
+
+		for (j, &other_char) in other_chars.iter().enumerate() {
+			dist[j + 1] = if a_char == other_char {
+				dist_prev[j]
+			} else {
+				let add = dist[j] + ADD_REMOVE_COST;
+				let remove = dist_prev[j + 1] + ADD_REMOVE_COST;
+				let replace = dist_prev[j] + REPLACE_COST;
+				cmp::min(add, cmp::min(remove, replace))
+			};
+		}
+		mem::swap(&mut dist, &mut dist_prev);
+	}
+	// after the final swap the last computed row lives in dist_prev
+	dist_prev[width]
+}
+
Author	SHA1	Message	Date
Book-reader	b91674678e	weight levenshtein distance calculation to prefer replacement over addition/removal	2025-05-10 02:28:21 +12:00
Book-reader	26f5196138	whoops	2025-05-10 02:06:53 +12:00
Book-reader	623b068cef	add instructions	2025-05-10 01:55:53 +12:00
Book-reader	270698c762	implement caching spellcheck results & other stuff	2025-05-10 01:46:36 +12:00
Book-reader	7e7079dd42	get it working better	2025-05-09 23:32:41 +12:00
Book-reader	b4112c311c	initial spellcheck implementation	2025-05-09 23:32:15 +12:00