get it working better

2025-05-09 23:21:14 +12:00 · 2025-05-09 23:21:14 +12:00 · 7e7079dd42
commit 7e7079dd42
parent b4112c311c
2 changed files with 89 additions and 23 deletions
--- a/asklyphe-frontend/src/routes/search.rs
+++ b/asklyphe-frontend/src/routes/search.rs
@@ -19,6 +19,7 @@ use crate::unit_converter::UnitConversion;
 use crate::wikipedia::WikipediaSummary;
 use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR};
 use crate::spellcheck;
+use crate::spellcheck::SpellCheckResults;
 use askama::Template;
 use asklyphe_common::nats;
 use asklyphe_common::nats::bingservice::{
@@ -69,6 +70,7 @@ pub struct Complications {
    disabled: bool,
    wikipedia: Option<WikipediaSummary>,
    unit_converter: Option<UnitConversion>,
+    spellcheck: Option<SpellCheckResults>,
 }

 pub async fn search(
@@ -173,6 +175,7 @@ pub async fn search_js(
            complications.unit_converter = unit_comp;

            let corrections = spellcheck::check(&query);
+            complications.spellcheck = corrections;
        } else {
            complications.disabled = true;
            query = query.replace("-complications", "");
--- a/asklyphe-frontend/src/spellcheck.rs
+++ b/asklyphe-frontend/src/spellcheck.rs
@@ -1,36 +1,104 @@
 use tracing::{debug, error};
 use std::{cmp, mem};
-// use tokio::sync::{Mutex, RwLock};
+// TODO: cache distances of strings/substrings
+// TODO: use binary search to find direct matches, and if that fails, calculate and cache the result in BTreeMap<word: String, closest_match: String>
+// TODO: limit by number of words and word length, not max chars, and use code more like this for better readability & async:
+/*
+	let words = prepare(query).split_whitespace()
+		.filter(|qword| qword.len() > 0)
+		.map(|qword| qword.to_lowercase());
+	for word in words { // it might need to be while let Some(word) = words.next()
+		tokio::spawn(levenshtein_distance(...))
+	}
+ */

-include!("./google-10000-english-no-swears.txt");
+include!("./words.txt");

 // max distance before no alternatives are considered
 const MAX_DISTANCE: usize = 6;
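+// max query size in bytes: longer queries are not rejected, only their first MAX_QUERY_SIZE bytes are spell checked (see check()).
+// NOTE: the cut is a byte offset, so it has to land on a UTF-8 char boundary or the slice in check() fails.
+// on my laptop 13,000 chars of input takes around 4 seconds so this should be fine
+// update: got a larger word database and it doesn't take 4 seconds anymore lmao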
+const MAX_QUERY_SIZE: usize = 1024;

-pub fn check(query: &String) -> Vec<String> {
+pub type SpellCheckResults = Vec<SpellCheckResult>;
+
+#[derive(Debug)]
+pub struct SpellCheckResult {
+	pub orig: String,
+	pub correction: String,
+}
+
+pub fn check(query: &String) -> Option<SpellCheckResults> {
 	error!("Query: {}", query);
-	let distances = query.split(" ")
+	let query: &str = {
+		if query.len() > MAX_QUERY_SIZE {
+			error!("Query is too large to be spell checked, only checking first {} chars", MAX_QUERY_SIZE);
+			query.get(0..MAX_QUERY_SIZE).unwrap()
+			// return None;
+		} else {
+			query
+		}
+	};
+
+	let distances = prepare(query).split_whitespace()
+		.filter(|qword| qword.len() > 0)
 		.map(|qword| qword.to_lowercase())
 		.map(
-			|qword|
-			KNOWN_WORDS.iter()
-				.map(
-					|kword|
-					(qword.clone(), kword, levenshtein_distance(&qword, kword)))
-						.map(|val| (val.0, val.1, val.2))
-						.min_by(|a, b| a.2.cmp(&b.2)).unwrap()
-		).filter(|(_, _, d)| *d > 0)
-		.for_each(|word| {
-			debug!("instead of '{}' did you mean '{}'? (distance of {})", word.0, word.1, word.2);
-		});
-		// .collect::<Vec<_>>();
+			|qword| {
+				let mut exact_match = false;
+				KNOWN_WORDS.iter()
+					.map(|kword| kword.to_lowercase())
+					.map(
+						|kword|
+						(qword.clone(), kword.clone(), levenshtein_distance(&qword, &kword)))
+							// jank early exit: keep yielding candidates up to and including the first exact match (distance 0), then stop scanning the word list so min_by picks that match
+							.take_while(|val| if exact_match {false} else if val.2 == 0 {exact_match = true; true} else {true})
+							// .map(|val| {error!("Val: {:?}", val); val})
+							.min_by(|a, b| a.2.cmp(&b.2)).unwrap()
+			})/*.filter_map(|word| word)*/.filter(|word| word.2 > 0 && word.2 <= MAX_DISTANCE)/*.filter(|(_, _, dist)| *dist > 0 && *dist <= MAX_DISTANCE)*/.map(|word| SpellCheckResult{orig: word.0, correction: word.1.to_owned().to_owned()})/*.filter(|(_, _, d)| *d > 0)*/
+		.map(|word| {
+			debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
+			word
+		})
+		.collect::<Vec<_>>();
+	/*for word in &distances {
+		debug!("instead of '{}' did you mean '{}'? (distance of )", word.0, word.1/*, word.2*/);
+	}*/

-	// distances
-	vec![]
+	if distances.len() > 0 {
+		Some(distances)
+	} else {
+		None
+	}
+	// vec![]
+}
+
+// TODO: handle symbols better, probably with a regex
+fn prepare(s: &str) -> String {
+	s.replace("\"", "")
+		.replace(",", " ")
+		.replace(":", " ")
+		.replace(".", " ")
+		.replace("/", " ")
+		.replace("&", " ")
+		.replace("!", " ")
+		.replace("?", " ")
+		/*.replace("'", "")*/
+		.replace("0", "")
+		.replace("1", "")
+		.replace("2", "")
+		.replace("3", "")
+		.replace("4", "")
+		.replace("5", "")
+		.replace("6", "")
+		.replace("7", "")
+		.replace("8", "")
+		.replace("9", "")
 }

 // cost of 2 for add/remove, cost of 1 for replace
 fn levenshtein_distance(a: &str, other: &str) -> usize {
+	// debug!("Self: '{}', Other: '{}'", a, other);
 	let mut dist = vec![0usize; other.len() + 1];
 	let mut dist_prev = vec![0usize; other.len() + 1];

@@ -50,11 +118,6 @@ fn levenshtein_distance(a: &str, other: &str) -> usize {
 					cmp::min(dist_prev.get(j).unwrap(), dist_prev.get(j - 1).unwrap()));
 			}
 		}
-		// let temp = dist_prev;
-		// dist_prev = dist.clone();
-		// dist = temp;
-		// dist_prev = dist;
-		// dist = vec![0usize; max_len];
 		mem::swap(&mut dist, &mut dist_prev);
 	}
 	dist_prev[other.len()]