Compare commits

...

2 commits

Author SHA1 Message Date
7e7079dd42 get it working better 2025-05-09 23:32:41 +12:00
b4112c311c initial spellcheck implementation 2025-05-09 23:32:15 +12:00
3 changed files with 185 additions and 2 deletions

View file

@ -14,6 +14,7 @@
pub mod searchbot; pub mod searchbot;
pub mod wikipedia; pub mod wikipedia;
pub mod unit_converter; pub mod unit_converter;
pub mod spellcheck;
pub mod routes; pub mod routes;
use std::{env, process}; use std::{env, process};

View file

@ -18,6 +18,8 @@ use crate::unit_converter;
use crate::unit_converter::UnitConversion; use crate::unit_converter::UnitConversion;
use crate::wikipedia::WikipediaSummary; use crate::wikipedia::WikipediaSummary;
use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR}; use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR};
use crate::spellcheck;
use crate::spellcheck::SpellCheckResults;
use askama::Template; use askama::Template;
use asklyphe_common::nats; use asklyphe_common::nats;
use asklyphe_common::nats::bingservice::{ use asklyphe_common::nats::bingservice::{
@ -68,6 +70,7 @@ pub struct Complications {
disabled: bool, disabled: bool,
wikipedia: Option<WikipediaSummary>, wikipedia: Option<WikipediaSummary>,
unit_converter: Option<UnitConversion>, unit_converter: Option<UnitConversion>,
spellcheck: Option<SpellCheckResults>,
} }
pub async fn search( pub async fn search(
@ -158,18 +161,21 @@ pub async fn search_js(
let mut complications = Complications::default(); let mut complications = Complications::default();
// todo: better way of specifying that user doesn't want complications // todo: better way of specifying that user doesn't want complications
if !query.contains("-complications") { if !query.contains("-complications") {
let mut wikiquery = query.clone().to_lowercase(); /*let mut wikiquery = query.clone().to_lowercase();
wikiquery.retain(|c| c.is_alphanumeric() || c.is_ascii_whitespace()); wikiquery.retain(|c| c.is_alphanumeric() || c.is_ascii_whitespace());
wikiquery = wikiquery.replace(' ', "%20"); wikiquery = wikiquery.replace(' ', "%20");
// todo: proper url escaping // todo: proper url escaping
let wikipedia_comp = let wikipedia_comp =
tokio::spawn(async move { wikipedia::get_wikipedia_page(&wikiquery, 20).await }); tokio::spawn(async move { wikipedia::get_wikipedia_page(&wikiquery, 20).await });
complications.wikipedia = wikipedia_comp.await.unwrap_or_default(); complications.wikipedia = wikipedia_comp.await.unwrap_or_default();*/
let mut unit_query = query.clone().to_lowercase(); let mut unit_query = query.clone().to_lowercase();
unit_query = unit_query.replace("metre", "meter"); unit_query = unit_query.replace("metre", "meter");
let unit_comp = unit_converter::convert_unit(&unit_query); let unit_comp = unit_converter::convert_unit(&unit_query);
complications.unit_converter = unit_comp; complications.unit_converter = unit_comp;
let corrections = spellcheck::check(&query);
complications.spellcheck = corrections;
} else { } else {
complications.disabled = true; complications.disabled = true;
query = query.replace("-complications", ""); query = query.replace("-complications", "");

View file

@ -0,0 +1,176 @@
use tracing::{debug, error};
use std::{cmp, mem};
// TODO: cache distances of strings/substrings
// TODO: use binary search to find direct matches, and if that fails, calculate and cache the result in BTreeMap<word: String, closest_match: String>
// TODO: limit by number of words and word length, not max chars, and use code more like this for better readability & async:
/*
let words = prepare(query).split_whitespace()
.filter(|qword| qword.len() > 0)
.map(|qword| qword.to_lowercase());
for word in words { // it might need to be while let Some(word) = words.next()
tokio::spawn(levenshtein_distance(...))
}
*/
// Compile-time word list; presumably defines `KNOWN_WORDS` (the slice the
// checker iterates below) — generated file, not committed alongside this one.
include!("./words.txt");
// max distance before no alternatives are considered
const MAX_DISTANCE: usize = 6;
// max input size (in bytes) before the query is truncated for spell checking.
// on my laptop 13,000 chars of input takes around 4 seconds so this should be fine
// update: got a larger word database and it doesn't take 4 seconds anymore lmao
const MAX_QUERY_SIZE: usize = 1024;
/// All corrections produced for one query — one entry per misspelled word.
pub type SpellCheckResults = Vec<SpellCheckResult>;
/// A single suggested spelling correction.
#[derive(Debug)]
pub struct SpellCheckResult {
    /// The word as it appeared in the prepared, lowercased query.
    pub orig: String,
    /// The closest known word — the suggested replacement.
    pub correction: String,
}
/// Spell-checks `query` against `KNOWN_WORDS` and returns one suggested
/// correction per misspelled word, or `None` when nothing needs correcting.
///
/// A word that matches a known word exactly (case-insensitively) produces no
/// suggestion; otherwise the closest known word within `MAX_DISTANCE` edits
/// is suggested. Queries longer than `MAX_QUERY_SIZE` bytes are truncated.
pub fn check(query: &str) -> Option<SpellCheckResults> {
    let query: &str = if query.len() > MAX_QUERY_SIZE {
        error!("Query is too large to be spell checked, only checking first {} chars", MAX_QUERY_SIZE);
        // Back off to a char boundary: slicing at a raw byte offset would
        // panic if byte MAX_QUERY_SIZE falls inside a multi-byte UTF-8 char.
        let mut end = MAX_QUERY_SIZE;
        while !query.is_char_boundary(end) {
            end -= 1;
        }
        &query[..end]
    } else {
        query
    };
    let corrections: SpellCheckResults = prepare(query)
        .split_whitespace()
        .map(|qword| qword.to_lowercase())
        .filter_map(|qword| {
            // Track the closest known word; stop early on an exact match.
            let mut best: Option<(String, usize)> = None;
            for kword in KNOWN_WORDS.iter().map(|k| k.to_lowercase()) {
                let dist = levenshtein_distance(&qword, &kword);
                if dist == 0 {
                    // Correctly spelled — no suggestion for this word.
                    return None;
                }
                if best.as_ref().map_or(true, |(_, d)| dist < *d) {
                    best = Some((kword, dist));
                }
            }
            best.filter(|(_, dist)| *dist <= MAX_DISTANCE)
                .map(|(correction, _)| SpellCheckResult { orig: qword, correction })
        })
        .inspect(|word| {
            debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
        })
        .collect();
    if corrections.is_empty() {
        None
    } else {
        Some(corrections)
    }
}
// TODO: handle symbols better, probably with a regex
/// Normalizes raw query text for spell checking: strips double quotes and
/// ASCII digits, and turns common punctuation into spaces so the result can
/// be split on whitespace. Apostrophes are kept so contractions survive.
fn prepare(s: &str) -> String {
    // Single pass over the chars instead of a chain of `replace` calls,
    // each of which allocated a fresh String.
    s.chars()
        .filter_map(|c| match c {
            '"' => None,
            '0'..='9' => None,
            ',' | ':' | '.' | '/' | '&' | '!' | '?' => Some(' '),
            other => Some(other),
        })
        .collect()
}
/// Computes the Levenshtein edit distance between `a` and `other`.
/// All three edit operations (insert, delete, substitute) cost 1.
///
/// Uses the classic two-row dynamic-programming formulation, so memory is
/// O(other.len()) instead of O(a.len() * other.len()).
///
/// Comparison is byte-wise: exact for ASCII input; for multi-byte UTF-8 it
/// measures distance over bytes rather than chars (the previous char-range
/// slicing panicked outright on non-ASCII input).
fn levenshtein_distance(a: &str, other: &str) -> usize {
    let a = a.as_bytes();
    let other = other.as_bytes();
    // dist_prev holds DP row i-1 (initialized to row 0: distance from "").
    let mut dist_prev: Vec<usize> = (0..=other.len()).collect();
    let mut dist = vec![0usize; other.len() + 1];
    for i in 1..=a.len() {
        dist[0] = i;
        for j in 1..=other.len() {
            dist[j] = if a[i - 1] == other[j - 1] {
                // Matching bytes: no edit, carry the diagonal.
                dist_prev[j - 1]
            } else {
                // 1 + min(substitute, delete, insert).
                1 + cmp::min(dist[j - 1], cmp::min(dist_prev[j], dist_prev[j - 1]))
            };
        }
        mem::swap(&mut dist, &mut dist_prev);
    }
    dist_prev[other.len()]
}