forked from asklyphe-public/asklyphe
get it working better
This commit is contained in:
parent b4112c311c
commit 7e7079dd42
2 changed files with 89 additions and 23 deletions
@@ -19,6 +19,7 @@ use crate::unit_converter::UnitConversion;
use crate::wikipedia::WikipediaSummary;
use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR};
use crate::spellcheck;
use crate::spellcheck::SpellCheckResults;
use askama::Template;
use asklyphe_common::nats;
use asklyphe_common::nats::bingservice::{
@@ -69,6 +70,7 @@ pub struct Complications {
    disabled: bool,
    wikipedia: Option<WikipediaSummary>,
    unit_converter: Option<UnitConversion>,
    spellcheck: Option<SpellCheckResults>,
}

pub async fn search(
@@ -173,6 +175,7 @@ pub async fn search_js(
        complications.unit_converter = unit_comp;

        let corrections = spellcheck::check(&query);
        complications.spellcheck = corrections;
    } else {
        complications.disabled = true;
        query = query.replace("-complications", "");
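The new spellcheck field mirrors the other complications: spellcheck::check returns an Option<SpellCheckResults> (defined in the second file below), so rendering code can branch on it directly. A minimal consumer might look like this; it is an illustrative sketch, not part of the commit, and the variable names are assumptions:

    if let Some(results) = &complications.spellcheck {
        for r in results {
            // each result pairs an original query word with its suggested correction
            println!("did you mean '{}' instead of '{}'?", r.correction, r.orig);
        }
    }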

@@ -1,36 +1,104 @@
use tracing::{debug, error};
use std::{cmp, mem};
// use tokio::sync::{Mutex, RwLock};
// TODO: cache distances of strings/substrings
// TODO: use binary search to find direct matches, and if that fails, calculate and cache the result in BTreeMap<word: String, closest_match: String>
// TODO: limit by number of words and word length, not max chars, and use code more like this for better readability & async:
/*
let words = prepare(query).split_whitespace()
    .filter(|qword| qword.len() > 0)
    .map(|qword| qword.to_lowercase());
for word in words { // it might need to be while let Some(word) = words.next()
    tokio::spawn(levenshtein_distance(...))
}
*/
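The binary-search/BTreeMap TODO above could take roughly the shape below. This is an illustrative sketch, not part of the commit: it assumes KNOWN_WORDS is a sorted slice of string slices (so binary_search can detect exact matches), and the function and cache names are made up for the example.

    use std::collections::BTreeMap;

    // Sketch of the "binary search + BTreeMap" TODO: exact matches short-circuit,
    // anything else is computed once against KNOWN_WORDS and remembered.
    fn closest_match<'a>(word: &str, cache: &'a mut BTreeMap<String, String>) -> Option<&'a str> {
        if KNOWN_WORDS.binary_search(&word).is_ok() {
            return None; // the word is spelled correctly, nothing to suggest
        }
        let correction = cache.entry(word.to_string()).or_insert_with(|| {
            KNOWN_WORDS
                .iter()
                .min_by_key(|kword| levenshtein_distance(word, kword))
                .map(|kword| kword.to_string())
                .unwrap_or_default()
        });
        Some(correction.as_str())
    }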

include!("./google-10000-english-no-swears.txt");
include!("./words.txt");

// max distance before no alternatives are considered
const MAX_DISTANCE: usize = 6;
// max input text size before spellcheck is not run. on my laptop 13,000 chars of input takes around 4 seconds so this should be fine
// update: got a larger word database and it doesn't take 4 seconds anymore lmao
const MAX_QUERY_SIZE: usize = 1024;

pub fn check(query: &String) -> Vec<String> {
pub type SpellCheckResults = Vec<SpellCheckResult>;

#[derive(Debug)]
pub struct SpellCheckResult {
    pub orig: String,
    pub correction: String,
}

pub fn check(query: &String) -> Option<SpellCheckResults> {
    error!("Query: {}", query);
    let distances = query.split(" ")
    let query: &str = {
        if query.len() > MAX_QUERY_SIZE {
            error!("Query is too large to be spell checked, only checking first {} chars", MAX_QUERY_SIZE);
            query.get(0..MAX_QUERY_SIZE).unwrap()
            // return None;
        } else {
            query
        }
    };

    let distances = prepare(query).split_whitespace()
        .filter(|qword| qword.len() > 0)
        .map(|qword| qword.to_lowercase())
        .map(
            |qword|
                KNOWN_WORDS.iter()
                    .map(
                        |kword|
                            (qword.clone(), kword, levenshtein_distance(&qword, kword)))
                    .map(|val| (val.0, val.1, val.2))
                    .min_by(|a, b| a.2.cmp(&b.2)).unwrap()
        ).filter(|(_, _, d)| *d > 0)
        .for_each(|word| {
            debug!("instead of '{}' did you mean '{}'? (distance of {})", word.0, word.1, word.2);
        });
        // .collect::<Vec<_>>();
            |qword| {
                let mut exact_match = false;
                KNOWN_WORDS.iter()
                    .map(|kword| kword.to_lowercase())
                    .map(
                        |kword|
                            (qword.clone(), kword.clone(), levenshtein_distance(&qword, &kword)))
                    // totally isn't jank at all and is the best solution totally
                    .take_while(|val| if exact_match {false} else if val.2 == 0 {exact_match = true; true} else {true})
                    // .map(|val| {error!("Val: {:?}", val); val})
                    .min_by(|a, b| a.2.cmp(&b.2)).unwrap()
            })/*.filter_map(|word| word)*/.filter(|word| word.2 > 0 && word.2 <= MAX_DISTANCE)/*.filter(|(_, _, dist)| *dist > 0 && *dist <= MAX_DISTANCE)*/.map(|word| SpellCheckResult{orig: word.0, correction: word.1.to_owned().to_owned()})/*.filter(|(_, _, d)| *d > 0)*/
        .map(|word| {
            debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
            word
        })
        .collect::<Vec<_>>();
    /*for word in &distances {
        debug!("instead of '{}' did you mean '{}'? (distance of )", word.0, word.1/*, word.2*/);
    }*/

    // distances
    vec![]
    if distances.len() > 0 {
        Some(distances)
    } else {
        None
    }
    // vec![]
}
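As rewritten, check returns None either when every query word already matches a known word exactly or when no suggestion survives the MAX_DISTANCE cut. A direct call would look like the following; the example query and output are illustrative, not taken from the commit:

    let suggestions = spellcheck::check(&"recieve pacakge".to_string());
    // e.g. Some([SpellCheckResult { orig: "recieve", correction: "receive" }, ...])
    // and None when every word in the query is already spelled correctly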

// TODO: handle symbols better, probably with a regex
fn prepare(s: &str) -> String {
    s.replace("\"", "")
        .replace(",", " ")
        .replace(":", " ")
        .replace(".", " ")
        .replace("/", " ")
        .replace("&", " ")
        .replace("!", " ")
        .replace("?", " ")
        /*.replace("'", "")*/
        .replace("0", "")
        .replace("1", "")
        .replace("2", "")
        .replace("3", "")
        .replace("4", "")
        .replace("5", "")
        .replace("6", "")
        .replace("7", "")
        .replace("8", "")
        .replace("9", "")
}
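The TODO above the function points at a regex-based cleanup. One possible shape, assuming the regex crate (which is not visible in this diff), keeps the same behavior of dropping digits and double quotes and turning the listed punctuation into spaces; the function name is illustrative:

    use regex::Regex;

    // Sketch of the regex-based cleanup the TODO mentions (assumes the `regex` crate).
    fn prepare_regex(s: &str) -> String {
        // digits and double quotes are removed outright, as in the chained replaces above
        let drop = Regex::new(r#"["0-9]"#).unwrap();
        // the listed punctuation becomes a word boundary (a space)
        let space = Regex::new(r"[,:./&!?]").unwrap();
        let s = drop.replace_all(s, "");
        space.replace_all(&s, " ").into_owned()
    }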

// cost of 2 for add/remove, cost of 1 for replace
fn levenshtein_distance(a: &str, other: &str) -> usize {
    // debug!("Self: '{}', Other: '{}'", a, other);
    let mut dist = vec![0usize; other.len() + 1];
    let mut dist_prev = vec![0usize; other.len() + 1];

@@ -50,11 +118,6 @@ fn levenshtein_distance(a: &str, other: &str) -> usize {
                    cmp::min(dist_prev.get(j).unwrap(), dist_prev.get(j - 1).unwrap()));
            }
        }
        // let temp = dist_prev;
        // dist_prev = dist.clone();
        // dist = temp;
        // dist_prev = dist;
        // dist = vec![0usize; max_len];
        mem::swap(&mut dist, &mut dist_prev);
    }
    dist_prev[other.len()]
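Only the tail of the function is shown in this hunk; per the comment above it, substitutions cost 1 and insertions/deletions cost 2. As a standalone reference for that cost model (a sketch, not the committed implementation), the full recurrence looks like this:

    // Reference sketch of the stated cost model: 1 per replace, 2 per add/remove.
    fn weighted_levenshtein(a: &str, b: &str) -> usize {
        let (a, b): (Vec<char>, Vec<char>) = (a.chars().collect(), b.chars().collect());
        // prev[j] = cost of turning the empty prefix of `a` into the first j chars of `b`
        let mut prev: Vec<usize> = (0..=b.len()).map(|j| j * 2).collect();
        let mut cur = vec![0usize; b.len() + 1];
        for i in 1..=a.len() {
            cur[0] = i * 2; // delete the first i chars of `a`
            for j in 1..=b.len() {
                let replace = prev[j - 1] + usize::from(a[i - 1] != b[j - 1]);
                cur[j] = replace.min(prev[j] + 2).min(cur[j - 1] + 2);
            }
            std::mem::swap(&mut prev, &mut cur);
        }
        prev[b.len()]
    }
    // e.g. weighted_levenshtein("crab", "crib") == 1 (one replace)
    //      weighted_levenshtein("crab", "crabs") == 2 (one add)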