forked from asklyphe-public/asklyphe

Compare commits: spellcheck...develop
9 commits: 24067eca99, 3696d4cb6d, f06b84bf66, 43aae463e8, 87458f30b6, 64a771f8cc, bac21898c9, 245744a317, 109e20c7b4

4 changed files with 119 additions and 266 deletions
asklyphe-frontend/src/bangs.rs (new file, 106 additions)
@@ -0,0 +1,106 @@
use tracing::{debug, error};
use once_cell::sync::Lazy;
use std::collections::BTreeMap;
use url_encoded_data;

pub static BANG_PREFIX: &str = "!";

#[derive(Debug)]
struct Bang<'a> {
    pub url: &'a str,
    pub aliases: &'a [&'a str]
}

impl<'a> Bang<'_> {
    fn new(url: &'a str, aliases: &'a [&'a str]) -> Bang<'a> {
        Bang {url, aliases}
    }
}

static BUILTIN_BANGS: Lazy<BTreeMap<&str, Bang>> = Lazy::new(|| {
    let mut bangs = BTreeMap::new();
    bangs.insert("Google", Bang::new("https://google.com/search?q={}", &["g", "google"] as &[&str]));
    bangs.insert("DuckDuckGo", Bang::new("https://duckduckgo.com/?q={}", &["d", "ddg", "duckduckgo"] as &[&str]));
    bangs.insert("Wikipedia", Bang::new("https://wikipedia.org/w/index.php?search={}", &["w", "wiki", "wikipedia"] as &[&str]));
    bangs
});

#[derive(Debug, Clone)]
struct BangLoc<'b> {
    pub url: &'b str,
    pub start_idx: usize,
    pub len: usize
}

impl<'b> BangLoc<'_> {
    fn new(url: &'b str, start_idx: usize, len: usize) -> BangLoc<'b> {
        BangLoc {url, start_idx, len}
    }
}

pub fn redirect_bang(query: &String) -> Option<String> {
    if !query.contains(BANG_PREFIX) {
        return None;
    }
    let bangs = query.match_indices(BANG_PREFIX).filter(|(bang_start_idx, _)| {
        if *bang_start_idx == 0 || query.chars().nth(*bang_start_idx - 1).unwrap().is_whitespace() {
            true
        } else {
            false
        }
    }).map(|(bang_start_idx, _)| {
        let rest = query.get(bang_start_idx + 1..query.len()).unwrap();
        BUILTIN_BANGS.iter().map(|(_, bang)| {
            let alias = bang.aliases.iter()
                .filter(|alias| rest.starts_with(**alias))
                .filter(
                    |alias| rest.chars()
                        .nth(alias.len())
                        .unwrap_or(' ')
                        .is_whitespace())
                .max_by(|a, b| a.len().cmp(&b.len()))?;
            Some(BangLoc::new(bang.url, bang_start_idx, alias.len()))
        }).filter(|bang| bang.is_some()).map(|bang| bang.unwrap()).next()
    }).filter(|bang| bang.is_some())
        .map(|bang| bang.unwrap())
        .collect::<Vec<_>>();

    let bang = bangs.first()?;
    let end_idx = {
        let mut end_idx = bang.start_idx + 1 + bang.len;
        if end_idx < query.len() {
            end_idx += 1;
        }
        end_idx
    };

    let start_idx = if end_idx == query.len() && bang.start_idx > 0 {
        bang.start_idx - 1
    } else {
        bang.start_idx
    };

    let query_split = query.split_once(query.get(start_idx..end_idx).unwrap()).unwrap();
    let query_trimmed = format!("{}{}", query_split.0, query_split.1);

    // A hack to get URL escaping without using a proper URL layout, hopefully has no other issues apart from prepending '=' to the string
    let query_encoded = url_encoded_data::stringify(&[("", query_trimmed.as_str())]);
    let query_encoded = query_encoded.get(1..query_encoded.len()).unwrap().to_owned();

    let bang_url_split = bang.url.split_once("{}").unwrap();
    let bang_url = format!(
        "{}{}{}",
        bang_url_split.0,
        query_encoded,
        bang_url_split.1
    );

    Some(bang_url)
}
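Taken end to end, a query like "!g rust lifetimes" matches the Google bang's "g" alias, strips the "!g " token, and substitutes the rest of the query into the URL template. A minimal usage sketch; the exact escaping of the space is up to url_encoded_data, so the URL in the comment is an assumption:

    let redirect = bangs::redirect_bang(&String::from("!g rust lifetimes"));
    // e.g. Some("https://google.com/search?q=rust%20lifetimes"), where the exact
    // encoding of the space is whatever url_encoded_data::stringify produces
    assert!(redirect.is_some());

    // A '!' that does not begin a whitespace-delimited token is ignored.
    assert_eq!(bangs::redirect_bang(&String::from("rust!")), None);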
@@ -14,7 +14,7 @@
pub mod searchbot;
pub mod wikipedia;
pub mod unit_converter;
pub mod spellcheck;
pub mod bangs;
pub mod routes;

use std::{env, process};
@@ -18,8 +18,7 @@ use crate::unit_converter;
use crate::unit_converter::UnitConversion;
use crate::wikipedia::WikipediaSummary;
use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR};
use crate::spellcheck;
use crate::spellcheck::SpellCheckResults;
use crate::bangs;
use askama::Template;
use asklyphe_common::nats;
use asklyphe_common::nats::bingservice::{
@@ -70,7 +69,6 @@ pub struct Complications {
    disabled: bool,
    wikipedia: Option<WikipediaSummary>,
    unit_converter: Option<UnitConversion>,
    spellcheck: Option<SpellCheckResults>,
}

pub async fn search(
@@ -161,21 +159,23 @@ pub async fn search_js(
    let mut complications = Complications::default();
    // todo: better way of specifying that user doesn't want complications
    if !query.contains("-complications") {
        /*let mut wikiquery = query.clone().to_lowercase();
        let mut wikiquery = query.clone().to_lowercase();
        wikiquery.retain(|c| c.is_alphanumeric() || c.is_ascii_whitespace());
        wikiquery = wikiquery.replace(' ', "%20");
        // todo: proper url escaping
        let wikipedia_comp =
            tokio::spawn(async move { wikipedia::get_wikipedia_page(&wikiquery, 20).await });
        complications.wikipedia = wikipedia_comp.await.unwrap_or_default();*/
        complications.wikipedia = wikipedia_comp.await.unwrap_or_default();

        let mut unit_query = query.clone().to_lowercase();
        unit_query = unit_query.replace("metre", "meter");
        let unit_comp = unit_converter::convert_unit(&unit_query);
        complications.unit_converter = unit_comp;

        let corrections = spellcheck::check(&query);
        complications.spellcheck = corrections;
        let bang_redirect = bangs::redirect_bang(&query);
        if let Some(redirect) = bang_redirect {
            return Redirect::to(&redirect).into_response();
        }
    } else {
        complications.disabled = true;
        query = query.replace("-complications", "");
@@ -288,6 +288,11 @@ pub async fn search_nojs(
        unit_query = unit_query.replace("metre", "meter");
        let unit_comp = unit_converter::convert_unit(&unit_query);
        complications.unit_converter = unit_comp;

        let bang_redirect = bangs::redirect_bang(&query);
        if let Some(redirect) = bang_redirect {
            return Redirect::to(&redirect).into_response();
        }
    } else {
        complications.disabled = true;
        query = query.replace("-complications", "");
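Both search_js and search_nojs splice the bang check in ahead of the complication work, so a matched bang short-circuits into an HTTP redirect before any search runs. The same pattern in a freestanding axum handler (this handler and its signature are illustrative, not the actual AskLyphe route):

    use axum::response::{IntoResponse, Redirect, Response};

    // Hypothetical handler showing the short-circuit: a recognized bang
    // returns a redirect to the target engine instead of rendering results.
    async fn search(query: String) -> Response {
        if let Some(redirect) = bangs::redirect_bang(&query) {
            return Redirect::to(&redirect).into_response();
        }
        format!("results for {query}").into_response()
    }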
@@ -1,258 +0,0 @@
use once_cell::sync::Lazy;
use tracing::{debug, error};
use std::{cmp, mem};
use std::collections::BTreeMap;
use std::sync::Mutex;

// how to generate words.txt:
// clone https://github.com/en-wl/wordlist && cd wordlist
// make
// ./scowl wl --deaccent > words0.txt
// filtered with this python script:
// -----------------------------------
// with open("words0.txt", "r") as f:
//     out = []
//     for line in f:
//         line = line.lower()
//         if not line in out:
//             out.append(line)
// out.sort()
// with open("words.txt", "w") as out_file:
//     for line in out:
//         out_file.write(f'{line}')
// ------------------------------------
// then use regex or similar to enclose every line in quotes and add comma, then add 'static KNOWN_WORDS: &[&str] = &[' to the start and '];' to the end
include!("./words.txt");

// a cache of misspelled words and the closest match in the database
static MATCH_CACHE: Lazy<Mutex<BTreeMap<String, Option<&str>>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));

// max distance before no alternatives are considered
const MAX_DISTANCE: usize = 6;
// max input text size before spellcheck is not run. on my laptop 13,000 chars of input takes around 4 seconds so this should be fine
// update: got a larger word database and it doesn't take 4 seconds anymore lmao
// update 2: added binary search & caching and now 50000 chars takes ~2-4 seconds
const MAX_QUERY_WORDS: usize = 512;
// Not really a huge issue, just used to hopefully reduce the allocations made in levenshtein_distance & provide minor performance improvements
// not needed for now
// const MAX_WORD_SIZE: usize = 64;

pub type SpellCheckResults = Vec<SpellCheckResult>;

#[derive(Debug)]
pub struct SpellCheckResult {
    pub orig: String,
    pub correction: &'static str,
}

pub fn check(query: &String) -> Option<SpellCheckResults> {
    error!("Query: {}", query);
    /*let query: &str = {
        if query.len() > MAX_QUERY_SIZE {
            error!("Query is too large to be spell checked, only checking first {} chars", MAX_QUERY_SIZE);
            query.get(0..MAX_QUERY_SIZE).unwrap()
            // return None;
        } else {
            query
        }
    };*/

    // TODO: look into how 'wc -w' counts words and copy how it splits things
    let query_flattened = prepare(query);
    let words = query_flattened
        .split_whitespace()
        .filter(|word| word.len() > 0)
        // .filter(|word|)
        .collect::<Vec<_>>();

    error!("Words in query: {}", words.len());

    if (words.len() > MAX_QUERY_WORDS) {
        error!("{} is too many words in query to spell check", words.len());
        // return None;
    }

    let mut distances: SpellCheckResults = vec![];
    for qword in words {
        // error!("Word: {}", qword);
        // error!("is known: {:?}", KNOWN_WORDS.binary_search(&qword));
        if KNOWN_WORDS.binary_search(&qword).is_ok() {
            // error!("Exact word match: {}", qword);
        } else {
            let mut cache = MATCH_CACHE.lock().unwrap();
            if cache.contains_key(qword) {
                // We don't need to tell the user if there is no suggestion for an unknown word
                if (cache.get(qword).unwrap().is_some()) {
                    // TODO: don't push duplicate misspelled words
                    distances.push(SpellCheckResult{orig: qword.to_owned(), correction: cache.get(qword).unwrap().unwrap()});
                }
            } else {
                let closest_match = KNOWN_WORDS.iter()
                    .map(|kword| (kword, levenshtein_distance(&qword, &kword)))
                    .min_by(|a, b| a.1.cmp(&b.1)).unwrap();

                assert!(closest_match.1 > 0, "Found exact match not caught by binary search, is the word database properly sorted?");

                if closest_match.1 <= MAX_DISTANCE {
                    cache.insert(qword.to_owned(), Some(*closest_match.0));
                    distances.push(SpellCheckResult{orig: qword.to_owned(), correction: *closest_match.0});
                } else {
                    // even though there is no close enough match, cache it anyway so that it doesn't have to be looked up every time
                    cache.insert(qword.to_owned(), None);
                }
            }
        }
        // error!("End");
    }
    error!("Spell check results:");
    for word in &distances {
        debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
    }

    if distances.len() > 0 {
        Some(distances)
    } else {
        None
    }

    /* let distances = prepare(query).split_whitespace()
        .filter(|qword| qword.len() > 0)
        .map(|qword| qword.to_lowercase())
        .map(
            |qword| {
                let mut exact_match = false;
                KNOWN_WORDS.iter()
                    .map(|kword| kword.to_lowercase())
                    .map(
                        |kword|
                            (qword.clone(), kword.clone(), levenshtein_distance(&qword, &kword)))
                    // totally isn't jank at all and is the best solution totally
                    .take_while(|val| if exact_match {false} else if val.2 == 0 {exact_match = true; true} else {true})
                    // .map(|val| {error!("Val: {:?}", val); val})
                    .min_by(|a, b| a.2.cmp(&b.2)).unwrap()
            })/*.filter_map(|word| word)*/.filter(|word| word.2 > 0 && word.2 <= MAX_DISTANCE)/*.filter(|(_, _, dist)| *dist > 0 && *dist <= MAX_DISTANCE)*/.map(|word| SpellCheckResult{orig: word.0, correction: word.1.to_owned().to_owned()})/*.filter(|(_, _, d)| *d > 0)*/
        .map(|word| {
            debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
            word
        })
        .collect::<Vec<_>>();
    /*for word in &distances {
        debug!("instead of '{}' did you mean '{}'? (distance of )", word.0, word.1/*, word.2*/);
    }*/

    if distances.len() > 0 {
        Some(distances)
    } else {
        None
    }*/
    // None
    // vec![]
}

// TODO: handle symbols better, probably with a regex
fn prepare(s: &str) -> String {
    s.replace("\"", "")
        .replace(",", " ")
        .replace(":", " ")
        .replace(".", " ")
        .replace("/", " ")
        .replace("&", " ")
        .replace("!", " ")
        .replace("?", " ")
        /*.replace("'", "")*/
        .replace("0", "")
        .replace("1", "")
        .replace("2", "")
        .replace("3", "")
        .replace("4", "")
        .replace("5", "")
        .replace("6", "")
        .replace("7", "")
        .replace("8", "")
        .replace("9", "")
        .to_lowercase()
}

// cost of 2 for add/remove, cost of 1 for replace
fn levenshtein_distance(a: &str, other: &str) -> usize {
    // debug!("Self: '{}', Other: '{}'", a, other);
    // let mut dist: &mut [usize; MAX_WORD_SIZE] = &mut [0usize; MAX_WORD_SIZE];
    // let mut dist_prev: &mut [usize; MAX_WORD_SIZE] = &mut [0usize; MAX_WORD_SIZE];

    let mut dist = vec![0usize; other.len() + 1];
    let mut dist_prev = vec![0usize; other.len() + 1];

    for i in 0..=other.len() {
        dist_prev[i] = i;
    }

    for i in 1..=a.len() {
        dist[0] = i;

        for j in 1..=other.len() {
            if a.get(i - 1..i).unwrap() == other.get(j - 1..j).unwrap() {
                dist[j] = dist_prev[j - 1];
            } else {
                // TODO: make addition/subtraction 1 more expensive than replacement, presumably by adding '+ 1' to 2/3 of these
                // motivation: honex from bee movie script is turned into hone instead of honey, this will also generally improve results & is what wikipedia says to do (best reason)
                dist[j] = 1 + cmp::min(
                    *dist.get(j - 1).unwrap() + 1,
                    cmp::min(*dist_prev.get(j).unwrap() + 1, *dist_prev.get(j - 1).unwrap()));
            }
        }
        mem::swap(&mut dist, &mut dist_prev);
    }
    dist_prev[other.len()]

    /*let mut distances = vec![vec![0usize; other.len() + 1]; a.len() + 1];
    for i in 1..=a.len() {
        distances[i][0] = i;
    }

    for j in 1..=other.len() {
        distances[0][j] = j;
    }

    /*unsafe {
        for i in 1..=a.len() {
            for j in 1..=other.len() {
                if *a.get_unchecked(i - 1..i) == *other.get_unchecked(j - 1..j) {
                    // 0
                    distances[i][j] = *distances.get_unchecked(i - 1).get_unchecked(j - 1);
                } else {
                    // 1
                    distances[i][j] = 1 + cmp::min(
                        (*distances.get_unchecked(i - 1).get_unchecked(j - 1)),
                        cmp::min(
                            (*distances.get_unchecked(i - 1).get_unchecked(j)),
                            (*distances.get_unchecked(i).get_unchecked(j - 1))
                        )
                    );
                }
            }
        }
    }*/

    for i in 1..=a.len() {
        for j in 1..=other.len() {
            if a.get(i - 1..i).unwrap() == other.get(j - 1..j).unwrap() {
                // 0
                distances[i][j] = *distances.get(i - 1).unwrap().get(j - 1).unwrap();
            } else {
                // 1
                distances[i][j] = 1 + cmp::min(
                    (*distances.get(i - 1).unwrap().get(j - 1).unwrap()),
                    cmp::min(
                        (*distances.get(i - 1).unwrap().get(j).unwrap()),
                        (*distances.get(i).unwrap().get(j - 1).unwrap())
                    )
                );
            }
        }
    }
    *distances.get(a.len()).unwrap().get(other.len()).unwrap()*/
}
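check() works per word: normalize with prepare, split on whitespace, accept exact hits via binary search over KNOWN_WORDS, and otherwise suggest the nearest dictionary word by edit distance, memoizing misses in MATCH_CACHE. A toy sketch of that per-word lookup, reusing levenshtein_distance and MAX_DISTANCE from this file (the three-word dictionary is invented for illustration):

    static TOY_WORDS: &[&str] = &["hello", "honey", "world"]; // must be sorted for binary_search

    // Exact hits return None (nothing to correct); misses fall back to a
    // linear minimum-edit-distance scan, as check() does for each word.
    fn correct(word: &str) -> Option<&'static str> {
        if TOY_WORDS.binary_search(&word).is_ok() {
            return None;
        }
        TOY_WORDS.iter()
            .map(|kword| (*kword, levenshtein_distance(word, kword)))
            .min_by_key(|&(_, dist)| dist)
            .filter(|&(_, dist)| dist <= MAX_DISTANCE)
            .map(|(kword, _)| kword)
    }

    // correct("honex") == Some("honey"); correct("hello") == None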
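Under the final cost model (1 for a substitution, 2 for an insertion or deletion, the two '+ 1' terms above), "honex" is distance 1 from "honey" (one substitution) but distance 2 from "hone" (one deletion), which is exactly the behavior the TODO comment in levenshtein_distance asks for. Some concrete values, assuming the function above:

    assert_eq!(levenshtein_distance("honex", "honey"), 1); // substitute x -> y
    assert_eq!(levenshtein_distance("honex", "hone"), 2);  // delete the trailing x
    assert_eq!(levenshtein_distance("abc", "abc"), 0);     // identical strings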