forked from asklyphe-public/asklyphe
		
	Compare commits
	
		
			2 commits
		
	
	
		
			0c10b15447
			...
			7e7079dd42
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 7e7079dd42 | |||
| b4112c311c | 
					 3 changed files with 185 additions and 2 deletions
				
			
		| 
						 | 
					@ -14,6 +14,7 @@
 | 
				
			||||||
pub mod searchbot;
 | 
					pub mod searchbot;
 | 
				
			||||||
pub mod wikipedia;
 | 
					pub mod wikipedia;
 | 
				
			||||||
pub mod unit_converter;
 | 
					pub mod unit_converter;
 | 
				
			||||||
 | 
					pub mod spellcheck;
 | 
				
			||||||
pub mod routes;
 | 
					pub mod routes;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
use std::{env, process};
 | 
					use std::{env, process};
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -18,6 +18,8 @@ use crate::unit_converter;
 | 
				
			||||||
use crate::unit_converter::UnitConversion;
 | 
					use crate::unit_converter::UnitConversion;
 | 
				
			||||||
use crate::wikipedia::WikipediaSummary;
 | 
					use crate::wikipedia::WikipediaSummary;
 | 
				
			||||||
use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR};
 | 
					use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR};
 | 
				
			||||||
 | 
					use crate::spellcheck;
 | 
				
			||||||
 | 
					use crate::spellcheck::SpellCheckResults;
 | 
				
			||||||
use askama::Template;
 | 
					use askama::Template;
 | 
				
			||||||
use asklyphe_common::nats;
 | 
					use asklyphe_common::nats;
 | 
				
			||||||
use asklyphe_common::nats::bingservice::{
 | 
					use asklyphe_common::nats::bingservice::{
 | 
				
			||||||
| 
						 | 
					@ -68,6 +70,7 @@ pub struct Complications {
 | 
				
			||||||
    disabled: bool,
 | 
					    disabled: bool,
 | 
				
			||||||
    wikipedia: Option<WikipediaSummary>,
 | 
					    wikipedia: Option<WikipediaSummary>,
 | 
				
			||||||
    unit_converter: Option<UnitConversion>,
 | 
					    unit_converter: Option<UnitConversion>,
 | 
				
			||||||
 | 
					    spellcheck: Option<SpellCheckResults>,
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
pub async fn search(
 | 
					pub async fn search(
 | 
				
			||||||
| 
						 | 
					@ -158,18 +161,21 @@ pub async fn search_js(
 | 
				
			||||||
        let mut complications = Complications::default();
 | 
					        let mut complications = Complications::default();
 | 
				
			||||||
        // todo: better way of specifying that user doesn't want complications
 | 
					        // todo: better way of specifying that user doesn't want complications
 | 
				
			||||||
        if !query.contains("-complications") {
 | 
					        if !query.contains("-complications") {
 | 
				
			||||||
            let mut wikiquery = query.clone().to_lowercase();
 | 
					            /*let mut wikiquery = query.clone().to_lowercase();
 | 
				
			||||||
            wikiquery.retain(|c| c.is_alphanumeric() || c.is_ascii_whitespace());
 | 
					            wikiquery.retain(|c| c.is_alphanumeric() || c.is_ascii_whitespace());
 | 
				
			||||||
            wikiquery = wikiquery.replace(' ', "%20");
 | 
					            wikiquery = wikiquery.replace(' ', "%20");
 | 
				
			||||||
            // todo: proper url escaping
 | 
					            // todo: proper url escaping
 | 
				
			||||||
            let wikipedia_comp =
 | 
					            let wikipedia_comp =
 | 
				
			||||||
                tokio::spawn(async move { wikipedia::get_wikipedia_page(&wikiquery, 20).await });
 | 
					                tokio::spawn(async move { wikipedia::get_wikipedia_page(&wikiquery, 20).await });
 | 
				
			||||||
            complications.wikipedia = wikipedia_comp.await.unwrap_or_default();
 | 
					            complications.wikipedia = wikipedia_comp.await.unwrap_or_default();*/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            let mut unit_query = query.clone().to_lowercase();
 | 
					            let mut unit_query = query.clone().to_lowercase();
 | 
				
			||||||
            unit_query = unit_query.replace("metre", "meter");
 | 
					            unit_query = unit_query.replace("metre", "meter");
 | 
				
			||||||
            let unit_comp = unit_converter::convert_unit(&unit_query);
 | 
					            let unit_comp = unit_converter::convert_unit(&unit_query);
 | 
				
			||||||
            complications.unit_converter = unit_comp;
 | 
					            complications.unit_converter = unit_comp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            let corrections = spellcheck::check(&query);
 | 
				
			||||||
 | 
					            complications.spellcheck = corrections;
 | 
				
			||||||
        } else {
 | 
					        } else {
 | 
				
			||||||
            complications.disabled = true;
 | 
					            complications.disabled = true;
 | 
				
			||||||
            query = query.replace("-complications", "");
 | 
					            query = query.replace("-complications", "");
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										176
									
								
								asklyphe-frontend/src/spellcheck.rs
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										176
									
								
								asklyphe-frontend/src/spellcheck.rs
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,176 @@
 | 
				
			||||||
 | 
					use tracing::{debug, error};
 | 
				
			||||||
 | 
					use std::{cmp, mem};
 | 
				
			||||||
 | 
					// TODO: cache distances of strings/substrings
 | 
				
			||||||
 | 
					// TODO: use binary search to find direct matches, and if that fails, calculate and cache the result in BTreeMap<word: String, closest_match: String>
 | 
				
			||||||
 | 
					// TODO: limit by number of words and word length, not max chars, and use code more like this for better readability & async:
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
						let words = prepare(query).split_whitespace()
 | 
				
			||||||
 | 
							.filter(|qword| qword.len() > 0)
 | 
				
			||||||
 | 
							.map(|qword| qword.to_lowercase());
 | 
				
			||||||
 | 
						for word in words { // it might need to be while let Some(word) = words.next()
 | 
				
			||||||
 | 
							tokio::spawn(levenshtein_distance(...))
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					include!("./words.txt");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// max distance before no alternatives are considered
 | 
				
			||||||
 | 
					const MAX_DISTANCE: usize = 6;
 | 
				
			||||||
 | 
					// max input text size (in bytes) that is spell checked; longer queries are truncated to this size first. on my laptop 13,000 chars of input takes around 4 seconds so this should be fine
 | 
				
			||||||
 | 
					// update: got a larger word database and it doesn't take 4 seconds anymore lmao
 | 
				
			||||||
 | 
					const MAX_QUERY_SIZE: usize = 1024;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					pub type SpellCheckResults = Vec<SpellCheckResult>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// A single suggested correction for one word of a query.
					#[derive(Debug, Clone, PartialEq, Eq)]
					pub struct SpellCheckResult {
						/// The word as it appeared in the query (lowercased by `check`).
						pub orig: String,
						/// The closest known word that is suggested instead of `orig`.
						pub correction: String,
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					pub fn check(query: &String) -> Option<SpellCheckResults> {
 | 
				
			||||||
 | 
						error!("Query: {}", query);
 | 
				
			||||||
 | 
						let query: &str = {
 | 
				
			||||||
 | 
							if query.len() > MAX_QUERY_SIZE {
 | 
				
			||||||
 | 
								error!("Query is too large to be spell checked, only checking first {} chars", MAX_QUERY_SIZE);
 | 
				
			||||||
 | 
								query.get(0..MAX_QUERY_SIZE).unwrap()
 | 
				
			||||||
 | 
								// return None;
 | 
				
			||||||
 | 
							} else {
 | 
				
			||||||
 | 
								query
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						let distances = prepare(query).split_whitespace()
 | 
				
			||||||
 | 
							.filter(|qword| qword.len() > 0)
 | 
				
			||||||
 | 
							.map(|qword| qword.to_lowercase())
 | 
				
			||||||
 | 
							.map(
 | 
				
			||||||
 | 
								|qword| {
 | 
				
			||||||
 | 
									let mut exact_match = false;
 | 
				
			||||||
 | 
									KNOWN_WORDS.iter()
 | 
				
			||||||
 | 
										.map(|kword| kword.to_lowercase())
 | 
				
			||||||
 | 
										.map(
 | 
				
			||||||
 | 
											|kword|
 | 
				
			||||||
 | 
											(qword.clone(), kword.clone(), levenshtein_distance(&qword, &kword)))
 | 
				
			||||||
 | 
												// totally isn't jank at all and is the best solution totally
 | 
				
			||||||
 | 
												.take_while(|val| if exact_match {false} else if val.2 == 0 {exact_match = true; true} else {true})
 | 
				
			||||||
 | 
												// .map(|val| {error!("Val: {:?}", val); val})
 | 
				
			||||||
 | 
												.min_by(|a, b| a.2.cmp(&b.2)).unwrap()
 | 
				
			||||||
 | 
								})/*.filter_map(|word| word)*/.filter(|word| word.2 > 0 && word.2 <= MAX_DISTANCE)/*.filter(|(_, _, dist)| *dist > 0 && *dist <= MAX_DISTANCE)*/.map(|word| SpellCheckResult{orig: word.0, correction: word.1.to_owned().to_owned()})/*.filter(|(_, _, d)| *d > 0)*/
 | 
				
			||||||
 | 
							.map(|word| {
 | 
				
			||||||
 | 
								debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
 | 
				
			||||||
 | 
								word
 | 
				
			||||||
 | 
							})
 | 
				
			||||||
 | 
							.collect::<Vec<_>>();
 | 
				
			||||||
 | 
						/*for word in &distances {
 | 
				
			||||||
 | 
							debug!("instead of '{}' did you mean '{}'? (distance of )", word.0, word.1/*, word.2*/);
 | 
				
			||||||
 | 
						}*/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if distances.len() > 0 {
 | 
				
			||||||
 | 
							Some(distances)
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							None
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						// vec![]
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// TODO: handle symbols better, probably with a regex
 | 
				
			||||||
 | 
					/// Normalizes raw query text before it is split into words.
					///
					/// Double quotes and ASCII digits are dropped entirely, while the separators
					/// `, : . / & ! ?` are each turned into a single space. Every other character
					/// (including apostrophes) is kept as is.
					fn prepare(s: &str) -> String {
						let mut cleaned = String::with_capacity(s.len());
						for ch in s.chars() {
							match ch {
								// removed outright
								'"' | '0'..='9' => {}
								// treated as word separators
								',' | ':' | '.' | '/' | '&' | '!' | '?' => cleaned.push(' '),
								keep => cleaned.push(keep),
							}
						}
						cleaned
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Computes the Levenshtein edit distance between `a` and `other`.
					///
					/// Insertions, deletions and substitutions each cost 1. The comparison is done
					/// per `char`, so non-ASCII input is handled instead of panicking on a slice
					/// that splits a multi-byte character.
					///
					/// Only two rows of the distance table are kept at any time.
					fn levenshtein_distance(a: &str, other: &str) -> usize {
						let other: Vec<char> = other.chars().collect();

						// prev[j] = distance between the already processed prefix of `a` and
						// the first j chars of `other`; starts as the row for the empty prefix.
						let mut prev: Vec<usize> = (0..=other.len()).collect();
						let mut cur = vec![0usize; other.len() + 1];

						for (i, ca) in a.chars().enumerate() {
							// Turning i + 1 chars of `a` into the empty string takes i + 1 deletions.
							cur[0] = i + 1;

							for (j, &cb) in other.iter().enumerate() {
								cur[j + 1] = if ca == cb {
									prev[j]
								} else {
									1 + cmp::min(cur[j], cmp::min(prev[j + 1], prev[j]))
								};
							}
							mem::swap(&mut cur, &mut prev);
						}
						// After the final swap the last computed row lives in `prev`.
						prev[other.len()]
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		
		Reference in a new issue