forked from asklyphe-public/asklyphe

Compare commits: spellcheck...develop
9 commits: 24067eca99, 3696d4cb6d, f06b84bf66, 43aae463e8, 87458f30b6, 64a771f8cc, bac21898c9, 245744a317, 109e20c7b4

4 changed files with 119 additions and 266 deletions
asklyphe-frontend/src/bangs.rs (new file, 106 additions)
@@ -0,0 +1,106 @@
use tracing::{debug, error};
use once_cell::sync::Lazy;
use std::collections::BTreeMap;
use url_encoded_data;

pub static BANG_PREFIX: &str = "!";

#[derive(Debug)]
struct Bang<'a> {
    pub url: &'a str,
    pub aliases: &'a [&'a str]
}

impl<'a> Bang<'_> {
    fn new(url: &'a str, aliases: &'a [&'a str]) -> Bang<'a> {
        Bang {url, aliases}
    }
}

static BUILTIN_BANGS: Lazy<BTreeMap<&str, Bang>> = Lazy::new(|| {
    let mut bangs = BTreeMap::new();
    bangs.insert("Google", Bang::new("https://google.com/search?q={}", &["g", "google"] as &[&str]));
    bangs.insert("DuckDuckGo", Bang::new("https://duckduckgo.com/?q={}", &["d", "ddg", "duckduckgo"] as &[&str]));
    bangs.insert("Wikipedia", Bang::new("https://wikipedia.org/w/index.php?search={}", &["w", "wiki", "wikipedia"] as &[&str]));
    bangs
});

#[derive(Debug, Clone)]
struct BangLoc<'b> {
    pub url: &'b str,
    pub start_idx: usize,
    pub len: usize
}

impl<'b> BangLoc<'_> {
    fn new(url: &'b str, start_idx: usize, len: usize) -> BangLoc<'b> {
        BangLoc {url, start_idx, len}
    }
}

pub fn redirect_bang(query: &String) -> Option<String> {
    if !query.contains(BANG_PREFIX) {
        return None;
    }
    let bangs = query.match_indices(BANG_PREFIX).filter(|(bang_start_idx, _)| {
        if *bang_start_idx == 0 || query.chars().nth(*bang_start_idx - 1).unwrap().is_whitespace() {
            true
        } else {
            false
        }
    }).map(|(bang_start_idx, _)| {
        let rest = query.get(bang_start_idx + 1..query.len()).unwrap();
        BUILTIN_BANGS.iter().map(|(_, bang)| {
            let alias = bang.aliases.iter()
                .filter(|alias| rest.starts_with(**alias))
                .filter(
                    |alias| rest.chars()
                        .nth(alias.len())
                        .unwrap_or(' ')
                        .is_whitespace())
                .max_by(|a, b| a.len().cmp(&b.len()))?;
            Some(BangLoc::new(bang.url, bang_start_idx, alias.len()))
        }).filter(|bang| bang.is_some()).map(|bang| bang.unwrap()).next()
    }).filter(|bang| bang.is_some())
        .map(|bang| bang.unwrap())
        .collect::<Vec<_>>();

    let bang = bangs.first()?;
    let end_idx = {
        let mut end_idx = bang.start_idx + 1 + bang.len;
        if end_idx < query.len() {
            end_idx += 1;
        }
        end_idx
    };

    let start_idx = if end_idx == query.len() && bang.start_idx > 0 {
        bang.start_idx - 1
    } else {
        bang.start_idx
    };

    let query_split = query.split_once(query.get(start_idx..end_idx).unwrap()).unwrap();
    let query_trimmed = format!("{}{}", query_split.0, query_split.1);

    // A hack to get URL escaping without using a proper URL layout, hopefully has no other issues apart from prepending '=' to the string
    let query_encoded = url_encoded_data::stringify(&[("", query_trimmed.as_str())]);
    let query_encoded = query_encoded.get(1..query_encoded.len()).unwrap().to_owned();

    let bang_url_split = bang.url.split_once("{}").unwrap();
    let bang_url = format!(
        "{}{}{}",
        bang_url_split.0,
        query_encoded,
        bang_url_split.1
    );

    Some(bang_url)
}
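Taken end to end, a query like "!g rust lifetimes" matches the Google bang's "g" alias, strips the "!g " token, and substitutes the rest of the query into the URL template. A minimal usage sketch; the exact escaping of the space is up to url_encoded_data, so the URL in the comment is an assumption:

    let redirect = bangs::redirect_bang(&String::from("!g rust lifetimes"));
    // e.g. Some("https://google.com/search?q=rust%20lifetimes"), where the exact
    // encoding of the space is whatever url_encoded_data::stringify produces
    assert!(redirect.is_some());

    // A '!' that does not begin a whitespace-delimited token is ignored.
    assert_eq!(bangs::redirect_bang(&String::from("rust!")), None);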
@@ -14,7 +14,7 @@
pub mod searchbot;
pub mod wikipedia;
pub mod unit_converter;
pub mod spellcheck;
pub mod bangs;
pub mod routes;

use std::{env, process};
@@ -18,8 +18,7 @@ use crate::unit_converter;
use crate::unit_converter::UnitConversion;
use crate::wikipedia::WikipediaSummary;
use crate::{wikipedia, Opts, ALPHA, BUILT_ON, GIT_COMMIT, VERSION, YEAR};
use crate::spellcheck;
use crate::spellcheck::SpellCheckResults;
use crate::bangs;
use askama::Template;
use asklyphe_common::nats;
use asklyphe_common::nats::bingservice::{
@@ -70,7 +69,6 @@ pub struct Complications {
    disabled: bool,
    wikipedia: Option<WikipediaSummary>,
    unit_converter: Option<UnitConversion>,
    spellcheck: Option<SpellCheckResults>,
}

pub async fn search(
@@ -161,21 +159,23 @@ pub async fn search_js(
    let mut complications = Complications::default();
    // todo: better way of specifying that user doesn't want complications
    if !query.contains("-complications") {
        /*let mut wikiquery = query.clone().to_lowercase();
        let mut wikiquery = query.clone().to_lowercase();
        wikiquery.retain(|c| c.is_alphanumeric() || c.is_ascii_whitespace());
        wikiquery = wikiquery.replace(' ', "%20");
        // todo: proper url escaping
        let wikipedia_comp =
            tokio::spawn(async move { wikipedia::get_wikipedia_page(&wikiquery, 20).await });
        complications.wikipedia = wikipedia_comp.await.unwrap_or_default();*/
        complications.wikipedia = wikipedia_comp.await.unwrap_or_default();

        let mut unit_query = query.clone().to_lowercase();
        unit_query = unit_query.replace("metre", "meter");
        let unit_comp = unit_converter::convert_unit(&unit_query);
        complications.unit_converter = unit_comp;

        let corrections = spellcheck::check(&query);
        complications.spellcheck = corrections;
        let bang_redirect = bangs::redirect_bang(&query);
        if let Some(redirect) = bang_redirect {
            return Redirect::to(&redirect).into_response();
        }
    } else {
        complications.disabled = true;
        query = query.replace("-complications", "");
@@ -288,6 +288,11 @@ pub async fn search_nojs(
        unit_query = unit_query.replace("metre", "meter");
        let unit_comp = unit_converter::convert_unit(&unit_query);
        complications.unit_converter = unit_comp;

        let bang_redirect = bangs::redirect_bang(&query);
        if let Some(redirect) = bang_redirect {
            return Redirect::to(&redirect).into_response();
        }
    } else {
        complications.disabled = true;
        query = query.replace("-complications", "");
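Both search_js and search_nojs splice the bang check in ahead of the complication work, so a matched bang short-circuits into an HTTP redirect before any search runs. The same pattern in a freestanding axum handler (this handler and its signature are illustrative, not the actual AskLyphe route):

    use axum::response::{IntoResponse, Redirect, Response};

    // Hypothetical handler showing the short-circuit: a recognized bang
    // returns a redirect to the target engine instead of rendering results.
    async fn search(query: String) -> Response {
        if let Some(redirect) = bangs::redirect_bang(&query) {
            return Redirect::to(&redirect).into_response();
        }
        format!("results for {query}").into_response()
    }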
@@ -1,258 +0,0 @@
use once_cell::sync::Lazy;
use tracing::{debug, error};
use std::{cmp, mem};
use std::collections::BTreeMap;
use std::sync::Mutex;

// how to generate words.txt:
// clone https://github.com/en-wl/wordlist && cd wordlist
// make
// ./scowl wl --deaccent > words0.txt
// filtered with this python script:
// -----------------------------------
// with open("words0.txt", "r") as f:
//     out = []
//     for line in f:
//         line = line.lower()
//         if not line in out:
//             out.append(line)
// out.sort()
// with open("words.txt", "w") as out_file:
//     for line in out:
//         out_file.write(f'{line}')
// ------------------------------------
// then use regex or similar to enclose every line in quotes and add comma, then add 'static KNOWN_WORDS: &[&str] = &[' to the start and '];' to the end
include!("./words.txt");

// a cache of misspelled words and the closest match in the database
static MATCH_CACHE: Lazy<Mutex<BTreeMap<String, Option<&str>>>> = Lazy::new(|| Mutex::new(BTreeMap::new()));

// max distance before no alternatives are considered
const MAX_DISTANCE: usize = 6;
// max input text size before spellcheck is not run. on my laptop 13,000 chars of input takes around 4 seconds so this should be fine
// update: got a larger word database and it doesn't take 4 seconds anymore lmao
// update 2: added binary search & caching and now 50000 chars takes ~2-4 seconds
const MAX_QUERY_WORDS: usize = 512;
// Not really a huge issue, just used to hopefully reduce the allocations made in levenshtein_distance & provide minor performance improvements
// not needed for now
// const MAX_WORD_SIZE: usize = 64;

pub type SpellCheckResults = Vec<SpellCheckResult>;

#[derive(Debug)]
pub struct SpellCheckResult {
    pub orig: String,
    pub correction: &'static str,
}

pub fn check(query: &String) -> Option<SpellCheckResults> {
    error!("Query: {}", query);
    /*let query: &str = {
        if query.len() > MAX_QUERY_SIZE {
            error!("Query is too large to be spell checked, only checking first {} chars", MAX_QUERY_SIZE);
            query.get(0..MAX_QUERY_SIZE).unwrap()
            // return None;
        } else {
            query
        }
    };*/

    // TODO: look into how 'wc -w' counts words and copy how it splits things
    let query_flattened = prepare(query);
    let words = query_flattened
        .split_whitespace()
        .filter(|word| word.len() > 0)
        // .filter(|word|)
        .collect::<Vec<_>>();

    error!("Words in query: {}", words.len());

    if (words.len() > MAX_QUERY_WORDS) {
        error!("{} is too many words in query to spell check", words.len());
        // return None;
    }

    let mut distances: SpellCheckResults = vec![];
    for qword in words {
        // error!("Word: {}", qword);
        // error!("is known: {:?}", KNOWN_WORDS.binary_search(&qword));
        if KNOWN_WORDS.binary_search(&qword).is_ok() {
            // error!("Exact word match: {}", qword);
        } else {
            let mut cache = MATCH_CACHE.lock().unwrap();
            if cache.contains_key(qword) {
                // We don't need to tell the user if there is no suggestion for an unknown word
                if (cache.get(qword).unwrap().is_some()) {
                    // TODO: don't push duplicate misspelled words
                    distances.push(SpellCheckResult{orig: qword.to_owned(), correction: cache.get(qword).unwrap().unwrap()});
                }
            } else {
                let closest_match = KNOWN_WORDS.iter()
                    .map(|kword| (kword, levenshtein_distance(&qword, &kword)))
                    .min_by(|a, b| a.1.cmp(&b.1)).unwrap();

                assert!(closest_match.1 > 0, "Found exact match not caught by binary search, is the word database properly sorted?");

                if closest_match.1 <= MAX_DISTANCE {
                    cache.insert(qword.to_owned(), Some(*closest_match.0));
                    distances.push(SpellCheckResult{orig: qword.to_owned(), correction: *closest_match.0});
                } else {
                    // even though there is no close enough match, cache it anyway so that it doesn't have to be looked up every time
                    cache.insert(qword.to_owned(), None);
                }
            }
        }
        // error!("End");
    }
    error!("Spell check results:");
    for word in &distances {
        debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
    }

    if distances.len() > 0 {
        Some(distances)
    } else {
        None
    }

    /* let distances = prepare(query).split_whitespace()
        .filter(|qword| qword.len() > 0)
        .map(|qword| qword.to_lowercase())
        .map(
            |qword| {
                let mut exact_match = false;
                KNOWN_WORDS.iter()
                    .map(|kword| kword.to_lowercase())
                    .map(
                        |kword|
                            (qword.clone(), kword.clone(), levenshtein_distance(&qword, &kword)))
                    // totally isn't jank at all and is the best solution totally
                    .take_while(|val| if exact_match {false} else if val.2 == 0 {exact_match = true; true} else {true})
                    // .map(|val| {error!("Val: {:?}", val); val})
                    .min_by(|a, b| a.2.cmp(&b.2)).unwrap()
            })/*.filter_map(|word| word)*/.filter(|word| word.2 > 0 && word.2 <= MAX_DISTANCE)/*.filter(|(_, _, dist)| *dist > 0 && *dist <= MAX_DISTANCE)*/.map(|word| SpellCheckResult{orig: word.0, correction: word.1.to_owned().to_owned()})/*.filter(|(_, _, d)| *d > 0)*/
        .map(|word| {
            debug!("instead of '{}' did you mean '{}'?", word.orig, word.correction);
            word
        })
        .collect::<Vec<_>>();
    /*for word in &distances {
        debug!("instead of '{}' did you mean '{}'? (distance of )", word.0, word.1/*, word.2*/);
    }*/

    if distances.len() > 0 {
        Some(distances)
    } else {
        None
    }*/
    // None
    // vec![]
}

// TODO: handle symbols better, probably with a regex
fn prepare(s: &str) -> String {
    s.replace("\"", "")
        .replace(",", " ")
        .replace(":", " ")
        .replace(".", " ")
        .replace("/", " ")
        .replace("&", " ")
        .replace("!", " ")
        .replace("?", " ")
        /*.replace("'", "")*/
        .replace("0", "")
        .replace("1", "")
        .replace("2", "")
        .replace("3", "")
        .replace("4", "")
        .replace("5", "")
        .replace("6", "")
        .replace("7", "")
        .replace("8", "")
        .replace("9", "")
        .to_lowercase()
}

// cost of 2 for add/remove, cost of 1 for replace
fn levenshtein_distance(a: &str, other: &str) -> usize {
    // debug!("Self: '{}', Other: '{}'", a, other);
    // let mut dist: &mut [usize; MAX_WORD_SIZE] = &mut [0usize; MAX_WORD_SIZE];
    // let mut dist_prev: &mut [usize; MAX_WORD_SIZE] = &mut [0usize; MAX_WORD_SIZE];

    let mut dist = vec![0usize; other.len() + 1];
    let mut dist_prev = vec![0usize; other.len() + 1];

    for i in 0..=other.len() {
        dist_prev[i] = i;
    }

    for i in 1..=a.len() {
        dist[0] = i;

        for j in 1..=other.len() {
            if a.get(i - 1..i).unwrap() == other.get(j - 1..j).unwrap() {
                dist[j] = dist_prev[j - 1];
            } else {
                // TODO: make addition/subtraction 1 more expensive than replacement, presumably by adding '+ 1' to 2/3 of these
                // motivation: honex from bee movie script is turned into hone instead of honey, this will also generally improve results & is what wikipedia says to do (best reason)
                dist[j] = 1 + cmp::min(
                    *dist.get(j - 1).unwrap() + 1,
                    cmp::min(*dist_prev.get(j).unwrap() + 1, *dist_prev.get(j - 1).unwrap()));
            }
        }
        mem::swap(&mut dist, &mut dist_prev);
    }
    dist_prev[other.len()]

    /*let mut distances = vec![vec![0usize; other.len() + 1]; a.len() + 1];
    for i in 1..=a.len() {
        distances[i][0] = i;
    }

    for j in 1..=other.len() {
        distances[0][j] = j;
    }

    /*unsafe {
        for i in 1..=a.len() {
            for j in 1..=other.len() {
                if *a.get_unchecked(i - 1..i) == *other.get_unchecked(j - 1..j) {
                    // 0
                    distances[i][j] = *distances.get_unchecked(i - 1).get_unchecked(j - 1);
                } else {
                    // 1
                    distances[i][j] = 1 + cmp::min(
                        (*distances.get_unchecked(i - 1).get_unchecked(j - 1)),
                        cmp::min(
                            (*distances.get_unchecked(i - 1).get_unchecked(j)),
                            (*distances.get_unchecked(i).get_unchecked(j - 1))
                        )
                    );
                }
            }
        }
    }*/

    for i in 1..=a.len() {
        for j in 1..=other.len() {
            if a.get(i - 1..i).unwrap() == other.get(j - 1..j).unwrap() {
                // 0
                distances[i][j] = *distances.get(i - 1).unwrap().get(j - 1).unwrap();
            } else {
                // 1
                distances[i][j] = 1 + cmp::min(
                    (*distances.get(i - 1).unwrap().get(j - 1).unwrap()),
                    cmp::min(
                        (*distances.get(i - 1).unwrap().get(j).unwrap()),
                        (*distances.get(i).unwrap().get(j - 1).unwrap())
                    )
                );
            }
        }
    }
    *distances.get(a.len()).unwrap().get(other.len()).unwrap()*/
}
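check() works per word: normalize with prepare, split on whitespace, accept exact hits via binary search over KNOWN_WORDS, and otherwise suggest the nearest dictionary word by edit distance, memoizing misses in MATCH_CACHE. A toy sketch of that per-word lookup, reusing levenshtein_distance and MAX_DISTANCE from this file (the three-word dictionary is invented for illustration):

    static TOY_WORDS: &[&str] = &["hello", "honey", "world"]; // must be sorted for binary_search

    // Exact hits return None (nothing to correct); misses fall back to a
    // linear minimum-edit-distance scan, as check() does for each word.
    fn correct(word: &str) -> Option<&'static str> {
        if TOY_WORDS.binary_search(&word).is_ok() {
            return None;
        }
        TOY_WORDS.iter()
            .map(|kword| (*kword, levenshtein_distance(word, kword)))
            .min_by_key(|&(_, dist)| dist)
            .filter(|&(_, dist)| dist <= MAX_DISTANCE)
            .map(|(kword, _)| kword)
    }

    // correct("honex") == Some("honey"); correct("hello") == None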
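Under the final cost model (1 for a substitution, 2 for an insertion or deletion, the two '+ 1' terms above), "honex" is distance 1 from "honey" (one substitution) but distance 2 from "hone" (one deletion), which is exactly the behavior the TODO comment in levenshtein_distance asks for. Some concrete values, assuming the function above:

    assert_eq!(levenshtein_distance("honex", "honey"), 1); // substitute x -> y
    assert_eq!(levenshtein_distance("honex", "hone"), 2);  // delete the trailing x
    assert_eq!(levenshtein_distance("abc", "abc"), 0);     // identical strings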