asklyphe/searchservice/src/hacks.rs

62 lines
2.9 KiB
Rust
Raw Normal View History

2025-03-12 12:32:15 -07:00
/*
* searchservice hacks.rs
* - awful awful solutions to our issues
*
* Copyright (C) 2025 Real Microsoft, LLC
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::ops::{ RangeInclusive};
// Unicode code-point ranges for scripts that are normally written without
// spaces between words. Each constant covers one Unicode block (or part of
// one); they are collected into `NO_WORD_BOUNDRIES` below.

/// Myanmar block, U+1000 through U+104F only (the rest of the block, up to
/// U+109F, is not included here).
const BURMESE_RANGE: RangeInclusive<char> = '\u{1000}'..='\u{104f}';
/// CJK Unified Ideographs.
const CHINESE_RANGE1: RangeInclusive<char> = '\u{4e00}'..='\u{9fff}';
/// CJK Unified Ideographs Extension A.
const CHINESE_RANGE2: RangeInclusive<char> = '\u{3400}'..='\u{4dbf}';
/// CJK Unified Ideographs Extension B.
const CHINESE_RANGE3: RangeInclusive<char> = '\u{20000}'..='\u{2a6df}';
/// CJK Unified Ideographs Extension C.
const CHINESE_RANGE4: RangeInclusive<char> = '\u{2A700}'..='\u{2B73F}';
/// CJK Unified Ideographs Extension D.
const CHINESE_RANGE5: RangeInclusive<char> = '\u{2B740}'..='\u{2B81F}';
/// CJK Unified Ideographs Extension E.
const CHINESE_RANGE6: RangeInclusive<char> = '\u{2B820}'..='\u{2CEAF}';
/// CJK Unified Ideographs Extension F.
const CHINESE_RANGE7: RangeInclusive<char> = '\u{2CEB0}'..='\u{2EBEF}';
/// CJK Unified Ideographs Extension G.
const CHINESE_RANGE8: RangeInclusive<char> = '\u{30000}'..='\u{3134F}';
/// CJK Unified Ideographs Extension H.
const CHINESE_RANGE9: RangeInclusive<char> = '\u{31350}'..='\u{323AF}';
/// CJK Unified Ideographs Extension I.
const CHINESE_RANGE10: RangeInclusive<char> = '\u{2EBF0}'..='\u{2EE5F}';
/// CJK Compatibility Ideographs.
const CHINESE_RANGE11: RangeInclusive<char> = '\u{F900}'..='\u{FAFF}';
/// HIRAGANA
const JAPANESE_RANGE1: RangeInclusive<char> = '\u{3040}'..='\u{309F}';
/// KATAKANA
const JAPANESE_RANGE2: RangeInclusive<char> = '\u{30A0}'..='\u{30FF}';
/// Javanese.
const JAVANESE_RANGE: RangeInclusive<char> = '\u{A980}'..='\u{A9DF}';
/// Khmer.
const KHMER_RANGE1: RangeInclusive<char> = '\u{1780}'..='\u{17FF}';
/// Khmer Symbols.
const KHMER_RANGE2: RangeInclusive<char> = '\u{19E0}'..='\u{19FF}';
/// Lao.
const LAO_RANGE: RangeInclusive<char> = '\u{0E80}'..='\u{0EFF}';
/// Phags-pa.
const PHAGSPA_RANGE: RangeInclusive<char> = '\u{A840}'..='\u{A87F}';
/// Tai Tham.
const TAITHAM_RANGE: RangeInclusive<char> = '\u{1A20}'..='\u{1AAF}';
/// Thai block, U+0E00 through U+0E7F.
///
/// The upper bound was previously written as U+E07F (digits transposed),
/// which made this range span tens of thousands of unrelated code points
/// (Hangul, general punctuation such as curly quotes, Vietnamese letters, ...)
/// and caused ordinary space-separated text to match.
const THAI_RANGE: RangeInclusive<char> = '\u{0E00}'..='\u{0E7F}';
/// Tibetan.
const TIBETAN_RANGE: RangeInclusive<char> = '\u{0F00}'..='\u{0FFF}';

/// Every range above, gathered into one slice so callers can test a character
/// against all of them in a single pass.
const NO_WORD_BOUNDRIES: &[RangeInclusive<char>] = &[
    BURMESE_RANGE,
    CHINESE_RANGE1, CHINESE_RANGE2, CHINESE_RANGE3, CHINESE_RANGE4, CHINESE_RANGE5, CHINESE_RANGE6, CHINESE_RANGE7, CHINESE_RANGE8, CHINESE_RANGE9, CHINESE_RANGE10, CHINESE_RANGE11,
    JAPANESE_RANGE1, JAPANESE_RANGE2,
    JAVANESE_RANGE,
    KHMER_RANGE1, KHMER_RANGE2,
    LAO_RANGE,
    PHAGSPA_RANGE,
    TAITHAM_RANGE,
    THAI_RANGE,
    TIBETAN_RANGE,
];
/// Reports whether `str` contains at least one character that falls inside
/// any of the ranges listed in `NO_WORD_BOUNDRIES`.
///
/// Returns `true` as soon as the first such character is found, and `false`
/// when no character matches (including for an empty string).
pub fn is_from_language_that_doesnt_use_word_separators(str: &str) -> bool {
    str.chars()
        .any(|ch| NO_WORD_BOUNDRIES.iter().any(|script| script.contains(&ch)))
}