forked from asklyphe-public/asklyphe
62 lines
2.9 KiB
Rust
62 lines
2.9 KiB
Rust
|
/*
|
||
|
* searchservice hacks.rs
|
||
|
* - awful awful solutions to our issues
|
||
|
*
|
||
|
* Copyright (C) 2025 Real Microsoft, LLC
|
||
|
*
|
||
|
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3.
|
||
|
*
|
||
|
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
use std::ops::{ RangeInclusive};
|
||
|
|
||
|
// Unicode ranges for scripts that are normally written without spaces between
// words. Text containing any of these code points cannot be tokenised by
// splitting on whitespace.

/// Myanmar (Burmese) letters, digits and punctuation.
/// NOTE(review): the Unicode Myanmar block continues to U+109F; confirm that
/// stopping at U+104F is intentional.
const BURMESE_RANGE: RangeInclusive<char> = '\u{1000}'..='\u{104f}';
/// CJK Unified Ideographs.
const CHINESE_RANGE1: RangeInclusive<char> = '\u{4e00}'..='\u{9fff}';
/// CJK Unified Ideographs Extension A.
const CHINESE_RANGE2: RangeInclusive<char> = '\u{3400}'..='\u{4dbf}';
/// CJK Unified Ideographs Extension B.
const CHINESE_RANGE3: RangeInclusive<char> = '\u{20000}'..='\u{2a6df}';
/// CJK Unified Ideographs Extension C.
const CHINESE_RANGE4: RangeInclusive<char> = '\u{2A700}'..='\u{2B73F}';
/// CJK Unified Ideographs Extension D.
const CHINESE_RANGE5: RangeInclusive<char> = '\u{2B740}'..='\u{2B81F}';
/// CJK Unified Ideographs Extension E.
const CHINESE_RANGE6: RangeInclusive<char> = '\u{2B820}'..='\u{2CEAF}';
/// CJK Unified Ideographs Extension F.
const CHINESE_RANGE7: RangeInclusive<char> = '\u{2CEB0}'..='\u{2EBEF}';
/// CJK Unified Ideographs Extension G.
const CHINESE_RANGE8: RangeInclusive<char> = '\u{30000}'..='\u{3134F}';
/// CJK Unified Ideographs Extension H.
const CHINESE_RANGE9: RangeInclusive<char> = '\u{31350}'..='\u{323AF}';
/// CJK Unified Ideographs Extension I.
const CHINESE_RANGE10: RangeInclusive<char> = '\u{2EBF0}'..='\u{2EE5F}';
/// CJK Compatibility Ideographs.
const CHINESE_RANGE11: RangeInclusive<char> = '\u{F900}'..='\u{FAFF}';
/// HIRAGANA
const JAPANESE_RANGE1: RangeInclusive<char> = '\u{3040}'..='\u{309F}';
/// KATAKANA
const JAPANESE_RANGE2: RangeInclusive<char> = '\u{30A0}'..='\u{30FF}';
/// Javanese.
const JAVANESE_RANGE: RangeInclusive<char> = '\u{A980}'..='\u{A9DF}';
/// Khmer.
const KHMER_RANGE1: RangeInclusive<char> = '\u{1780}'..='\u{17FF}';
/// Khmer Symbols.
const KHMER_RANGE2: RangeInclusive<char> = '\u{19E0}'..='\u{19FF}';
/// Lao.
const LAO_RANGE: RangeInclusive<char> = '\u{0E80}'..='\u{0EFF}';
/// Phags-pa.
const PHAGSPA_RANGE: RangeInclusive<char> = '\u{A840}'..='\u{A87F}';
/// Tai Tham.
const TAITHAM_RANGE: RangeInclusive<char> = '\u{1A20}'..='\u{1AAF}';
/// Thai. The block ends at U+0E7F; the previous upper bound of U+E07F was a
/// digit transposition that made this range span from Thai all the way into
/// the Private Use Area, so Hangul, general punctuation (e.g. U+2014) and many
/// other unrelated characters were matched.
const THAI_RANGE: RangeInclusive<char> = '\u{0E00}'..='\u{0E7F}';
/// Tibetan.
const TIBETAN_RANGE: RangeInclusive<char> = '\u{0F00}'..='\u{0FFF}';

/// Every range consulted by
/// [`is_from_language_that_doesnt_use_word_separators`].
const NO_WORD_BOUNDRIES: &[RangeInclusive<char>] = &[
    BURMESE_RANGE,
    CHINESE_RANGE1,
    CHINESE_RANGE2,
    CHINESE_RANGE3,
    CHINESE_RANGE4,
    CHINESE_RANGE5,
    CHINESE_RANGE6,
    CHINESE_RANGE7,
    CHINESE_RANGE8,
    CHINESE_RANGE9,
    CHINESE_RANGE10,
    CHINESE_RANGE11,
    JAPANESE_RANGE1,
    JAPANESE_RANGE2,
    JAVANESE_RANGE,
    KHMER_RANGE1,
    KHMER_RANGE2,
    LAO_RANGE,
    PHAGSPA_RANGE,
    TAITHAM_RANGE,
    THAI_RANGE,
    TIBETAN_RANGE,
];

/// Returns `true` if `str` contains at least one character that falls inside
/// any range listed in `NO_WORD_BOUNDRIES`.
///
/// A single matching character is enough, so mixed-script input such as
/// `"hello 世界"` yields `true`. The empty string yields `false`.
pub fn is_from_language_that_doesnt_use_word_separators(str: &str) -> bool {
    str.chars()
        .any(|c| NO_WORD_BOUNDRIES.iter().any(|range| range.contains(&c)))
}
|