Skip to content

Commit

Permalink
日本語判定の調整
Browse files Browse the repository at this point in the history
  • Loading branch information
phoepsilonix committed Nov 6, 2024
1 parent 98b349f commit 260c415
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 5 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ members = [
resolver = "2"

[workspace.package]
version = "0.3.5"
version = "0.3.6"
authors = ["Masato TOYOSHIMA", "phoepsilonix <[email protected]>"]
edition = "2021"
rust-version = "1.82"
Expand Down
9 changes: 7 additions & 2 deletions crates/dict-to-mozc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -528,8 +528,13 @@ fn id_expr(clsexpr: &str, _id_def: &mut IdDef, class_map: &mut MyIndexMap<String
// Matches a non-empty string made up solely of characters in the ranges
// ぁ-ゖ (hiragana) and ァ-ヺ (katakana), or the middle dot ・.
// The prolonged sound mark ー lies outside these ranges, so a string containing it does not match.
// The pattern is a fixed literal; `unwrap()` would only panic if that literal failed to compile.
// NOTE(review): `Lazy` presumably defers compilation to first use — confirm against the imported `Lazy` type.
static KANA_CHECK: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[ぁ-ゖァ-ヺ・]+$").unwrap());
// Disabled variant that additionally accepted the ASCII digits 0-9; kept for reference.
//static EISUU_CHECK: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9' ]+$").unwrap());
// Matches a non-empty string made up solely of ASCII letters, apostrophes and spaces.
// NOTE(review): the name suggests "symbols" (kigou), yet the class holds only letters, `'` and space — confirm intent with the callers.
static KIGOU_CHECK: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-zA-Z' ]+$").unwrap());
//static JAPANESE_CHECK: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[\p{Hiragana}\p{Katakana}\p{Han}\p{Punct}ー\-  0-9]+$").unwrap());
static JAPANESE_CHECK: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[\x{3005}\x{3007}\x{303b}\x{3400}-\x{9FFF}\x{F900}-\x{FAFF}\x{20000}-\x{2FFFF}\p{Hiragana}\p{Katakana}\p{Punct}ー\-  0-9]+$").unwrap());
// Japanese-text test used by the place-name check.
// A word is treated as Japanese when it starts with a kanji, hiragana or katakana character.
// From the second character onward, in addition to kanji, hiragana and katakana,
// the pattern also accepts punctuation (Punct), modifier letters (Lm, which include the
// prolonged sound mark ー), space separators (Zs, which include the full-width space),
// Latin letters, numbers and the like.
// (Allowing arbitrary characters from the second position onward might also be acceptable.)
//
// Pattern layout: `^[first][rest]*$`, so a single leading character is enough to match.
// "Kanji" is spelled out as explicit code points rather than a script class:
// U+3005 (々), U+3007 (〇), U+303B (〻), U+3400-U+9FFF, U+F900-U+FAFF and U+20000-U+2FFFF.
static JAPANESE_CHECK: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[\x{3005}\x{3007}\x{303b}\x{3400}-\x{9FFF}\x{F900}-\x{FAFF}\x{20000}-\x{2FFFF}\p{Hiragana}\p{Katakana}][\x{3005}\x{3007}\x{303b}\x{3400}-\x{9FFF}\x{F900}-\x{FAFF}\x{20000}-\x{2FFFF}\p{Hiragana}\p{Katakana}\p{Lm}\p{Punct}\p{Zs}\p{Latin}\p{Number}]*$").unwrap());

fn is_kana(str: &str) -> bool {
KANA_CHECK.is_match(&str)
Expand Down

0 comments on commit 260c415

Please sign in to comment.