# Review notes for the patch below (CJK support for search). The patch text itself is unchanged.
#
# What the patch does, file by file:
# - app/models/search.rb: adds Search::CJK_PATTERN, a regexp matching a character from the Han,
#   Hiragana, Katakana or Hangul scripts.
# - app/models/search/highlighter.rb: a term that contains a CJK_PATTERN match is marked wherever it
#   occurs as a substring (no \b anchors, no \w* suffix); every other term keeps the previous
#   \b(term\w*)\b prefix match. Both branches wrap the match in OPENING_MARK/CLOSING_MARK.
# - app/models/search/query.rb: remove_invalid_search_characters now keeps \p{L}, \p{N}, "_" and the
#   double quote (previously \w and the double quote); every other character becomes a space.
# - app/models/search/record/sqlite.rb: adds a before_save callback (stem_content) that overwrites
#   title/content with Search::Stemmer.stem(...) when the attribute changed, and makes the `matching`
#   scope pass Search::Stemmer.stem(query.to_s) to MATCH instead of the raw query.
# - app/models/search/stemmer.rb: stem now joins the result of a private per-character tokenizer.
#   Each CJK_PATTERN character becomes its own token; runs of \p{L}/\p{N}/"_" are collected into a
#   word that is downcased and passed to STEMMER.stem; any other character ends the current word and
#   is itself dropped.
# - test/models/search/{highlighter,query,stemmer}_test.rb: add cases for Chinese, Japanese, Korean
#   and mixed CJK/English input.
#
# NOTE(review): the removed stem implementation deleted non-word characters (gsub(/[^\w\s]/, ""))
#   and split on whitespace only, so letters on both sides of punctuation stayed in one word; the new
#   tokenizer splits the word at that character instead. Confirm this change for non-CJK text is
#   intended.
# NOTE(review): the `matching` scope now runs the query through the tokenizer, which drops double
#   quotes, while query_test.rb expects Search::Query to preserve quoted CJK phrases. Confirm that
#   phrase queries are not expected to reach MATCH with their quotes intact.
# NOTE(review): `account_id` is accepted by the `matching` lambda but not referenced in the body
#   shown in this patch.
diff --git a/app/models/search.rb b/app/models/search.rb index f81b9a9d81..3b8127ef18 100644 --- a/app/models/search.rb +++ b/app/models/search.rb @@ -1,4 +1,6 @@ module Search + CJK_PATTERN = /\p{Han}|\p{Hiragana}|\p{Katakana}|\p{Hangul}/ + def self.table_name_prefix "search_" end diff --git a/app/models/search/highlighter.rb b/app/models/search/highlighter.rb index d2b53d7720..8672ed3c83 100644 --- a/app/models/search/highlighter.rb +++ b/app/models/search/highlighter.rb @@ -13,8 +13,14 @@ def highlight(text) result = text.dup terms.each do |term| - result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match| - "#{OPENING_MARK}#{match}#{CLOSING_MARK}" + if term.match?(Search::CJK_PATTERN) + result.gsub!(/(#{Regexp.escape(term)})/i) do |match| + "#{OPENING_MARK}#{match}#{CLOSING_MARK}" + end + else + result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match| + "#{OPENING_MARK}#{match}#{CLOSING_MARK}" + end end end diff --git a/app/models/search/query.rb b/app/models/search/query.rb index 5fe0829e41..6bf06ef420 100644 --- a/app/models/search/query.rb +++ b/app/models/search/query.rb @@ -33,7 +33,7 @@ def sanitize(terms) end def remove_invalid_search_characters(terms) - terms.gsub(/[^\w"]/, " ") + terms.gsub(/[^\p{L}\p{N}_"]/, " ") end def remove_unbalanced_quotes(terms) diff --git a/app/models/search/record/sqlite.rb b/app/models/search/record/sqlite.rb index ae0d34281b..73b3b675fa 100644 --- a/app/models/search/record/sqlite.rb +++ b/app/models/search/record/sqlite.rb @@ -8,9 +8,10 @@ module Search::Record::SQLite has_one :search_records_fts, -> { with_rowid }, class_name: "Search::Record::SQLite::Fts", foreign_key: :rowid, primary_key: :id, dependent: :destroy + before_save :stem_content after_save :upsert_to_fts5_table - scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", query) } + scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", 
Search::Stemmer.stem(query.to_s)) } end class_methods do @@ -42,6 +43,11 @@ def comment_body end private + def stem_content + self.title = Search::Stemmer.stem(title) if title_changed? + self.content = Search::Stemmer.stem(content) if content_changed? + end + def escape_fts_highlight(html) return nil unless html.present? diff --git a/app/models/search/stemmer.rb b/app/models/search/stemmer.rb index f6a6c56d44..2ae0e1bf15 100644 --- a/app/models/search/stemmer.rb +++ b/app/models/search/stemmer.rb @@ -5,9 +5,43 @@ module Search::Stemmer def stem(value) if value.present? - value.gsub(/[^\w\s]/, "").split(/\s+/).map { |word| STEMMER.stem(word.downcase) }.join(" ") + tokenize(value).join(" ") else value end end + + private + def tokenize(value) + tokens = [] + current_word = +"" + + value.each_char do |char| + if cjk_character?(char) + if current_word.present? + tokens << stem_word(current_word) + current_word = +"" + end + tokens << char + elsif char =~ /[\p{L}\p{N}_]/ + current_word << char + else + if current_word.present? + tokens << stem_word(current_word) + current_word = +"" + end + end + end + + tokens << stem_word(current_word) if current_word.present? 
+ tokens + end + + def cjk_character?(char) + char.match?(Search::CJK_PATTERN) + end + + def stem_word(word) + STEMMER.stem(word.downcase) + end end diff --git a/test/models/search/highlighter_test.rb b/test/models/search/highlighter_test.rb index a065c4da19..822e4913d6 100644 --- a/test/models/search/highlighter_test.rb +++ b/test/models/search/highlighter_test.rb @@ -82,6 +82,34 @@ class Search::HighlighterTest < ActiveSupport::TestCase assert_equal "<script>#{mark('test')}</script>", result end + test "highlight Chinese characters" do + highlighter = Search::Highlighter.new("测试") + result = highlighter.highlight("这是一个测试文本") + + assert_equal "这是一个#{mark('测试')}文本", result + end + + test "highlight Japanese characters" do + highlighter = Search::Highlighter.new("テスト") + result = highlighter.highlight("これはテストです") + + assert_equal "これは#{mark('テスト')}です", result + end + + test "highlight Korean characters" do + highlighter = Search::Highlighter.new("테스트") + result = highlighter.highlight("이것은 테스트입니다") + + assert_equal "이것은 #{mark('테스트')}입니다", result + end + + test "highlight mixed CJK and English" do + highlighter = Search::Highlighter.new("world 世界") + result = highlighter.highlight("hello world 你好世界") + + assert_equal "hello #{mark('world')} 你好#{mark('世界')}", result + end + private def mark(text) "#{Search::Highlighter::OPENING_MARK}#{text}#{Search::Highlighter::CLOSING_MARK}" diff --git a/test/models/search/query_test.rb b/test/models/search/query_test.rb new file mode 100644 index 0000000000..628b9e69e8 --- /dev/null +++ b/test/models/search/query_test.rb @@ -0,0 +1,57 @@ +require "test_helper" + +class Search::QueryTest < ActiveSupport::TestCase + setup do + @account = accounts(:"37s") + Current.account = @account + end + + test "sanitize preserves ASCII words" do + query = build_query("hello world") + + assert_equal "hello world", query.terms + end + + test "sanitize preserves Chinese characters" do + query = build_query("测试文本") + + assert_equal "测试文本", 
query.terms + end + + test "sanitize preserves Japanese characters" do + query = build_query("テスト") + + assert_equal "テスト", query.terms + end + + test "sanitize preserves Korean characters" do + query = build_query("테스트") + + assert_equal "테스트", query.terms + end + + test "sanitize preserves mixed CJK and English" do + query = build_query("hello 世界 test") + + assert_equal "hello 世界 test", query.terms + end + + test "sanitize removes special characters but preserves CJK" do + query = build_query("测试@文本") + + assert_equal "测试 文本", query.terms + end + + test "sanitize preserves quoted phrases with CJK" do + query = build_query('"你好世界"') + + assert_equal '"你好世界"', query.terms + end + + private + def build_query(terms) + query = Search::Query.wrap(terms) + query.validate + query + end +end diff --git a/test/models/search/stemmer_test.rb b/test/models/search/stemmer_test.rb index 858ed2dca1..e2963ee10e 100644 --- a/test/models/search/stemmer_test.rb +++ b/test/models/search/stemmer_test.rb @@ -12,4 +12,40 @@ class Search::StemmerTest < ActiveSupport::TestCase assert_equal "test run jump walk", result end + + test "split Chinese characters for FTS indexing" do + result = Search::Stemmer.stem("测试") + + assert_equal "测 试", result + end + + test "split Japanese characters for FTS indexing" do + result = Search::Stemmer.stem("テスト") + + assert_equal "テ ス ト", result + end + + test "split Korean characters for FTS indexing" do + result = Search::Stemmer.stem("테스트") + + assert_equal "테 스 트", result + end + + test "mixed CJK and English" do + result = Search::Stemmer.stem("running 测试 test") + + assert_equal "run 测 试 test", result + end + + test "mixed CJK and English without spaces" do + result = Search::Stemmer.stem("hello世界test") + + assert_equal "hello 世 界 test", result + end + + test "CJK punctuation is treated as separator" do + result = Search::Stemmer.stem("你好。世界") + + assert_equal "你 好 世 界", result + end end