diff --git a/app/models/search.rb b/app/models/search.rb index f81b9a9d81..3b8127ef18 100644 --- a/app/models/search.rb +++ b/app/models/search.rb @@ -1,4 +1,6 @@ module Search + CJK_PATTERN = /\p{Han}|\p{Hiragana}|\p{Katakana}|\p{Hangul}/ + def self.table_name_prefix "search_" end diff --git a/app/models/search/highlighter.rb b/app/models/search/highlighter.rb index d2b53d7720..1a0bfa2715 100644 --- a/app/models/search/highlighter.rb +++ b/app/models/search/highlighter.rb @@ -13,8 +13,14 @@ def highlight(text) result = text.dup terms.each do |term| - result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match| - "#{OPENING_MARK}#{match}#{CLOSING_MARK}" + if term.match?(Search::CJK_PATTERN) + result.gsub!(/(#{Regexp.escape(term)})/) do |match| + "#{OPENING_MARK}#{match}#{CLOSING_MARK}" + end + else + result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match| + "#{OPENING_MARK}#{match}#{CLOSING_MARK}" + end end end @@ -22,22 +28,10 @@ def highlight(text) end def snippet(text, max_words: 20) - words = text.split(/\s+/) - match_index = words.index { |word| terms.any? { |term| word.downcase.include?(term.downcase) } } - - if words.length <= max_words - highlight(text) - elsif match_index - start_index = [ 0, match_index - max_words / 2 ].max - end_index = [ words.length - 1, start_index + max_words - 1 ].min - - snippet_text = words[start_index..end_index].join(" ") - snippet_text = "...#{snippet_text}" if start_index > 0 - snippet_text = "#{snippet_text}..." if end_index < words.length - 1 - - highlight(snippet_text) + if cjk_dominant?(text) + snippet_for_cjk(text, max_chars: max_words * 3) else - text.truncate_words(max_words, omission: "...") + snippet_for_western(text, max_words: max_words) end end @@ -65,4 +59,50 @@ def escape_highlight_marks(html) .gsub(CGI.escapeHTML(CLOSING_MARK), CLOSING_MARK.html_safe) .html_safe end + + def cjk_dominant?(text) + return false if text.length < 3 + + cjk_chars = text.scan(Search::CJK_PATTERN).length + cjk_chars > text.length / 3 + end + + def snippet_for_cjk(text, max_chars:) + match_index = terms.map { |term| text.index(term) }.compact.min + + if text.length <= max_chars + highlight(text) + elsif match_index + start_index = [ 0, match_index - max_chars / 2 ].max + end_index = [ text.length, start_index + max_chars ].min + + snippet_text = text[start_index...end_index] + snippet_text = "...#{snippet_text}" if start_index > 0 + snippet_text = "#{snippet_text}..." if end_index < text.length + + highlight(snippet_text) + else + "#{text[0, max_chars]}..." + end + end + + def snippet_for_western(text, max_words:) + words = text.split(/\s+/) + match_index = words.index { |word| terms.any? { |term| word.downcase.include?(term.downcase) } } + + if words.length <= max_words + highlight(text) + elsif match_index + start_index = [ 0, match_index - max_words / 2 ].max + end_index = [ words.length - 1, start_index + max_words - 1 ].min + + snippet_text = words[start_index..end_index].join(" ") + snippet_text = "...#{snippet_text}" if start_index > 0 + snippet_text = "#{snippet_text}..." if end_index < words.length - 1 + + highlight(snippet_text) + else + text.truncate_words(max_words, omission: "...") + end + end end diff --git a/app/models/search/query.rb b/app/models/search/query.rb index 5fe0829e41..6bf06ef420 100644 --- a/app/models/search/query.rb +++ b/app/models/search/query.rb @@ -33,7 +33,7 @@ def sanitize(terms) end def remove_invalid_search_characters(terms) - terms.gsub(/[^\w"]/, " ") + terms.gsub(/[^\p{L}\p{N}_"]/, " ") end def remove_unbalanced_quotes(terms) diff --git a/app/models/search/record/sqlite.rb b/app/models/search/record/sqlite.rb index ae0d34281b..701de592b1 100644 --- a/app/models/search/record/sqlite.rb +++ b/app/models/search/record/sqlite.rb @@ -2,26 +2,17 @@ module Search::Record::SQLite extend ActiveSupport::Concern included do - attribute :result_title, :string - attribute :result_content, :string - has_one :search_records_fts, -> { with_rowid }, class_name: "Search::Record::SQLite::Fts", foreign_key: :rowid, primary_key: :id, dependent: :destroy after_save :upsert_to_fts5_table - scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", query) } + scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", Search::Stemmer.stem(query.to_s)) } end class_methods do def search_fields(query) - opening_mark = connection.quote(Search::Highlighter::OPENING_MARK) - closing_mark = connection.quote(Search::Highlighter::CLOSING_MARK) - ellipsis = connection.quote(Search::Highlighter::ELIPSIS) - - [ "highlight(search_records_fts, 0, #{opening_mark}, #{closing_mark}) AS result_title", - "snippet(search_records_fts, 1, #{opening_mark}, #{closing_mark}, #{ellipsis}, 20) AS result_content", - "#{connection.quote(query.terms)} AS query" ] + "#{connection.quote(query.terms)} AS query" end def for(account_id) @@ -30,28 +21,28 @@ def for(account_id) end def card_title - escape_fts_highlight(result_title || card.title) + highlight(card.title, show: :full) if card_id end def card_description - escape_fts_highlight(result_content) unless comment + highlight(card.description.to_plain_text, show: :snippet) if card_id end def comment_body - escape_fts_highlight(result_content) if comment + highlight(comment.body.to_plain_text, show: :snippet) if comment end private - def escape_fts_highlight(html) - return nil unless html.present? - - CGI.escapeHTML(html) - .gsub(CGI.escapeHTML(Search::Highlighter::OPENING_MARK), Search::Highlighter::OPENING_MARK) - .gsub(CGI.escapeHTML(Search::Highlighter::CLOSING_MARK), Search::Highlighter::CLOSING_MARK) - .html_safe + def highlight(text, show:) + if text.present? && attribute?(:query) + highlighter = Search::Highlighter.new(query) + show == :snippet ? highlighter.snippet(text) : highlighter.highlight(text) + else + text + end end def upsert_to_fts5_table - Fts.upsert(id, title, content) + Fts.upsert(id, Search::Stemmer.stem(title), Search::Stemmer.stem(content)) end end diff --git a/app/models/search/stemmer.rb b/app/models/search/stemmer.rb index f6a6c56d44..2ae0e1bf15 100644 --- a/app/models/search/stemmer.rb +++ b/app/models/search/stemmer.rb @@ -5,9 +5,43 @@ module Search::Stemmer def stem(value) if value.present? - value.gsub(/[^\w\s]/, "").split(/\s+/).map { |word| STEMMER.stem(word.downcase) }.join(" ") + tokenize(value).join(" ") else value end end + + private + def tokenize(value) + tokens = [] + current_word = +"" + + value.each_char do |char| + if cjk_character?(char) + if current_word.present? + tokens << stem_word(current_word) + current_word = +"" + end + tokens << char + elsif char =~ /[\p{L}\p{N}_]/ + current_word << char + else + if current_word.present? + tokens << stem_word(current_word) + current_word = +"" + end + end + end + + tokens << stem_word(current_word) if current_word.present? + tokens + end + + def cjk_character?(char) + char.match?(Search::CJK_PATTERN) + end + + def stem_word(word) + STEMMER.stem(word.downcase) + end end diff --git a/test/models/card/searchable_test.rb b/test/models/card/searchable_test.rb index d6dcdf60cb..8c704e737c 100644 --- a/test/models/card/searchable_test.rb +++ b/test/models/card/searchable_test.rb @@ -44,7 +44,8 @@ class Card::SearchableTest < ActiveSupport::TestCase if search_record_class.connection.adapter_name == "SQLite" fts_entry = search_record.search_records_fts assert_not_nil fts_entry, "FTS entry should exist" - assert_equal card.title, fts_entry.title + # FTS stores stemmed content for search matching, not original text + assert_equal Search::Stemmer.stem(card.title), fts_entry.title end # Delete the card diff --git a/test/models/search/highlighter_test.rb b/test/models/search/highlighter_test.rb index a065c4da19..ce973d685d 100644 --- a/test/models/search/highlighter_test.rb +++ b/test/models/search/highlighter_test.rb @@ -82,6 +82,53 @@ class Search::HighlighterTest < ActiveSupport::TestCase assert_equal "<script>#{mark('test')}</script>", result end + test "highlight CJK text" do + highlighter = Search::Highlighter.new("中文") + result = highlighter.highlight("这是中文测试") + + assert_equal "这是#{mark('中文')}测试", result + end + + test "highlight Japanese text" do + highlighter = Search::Highlighter.new("日本") + result = highlighter.highlight("これは日本語です") + + assert_equal "これは#{mark('日本')}語です", result + end + + test "highlight Korean text" do + highlighter = Search::Highlighter.new("한국") + result = highlighter.highlight("이것은 한국어입니다") + + assert_equal "이것은 #{mark('한국')}어입니다", result + end + + test "highlight mixed CJK and English" do + highlighter = Search::Highlighter.new("test 中文") + result = highlighter.highlight("This is a test about 中文内容") + + assert_equal "This is a #{mark('test')} about #{mark('中文')}内容", result + end + + test "snippet handles CJK text without spaces" do + highlighter = Search::Highlighter.new("中文") + text = "这是一段很长的中文文本用于测试摘要功能是否正常工作" + result = highlighter.snippet(text, max_words: 20) + + assert_includes result, mark("中文") + end + + test "snippet truncates long CJK text around match" do + highlighter = Search::Highlighter.new("目标") + # 100+ characters, match in the middle + text = "前面有很多很多很多很多很多的文字内容" + "目标词汇" + "后面也有很多很多很多很多很多的文字内容" + result = highlighter.snippet(text, max_words: 10) # max_chars = 30 + + assert_includes result, mark("目标") + assert result.start_with?("...") + assert result.end_with?("...") + end + private def mark(text) "#{Search::Highlighter::OPENING_MARK}#{text}#{Search::Highlighter::CLOSING_MARK}" diff --git a/test/models/search/stemmer_test.rb b/test/models/search/stemmer_test.rb index 858ed2dca1..fb24df0914 100644 --- a/test/models/search/stemmer_test.rb +++ b/test/models/search/stemmer_test.rb @@ -12,4 +12,34 @@ class Search::StemmerTest < ActiveSupport::TestCase assert_equal "test run jump walk", result end + + test "stem CJK characters" do + result = Search::Stemmer.stem("测试中文") + + assert_equal "测 试 中 文", result + end + + test "stem Japanese characters" do + result = Search::Stemmer.stem("日本語テスト") + + assert_equal "日 本 語 テ ス ト", result + end + + test "stem Korean characters" do + result = Search::Stemmer.stem("한국어테스트") + + assert_equal "한 국 어 테 스 트", result + end + + test "stem mixed CJK and English" do + result = Search::Stemmer.stem("hello世界test") + + assert_equal "hello 世 界 test", result + end + + test "stem mixed with English stemming" do + result = Search::Stemmer.stem("running 测试 jumping") + + assert_equal "run 测 试 jump", result + end end