Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/models/search.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
module Search
CJK_PATTERN = /\p{Han}|\p{Hiragana}|\p{Katakana}|\p{Hangul}/

# Returns the literal prefix "search_".
# NOTE(review): presumably picked up by Active Record as the table-name prefix
# for models namespaced under Search — confirm against the framework docs.
def self.table_name_prefix
"search_"
end
Expand Down
74 changes: 57 additions & 17 deletions app/models/search/highlighter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,25 @@ def highlight(text)
result = text.dup

terms.each do |term|
result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match|
"#{OPENING_MARK}#{match}#{CLOSING_MARK}"
if term.match?(Search::CJK_PATTERN)
result.gsub!(/(#{Regexp.escape(term)})/) do |match|
"#{OPENING_MARK}#{match}#{CLOSING_MARK}"
end
else
result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match|
"#{OPENING_MARK}#{match}#{CLOSING_MARK}"
end
end
end

escape_highlight_marks(result)
end

# Builds a highlighted excerpt of +text+.
#
# The superseded inline implementation that was still interleaved here is
# dropped; the same word-window logic is implemented by snippet_for_western.
#
# text      - String to excerpt.
# max_words - Integer word budget for the excerpt (default 20).
#
# Returns whatever the selected snippet_for_* helper returns.
def snippet(text, max_words: 20)
  if cjk_dominant?(text)
    # The CJK helper takes a character budget rather than a word budget,
    # so the word budget is scaled by three here.
    snippet_for_cjk(text, max_chars: max_words * 3)
  else
    snippet_for_western(text, max_words: max_words)
  end
end

Expand Down Expand Up @@ -65,4 +59,50 @@ def escape_highlight_marks(html)
.gsub(CGI.escapeHTML(CLOSING_MARK), CLOSING_MARK.html_safe)
.html_safe
end

# Tells whether CJK characters make up more than a third of +text+.
#
# Texts shorter than three characters are never considered CJK-dominant.
# The threshold uses integer division of the text length.
def cjk_dominant?(text)
  total = text.length
  return false if total < 3

  cjk_count = text.each_char.count { |char| char.match?(Search::CJK_PATTERN) }
  cjk_count > total / 3
end

# Builds a character-based excerpt of +text+ centred on the earliest match
# of any search term.
#
# text      - String to excerpt.
# max_chars - Integer maximum number of characters taken from +text+
#             (ellipses are added on top of that).
#
# Returns the highlighted text when it fits within +max_chars+, a highlighted
# window around the first match when one is found, or the leading
# +max_chars+ characters followed by "..." when no term occurs in +text+.
def snippet_for_cjk(text, max_chars:)
  # Terms are located case-insensitively so that Latin terms in mixed text
  # are found the same way #highlight and #snippet_for_western match them.
  match_index = terms.map { |term| text.index(/#{Regexp.escape(term)}/i) }.compact.min

  if text.length <= max_chars
    highlight(text)
  elsif match_index
    # Start half a window before the match, clamped to the text bounds.
    start_index = [ 0, match_index - max_chars / 2 ].max
    end_index = [ text.length, start_index + max_chars ].min

    snippet_text = text[start_index...end_index]
    snippet_text = "...#{snippet_text}" if start_index > 0
    snippet_text = "#{snippet_text}..." if end_index < text.length

    highlight(snippet_text)
  else
    "#{text[0, max_chars]}..."
  end
end

# Builds a word-based excerpt of +text+ around the first word containing a
# search term (compared case-insensitively).
#
# text      - String to excerpt.
# max_words - Integer maximum number of words in the excerpt.
#
# Returns the highlighted text when it has at most +max_words+ words, a
# highlighted window of words around the first match when there is one, or
# the text truncated to +max_words+ words when nothing matches.
def snippet_for_western(text, max_words:)
  words = text.split(/\s+/)
  hit = words.find_index do |word|
    lowered = word.downcase
    terms.any? { |term| lowered.include?(term.downcase) }
  end

  return highlight(text) if words.length <= max_words
  return text.truncate_words(max_words, omission: "...") if hit.nil?

  # Window starts half a budget before the match, clamped to the word list.
  first = [ hit - max_words / 2, 0 ].max
  last = [ first + max_words - 1, words.length - 1 ].min

  excerpt = words[first..last].join(" ")
  excerpt = "...#{excerpt}" if first.positive?
  excerpt = "#{excerpt}..." if last < words.length - 1

  highlight(excerpt)
end
end
2 changes: 1 addition & 1 deletion app/models/search/query.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def sanitize(terms)
end

# Replaces every character that is not a Unicode letter, a Unicode number,
# an underscore or a double quote with a single space.
#
# A previous `\w`-based gsub whose result was discarded has been removed;
# only the Unicode-aware substitution ever determined the return value.
#
# terms - String of raw search terms.
#
# Returns a new String; +terms+ itself is not modified.
def remove_invalid_search_characters(terms)
  terms.gsub(/[^\p{L}\p{N}_"]/, " ")
end

def remove_unbalanced_quotes(terms)
Expand Down
35 changes: 13 additions & 22 deletions app/models/search/record/sqlite.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,17 @@ module Search::Record::SQLite
extend ActiveSupport::Concern

included do
  attribute :result_title, :string
  attribute :result_content, :string

  has_one :search_records_fts, -> { with_rowid },
    class_name: "Search::Record::SQLite::Fts", foreign_key: :rowid, primary_key: :id, dependent: :destroy

  after_save :upsert_to_fts5_table

  # Single definition of the scope (an earlier, un-stemmed definition was
  # immediately overwritten by this one). The query is stemmed with
  # Search::Stemmer before being handed to MATCH, mirroring the stemmed text
  # that upsert_to_fts5_table writes. +account_id+ is accepted but not used
  # by this scope.
  scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", Search::Stemmer.stem(query.to_s)) }
end

class_methods do
# Returns the SQL select fragment that exposes the quoted search terms as
# the +query+ column.
#
# The quoted highlight marks, ellipsis and the three-element array that used
# to be built here were discarded before the return value was produced, so
# they have been removed.
#
# query - object responding to #terms.
def search_fields(query)
  "#{connection.quote(query.terms)} AS query"
end

def for(account_id)
Expand All @@ -30,28 +21,28 @@ def for(account_id)
end

# Card title with the search terms highlighted in full (no snippet).
#
# Returns nil when the record has no card_id. The leftover
# escape_fts_highlight call that ran before this guard (and dereferenced
# +card+ unconditionally, with its result discarded) has been removed.
def card_title
  highlight(card.title, show: :full) if card_id
end

# Highlighted snippet of the card description's plain text.
#
# Returns nil when the record has no card_id. The leftover
# escape_fts_highlight(result_content) call, whose result was discarded,
# has been removed.
def card_description
  highlight(card.description.to_plain_text, show: :snippet) if card_id
end

# Highlighted snippet of the comment body's plain text.
#
# Returns nil when the record has no comment. The leftover
# escape_fts_highlight(result_content) call, whose result was discarded,
# has been removed.
def comment_body
  highlight(comment.body.to_plain_text, show: :snippet) if comment
end

private
# HTML-escapes +html+ while leaving the highlighter's opening and closing
# marks unescaped. Returns nil for blank input.
#
# This method was missing its closing `end`, which nested #highlight inside
# it; the two are now separate definitions.
# NOTE(review): this helper looks superseded by #highlight below — confirm
# there are no remaining callers before removing it.
def escape_fts_highlight(html)
  return nil unless html.present?

  CGI.escapeHTML(html)
    .gsub(CGI.escapeHTML(Search::Highlighter::OPENING_MARK), Search::Highlighter::OPENING_MARK)
    .gsub(CGI.escapeHTML(Search::Highlighter::CLOSING_MARK), Search::Highlighter::CLOSING_MARK)
    .html_safe
end

# Highlights the search terms in +text+ using the record's +query+ attribute.
#
# text - String to highlight (may be blank or nil).
# show - :snippet to get an excerpt; any other value highlights the full text.
#
# Returns +text+ unchanged when it is blank or the record has no +query+
# attribute.
def highlight(text, show:)
  if text.present? && attribute?(:query)
    highlighter = Search::Highlighter.new(query)
    show == :snippet ? highlighter.snippet(text) : highlighter.highlight(text)
  else
    text
  end
end

# Writes this record's id together with its stemmed title and content to the
# FTS table through Fts.upsert.
#
# The additional upsert of the un-stemmed title/content that preceded this
# call was redundant (the same id was written again right after) and has
# been removed, so each save issues a single upsert.
def upsert_to_fts5_table
  Fts.upsert(id, Search::Stemmer.stem(title), Search::Stemmer.stem(content))
end
end
36 changes: 35 additions & 1 deletion app/models/search/stemmer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,43 @@ module Search::Stemmer

# Normalises +value+ into a space-separated string of tokens produced by
# #tokenize.
#
# The former one-line gsub/split/stem expression that was still evaluated
# here had its result discarded and has been removed.
#
# value - String to stem; blank values (nil, "") are returned unchanged.
def stem(value)
  if value.present?
    tokenize(value).join(" ")
  else
    value
  end
end

private
# Splits +value+ into search tokens.
#
# Every CJK character becomes its own token. Consecutive letters, numbers
# and underscores are collected into a word that is passed through
# #stem_word when it ends. Any other character only acts as a separator.
#
# Returns an Array of token Strings in input order.
def tokenize(value)
  pending = +""

  # Emits the word collected so far (if any) and starts a fresh buffer.
  flush = lambda do |into|
    next if pending.empty?

    into << stem_word(pending)
    pending = +""
  end

  collected = value.each_char.with_object([]) do |char, tokens|
    if cjk_character?(char)
      flush.call(tokens)
      tokens << char
    elsif char.match?(/[\p{L}\p{N}_]/)
      pending << char
    else
      flush.call(tokens)
    end
  end

  flush.call(collected)
  collected
end

# True when +char+ matches Search::CJK_PATTERN.
def cjk_character?(char)
  pattern = Search::CJK_PATTERN
  char.match?(pattern)
end

# Lower-cases +word+ and passes it to STEMMER.stem, returning the result.
def stem_word(word)
  lowercased = word.downcase
  STEMMER.stem(lowercased)
end
end
3 changes: 2 additions & 1 deletion test/models/card/searchable_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ class Card::SearchableTest < ActiveSupport::TestCase
if search_record_class.connection.adapter_name == "SQLite"
fts_entry = search_record.search_records_fts
assert_not_nil fts_entry, "FTS entry should exist"
assert_equal card.title, fts_entry.title
# FTS stores stemmed content for search matching, not original text
assert_equal Search::Stemmer.stem(card.title), fts_entry.title
end

# Delete the card
Expand Down
47 changes: 47 additions & 0 deletions test/models/search/highlighter_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,53 @@ class Search::HighlighterTest < ActiveSupport::TestCase
assert_equal "&lt;script&gt;#{mark('test')}&lt;/script&gt;", result
end

# A Chinese term is marked wherever it occurs, even without surrounding
# word boundaries.
test "highlight CJK text" do
highlighter = Search::Highlighter.new("中文")
result = highlighter.highlight("这是中文测试")

assert_equal "这是#{mark('中文')}测试", result
end

# Only the searched characters are marked; the following character stays
# outside the mark.
test "highlight Japanese text" do
highlighter = Search::Highlighter.new("日本")
result = highlighter.highlight("これは日本語です")

assert_equal "これは#{mark('日本')}語です", result
end

# Hangul terms are marked as substrings of a longer word.
test "highlight Korean text" do
highlighter = Search::Highlighter.new("한국")
result = highlighter.highlight("이것은 한국어입니다")

assert_equal "이것은 #{mark('한국')}어입니다", result
end

# A query mixing a Latin term and a CJK term marks both in the same text.
test "highlight mixed CJK and English" do
highlighter = Search::Highlighter.new("test 中文")
result = highlighter.highlight("This is a test about 中文内容")

assert_equal "This is a #{mark('test')} about #{mark('中文')}内容", result
end

# The text is 25 characters long, below the 60-character budget derived
# from max_words: 20, so the whole text is returned with the term marked.
test "snippet handles CJK text without spaces" do
highlighter = Search::Highlighter.new("中文")
text = "这是一段很长的中文文本用于测试摘要功能是否正常工作"
result = highlighter.snippet(text, max_words: 20)

assert_includes result, mark("中文")
end

# The excerpt is cut on both sides of the match, so it must start and end
# with an ellipsis.
test "snippet truncates long CJK text around match" do
highlighter = Search::Highlighter.new("目标")
# 41 characters (longer than the 30-character budget), match in the middle
text = "前面有很多很多很多很多很多的文字内容" + "目标词汇" + "后面也有很多很多很多很多很多的文字内容"
result = highlighter.snippet(text, max_words: 10) # max_chars = 30

assert_includes result, mark("目标")
assert result.start_with?("...")
assert result.end_with?("...")
end

private
def mark(text)
"#{Search::Highlighter::OPENING_MARK}#{text}#{Search::Highlighter::CLOSING_MARK}"
Expand Down
30 changes: 30 additions & 0 deletions test/models/search/stemmer_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,34 @@ class Search::StemmerTest < ActiveSupport::TestCase

assert_equal "test run jump walk", result
end

# Each Han character becomes its own space-separated token.
test "stem CJK characters" do
result = Search::Stemmer.stem("测试中文")

assert_equal "测 试 中 文", result
end

# Kanji and katakana characters are likewise split one per token.
test "stem Japanese characters" do
result = Search::Stemmer.stem("日本語テスト")

assert_equal "日 本 語 テ ス ト", result
end

# Hangul syllables are split one per token.
test "stem Korean characters" do
result = Search::Stemmer.stem("한국어테스트")

assert_equal "한 국 어 테 스 트", result
end

# CJK characters end the Latin word before them even without whitespace.
test "stem mixed CJK and English" do
result = Search::Stemmer.stem("hello世界test")

assert_equal "hello 世 界 test", result
end

# Latin words are still stemmed while CJK characters are split.
test "stem mixed with English stemming" do
result = Search::Stemmer.stem("running 测试 jumping")

assert_equal "run 测 试 jump", result
end
end