Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/models/search.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
module Search
# Matches one CJK character: anything in the Han, Hiragana, Katakana or
# Hangul Unicode scripts. Unanchored, so it also answers "does this string
# contain any CJK character?" when used with match?.
CJK_PATTERN = /[\p{Han}\p{Hiragana}\p{Katakana}\p{Hangul}]/

# Prefix returned for table names of models in this namespace.
def self.table_name_prefix = "search_"
Expand Down
10 changes: 8 additions & 2 deletions app/models/search/highlighter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,14 @@ def highlight(text)
result = text.dup

terms.each do |term|
result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match|
"#{OPENING_MARK}#{match}#{CLOSING_MARK}"
if term.match?(Search::CJK_PATTERN)
result.gsub!(/(#{Regexp.escape(term)})/i) do |match|
"#{OPENING_MARK}#{match}#{CLOSING_MARK}"
end
else
result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match|
"#{OPENING_MARK}#{match}#{CLOSING_MARK}"
end
end
end

Expand Down
2 changes: 1 addition & 1 deletion app/models/search/query.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def sanitize(terms)
end

# Replaces every character that is not a Unicode letter, a Unicode number,
# an underscore or a double quote with a single space.
#
# Unicode property classes are used rather than \w because \w only covers
# ASCII word characters in Ruby and would blank out CJK (and any other
# non-ASCII) text.
#
# terms - the raw search String.
#
# Returns a new String; the argument is not modified.
def remove_invalid_search_characters(terms)
  terms.gsub(/[^\p{L}\p{N}_"]/, " ")
end

def remove_unbalanced_quotes(terms)
Expand Down
8 changes: 7 additions & 1 deletion app/models/search/record/sqlite.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ module Search::Record::SQLite
has_one :search_records_fts, -> { with_rowid },
class_name: "Search::Record::SQLite::Fts", foreign_key: :rowid, primary_key: :id, dependent: :destroy

before_save :stem_content
after_save :upsert_to_fts5_table

scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", query) }
scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", Search::Stemmer.stem(query.to_s)) }
end

class_methods do
Expand Down Expand Up @@ -42,6 +43,11 @@ def comment_body
end

private
# Registered as a before_save callback. Rewrites title and content with
# their Search::Stemmer.stem form, touching only the attributes that the
# corresponding *_changed? predicate reports as changed.
def stem_content
  if title_changed?
    self.title = Search::Stemmer.stem(title)
  end

  if content_changed?
    self.content = Search::Stemmer.stem(content)
  end
end

def escape_fts_highlight(html)
return nil unless html.present?

Expand Down
36 changes: 35 additions & 1 deletion app/models/search/stemmer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,43 @@ module Search::Stemmer

# Converts text into a single space-separated string of index tokens.
#
# The work is delegated to tokenize, which emits each CJK character as its
# own token and lowercases and stems every other run of word characters.
#
# value - the text to process. When value.present? is false (nil, empty or
#         blank) it is handed back untouched.
#
# Returns the joined token String, or value itself when it is not present.
def stem(value)
  return value unless value.present?

  tokenize(value).join(" ")
end

private
# Splits value into an ordered Array of tokens.
#
# Characters are classified one at a time:
#   * a CJK character becomes a token of its own, unmodified;
#   * consecutive letters, numbers and underscores form one word, which is
#     passed through stem_word;
#   * anything else only separates tokens and is dropped.
def tokenize(value)
  runs = value.each_char.chunk do |char|
    if cjk_character?(char)
      :_alone       # chunk emits every such element as a one-element run
    elsif char.match?(/[\p{L}\p{N}_]/)
      :word
    else
      :_separator   # chunk discards these and ends the current run
    end
  end

  runs.map do |kind, chars|
    kind == :word ? stem_word(chars.join) : chars.first
  end
end

# True when char matches Search::CJK_PATTERN.
def cjk_character?(char)
  Search::CJK_PATTERN.match?(char)
end

# Lowercases word and returns whatever STEMMER.stem produces for it.
def stem_word(word)
  lowercased = word.downcase
  STEMMER.stem(lowercased)
end
end
28 changes: 28 additions & 0 deletions test/models/search/highlighter_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,34 @@ class Search::HighlighterTest < ActiveSupport::TestCase
assert_equal "&lt;script&gt;#{mark('test')}&lt;/script&gt;", result
end

# A Chinese term is marked where it occurs inside unspaced Chinese text.
test "highlight Chinese characters" do
  highlighted = Search::Highlighter.new("测试").highlight("这是一个测试文本")
  expected = "这是一个#{mark('测试')}文本"

  assert_equal expected, highlighted
end

# A Katakana term is marked inside a sentence that continues in Hiragana.
test "highlight Japanese characters" do
  highlighted = Search::Highlighter.new("テスト").highlight("これはテストです")
  expected = "これは#{mark('テスト')}です"

  assert_equal expected, highlighted
end

# Only the Hangul term itself is marked; the characters that directly
# follow it stay outside the mark.
test "highlight Korean characters" do
  highlighted = Search::Highlighter.new("테스트").highlight("이것은 테스트입니다")
  expected = "이것은 #{mark('테스트')}입니다"

  assert_equal expected, highlighted
end

# With one English and one CJK term in the query, each is marked on its own.
test "highlight mixed CJK and English" do
  highlighted = Search::Highlighter.new("world 世界").highlight("hello world 你好世界")
  expected = "hello #{mark('world')} 你好#{mark('世界')}"

  assert_equal expected, highlighted
end

private
def mark(text)
"#{Search::Highlighter::OPENING_MARK}#{text}#{Search::Highlighter::CLOSING_MARK}"
Expand Down
57 changes: 57 additions & 0 deletions test/models/search/query_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
require "test_helper"

# Checks which characters are still present in Search::Query#terms after a
# query has been wrapped and validated.
class Search::QueryTest < ActiveSupport::TestCase
  setup do
    Current.account = @account = accounts(:"37s")
  end

  test "sanitize preserves ASCII words" do
    assert_equal "hello world", build_query("hello world").terms
  end

  test "sanitize preserves Chinese characters" do
    assert_equal "测试文本", build_query("测试文本").terms
  end

  test "sanitize preserves Japanese characters" do
    assert_equal "テスト", build_query("テスト").terms
  end

  test "sanitize preserves Korean characters" do
    assert_equal "테스트", build_query("테스트").terms
  end

  test "sanitize preserves mixed CJK and English" do
    assert_equal "hello 世界 test", build_query("hello 世界 test").terms
  end

  test "sanitize removes special characters but preserves CJK" do
    assert_equal "测试 文本", build_query("测试@文本").terms
  end

  test "sanitize preserves quoted phrases with CJK" do
    assert_equal '"你好世界"', build_query('"你好世界"').terms
  end

  private
    # Wraps the raw terms in a Search::Query, runs validate on it and
    # returns the query object.
    def build_query(terms)
      Search::Query.wrap(terms).tap(&:validate)
    end
end
36 changes: 36 additions & 0 deletions test/models/search/stemmer_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,40 @@ class Search::StemmerTest < ActiveSupport::TestCase

assert_equal "test run jump walk", result
end

# Each Han character becomes its own space-separated token.
test "split Chinese characters for FTS indexing" do
  assert_equal "测 试", Search::Stemmer.stem("测试")
end

# Each Katakana character becomes its own space-separated token.
test "split Japanese characters for FTS indexing" do
  assert_equal "テ ス ト", Search::Stemmer.stem("テスト")
end

# Each Hangul syllable becomes its own space-separated token.
test "split Korean characters for FTS indexing" do
  assert_equal "테 스 트", Search::Stemmer.stem("테스트")
end

# English words are stemmed while the CJK characters between them are split.
test "mixed CJK and English" do
  assert_equal "run 测 试 test", Search::Stemmer.stem("running 测试 test")
end

# A switch between Latin and CJK text ends the current word even when no
# whitespace separates them.
test "mixed CJK and English without spaces" do
  assert_equal "hello 世 界 test", Search::Stemmer.stem("hello世界test")
end

# The ideographic full stop produces no token of its own.
test "CJK punctuation is treated as separator" do
  assert_equal "你 好 世 界", Search::Stemmer.stem("你好。世界")
end
end