diff --git a/pom.xml b/pom.xml index 40b37ef50..b0c57f67c 100644 --- a/pom.xml +++ b/pom.xml @@ -66,6 +66,11 @@ lucene-analyzers-kuromoji ${lucene-version} + + org.apache.lucene + lucene-analyzers-icu + ${lucene-version} + org.apache.lucene lucene-facet diff --git a/src/main/scala/com/cloudant/clouseau/SupportedAnalyzers.scala b/src/main/scala/com/cloudant/clouseau/SupportedAnalyzers.scala index 3e2ae0121..a645a30b5 100644 --- a/src/main/scala/com/cloudant/clouseau/SupportedAnalyzers.scala +++ b/src/main/scala/com/cloudant/clouseau/SupportedAnalyzers.scala @@ -66,6 +66,7 @@ import org.apache.lucene.analysis.tr.TurkishAnalyzer // Extras import org.apache.lucene.analysis.ja.JapaneseTokenizer import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter +import org.apache.lucene.analysis.icu.ICUFoldingFilter import org.apache.lucene.analysis.core.LowerCaseFilter import org.apache.lucene.analysis.core.LetterTokenizer @@ -120,6 +121,13 @@ object SupportedAnalyzers { new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(new LowerCaseFilter(IndexService.version, tokenizer))) } }) + case "simple_icufolding" => + Some(new Analyzer() { + def createComponents(fieldName: String, reader: Reader): TokenStreamComponents = { + val tokenizer: Tokenizer = new LetterTokenizer(IndexService.version, reader); + new TokenStreamComponents(tokenizer, new ICUFoldingFilter(tokenizer)) + } + }) case "arabic" => options.get("stopwords") match { case Some(stopwords: List[String]) => diff --git a/src/test/scala/com/cloudant/clouseau/AnalyzerServiceSpec.scala b/src/test/scala/com/cloudant/clouseau/AnalyzerServiceSpec.scala index e1ad47fc7..2b64d3ae8 100644 --- a/src/test/scala/com/cloudant/clouseau/AnalyzerServiceSpec.scala +++ b/src/test/scala/com/cloudant/clouseau/AnalyzerServiceSpec.scala @@ -30,6 +30,10 @@ class AnalyzerServiceSpec extends SpecificationWithJUnit { "demonstrate simple_asciifolding tokenization" in new analyzer_service { service.handleCall(null, ('analyze, "simple_asciifolding", "Ayşegül Özbayır")) must be equalTo (('ok, List("aysegul", "ozbayir"))) } + + "demonstrate simple_icufolding tokenization" in new analyzer_service { + service.handleCall(null, ('analyze, "simple_icufolding", "Ayşegül Özbayır")) must be equalTo (('ok, List("aysegul", "ozbayir"))) + } } } diff --git a/src/test/scala/com/cloudant/clouseau/SupportedAnalyzersSpec.scala b/src/test/scala/com/cloudant/clouseau/SupportedAnalyzersSpec.scala index 87c6d099f..653bd3391 100644 --- a/src/test/scala/com/cloudant/clouseau/SupportedAnalyzersSpec.scala +++ b/src/test/scala/com/cloudant/clouseau/SupportedAnalyzersSpec.scala @@ -78,6 +78,9 @@ class SupportedAnalyzersSpec extends SpecificationWithJUnit { "simple_asciifolding" in { createAnalyzer("simple_asciifolding") must haveClass[Some[Analyzer]] } + "simple_icufolding" in { + createAnalyzer("simple_icufolding") must haveClass[Some[Analyzer]] + } "email" in { createAnalyzer("email") must haveClass[Some[UAX29URLEmailAnalyzer]] }