diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4730f28..09a29cd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - es_version: [8.9.2, 8.15.2] + es_version: [8.15.2] steps: - name: Checkout project sources uses: actions/checkout@v2 diff --git a/.gitignore b/.gitignore index a1fc39c..93c83dd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ .gradle /build/ +/bin/ +*.class + # Ignore Gradle GUI config gradle-app.setting diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0b44ffc --- /dev/null +++ b/Makefile @@ -0,0 +1,12 @@ +ES_VERSION ?= 8.15.2 +JAVA_HOME ?= /usr/lib/jvm/java-17-openjdk-amd64 + +.PHONY: all build + +all: build + +build: + @echo "Building with Elasticsearch version $(ES_VERSION)" + @export JAVA_HOME=$(JAVA_HOME) && \ + export PATH=$$JAVA_HOME/bin:$$PATH && \ + ./gradlew build -Pelasticsearch.version=$(ES_VERSION) diff --git a/README.md b/README.md index e5e8570..d4e9b23 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,17 @@ in the [release](https://github.com/monitora-media/es-utils/releases/latest). 
## Build - ES_VERSION=8.9.2 + ES_VERSION=8.15.2 ./gradlew build -Pelasticsearch.version=$ES_VERSION + export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + export PATH=$JAVA_HOME/bin:$PATH + JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 PATH=$JAVA_HOME/bin:$PATH ./gradlew build -Pelasticsearch.version=8.15.2 + + ## Testing - gradle test --debug + gradle test --info --tests "Croatian*" ## Install diff --git a/gradle.properties b/gradle.properties index 0686d99..25c21ef 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,2 +1,2 @@ -elasticsearch.version=8.5.3 +elasticsearch.version=8.15.2 plugin.version=1.2.0-SNAPSHOT diff --git a/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java b/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java index f90006b..84a3b8b 100644 --- a/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java +++ b/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java @@ -1,5 +1,6 @@ package cz.monitora.elasticsearch; +import cz.monitora.elasticsearch.analyzer.croatian.CroatianStemFilterFactory; import cz.monitora.elasticsearch.analyzer.czech.CzechStemFilterFactory; import cz.monitora.elasticsearch.analyzer.lowercase.LowerCaseTokenFilterFactory; import cz.monitora.elasticsearch.analyzer.slovak.SlovakStemFilterFactory; @@ -20,6 +21,7 @@ public Map> getToken AnalysisPlugin.requiresAnalysisSettings(LowerCaseTokenFilterFactory::new)); extra.put("monitora_czech_stem", CzechStemFilterFactory::new); extra.put("monitora_slovak_stem", SlovakStemFilterFactory::new); + extra.put("monitora_croatian_stem", CroatianStemFilterFactory::new); return extra; } } diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemFilter.java b/src/main/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemFilter.java new file mode 100644 index 0000000..ba2c4b8 --- /dev/null +++ b/src/main/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemFilter.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package cz.monitora.elasticsearch.analyzer.croatian; + +import java.io.IOException; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link CroatianStemmer} to stem Croatian words. + * + *

To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a + * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link + * TokenStream}. + * + *

NOTE: Input is expected to be in lowercase, but with diacritical marks + * + * @see SetKeywordMarkerFilter + */ +public final class CroatianStemFilter extends TokenFilter { + private final CroatianStemmer stemmer = new CroatianStemmer(); + private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public CroatianStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAttr.buffer(), termAttr.length()); + termAttr.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemFilterFactory.java b/src/main/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemFilterFactory.java new file mode 100644 index 0000000..4fe65ff --- /dev/null +++ b/src/main/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemFilterFactory.java @@ -0,0 +1,21 @@ +package cz.monitora.elasticsearch.analyzer.croatian; + +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; + +public class CroatianStemFilterFactory extends AbstractTokenFilterFactory { + + /** Creates a new CroatianStemFilterFactory */ + public CroatianStemFilterFactory( + IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(name, settings); + } + + @Override + public TokenStream create(TokenStream input) { + return new CroatianStemFilter(input); + } +} diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemmer.java 
/**
 * Rule-based stemmer for Croatian.
 *
 * <p>Stemmer based on the algorithm described in Reliable Baselines for Sentiment Analysis in
 * Resource-Limited Languages: The Serbian Movie Review Dataset, Vuk Batanović, Boško Nikolić, Milan
 * Milosavljević (http://www.lrec-conf.org/proceedings/lrec2016/pdf/284_Paper.pdf)
 *
 * <p>Stemming a word goes through these steps, in order:
 *
 * <ol>
 *   <li>stop words are returned unchanged;
 *   <li>irregular forms are replaced through an exact-match table;
 *   <li>the first matching suffix transformation (declaration order) rewrites the word ending;
 *   <li>the first suffix pattern whose stem part is longer than one character and contains a
 *       vowel (or a syllabic "r") strips the ending.
 * </ol>
 *
 * <p>Input is expected to be lowercase with diacritics preserved. {@link #stem(char[], int)} only
 * reads the rule tables, which are populated once by the constructor.
 */
public class CroatianStemmer {

  /** Vowels, plus the marker {@code R} that stands for a syllabic "r". */
  private static final Pattern VOWEL = Pattern.compile("[aeiouR]");

  /** An "r" with no vowel on either side, i.e. an "r" that carries the syllable. */
  private static final Pattern SYLLABIC_R = Pattern.compile("(^|[^aeiou])r($|[^aeiou])");

  // Suffix transformations are kept as two parallel lists (suffix -> replacement) instead of a
  // HashMap so that overlapping suffixes such as "teticima" / "ticima" are tried in declaration
  // order rather than in unspecified hash order.
  private final ArrayList<String> transformSuffixes = new ArrayList<>();
  private final ArrayList<String> transformReplacements = new ArrayList<>();
  private final HashMap<String, String> exactMatches = new HashMap<>();
  private final HashSet<String> stopset = new HashSet<>();
  private final ArrayList<Pattern> wordPatterns = new ArrayList<>();

  public CroatianStemmer() {
    initRules();
  }

  /**
   * Stems the word held in the first {@code len} characters of {@code s}, in place.
   *
   * <p>The stem is written to the start of {@code s}; characters past the returned length are
   * unspecified. The shipped rules never produce a result longer than the input word.
   *
   * @param s buffer holding the word; overwritten with the stem
   * @param len number of valid characters in {@code s}
   * @return length of the stem now stored at the start of {@code s}
   * @throws IndexOutOfBoundsException if a rule result does not fit into {@code s}
   */
  public int stem(char[] s, int len) {
    final String word = new String(s, 0, len);

    if (stopset.contains(word)) {
      return len;
    }

    final String exact = exactMatches.get(word);
    if (exact != null) {
      return writeBack(exact, s);
    }

    final String transformed = transform(word);
    // The suffix patterns below run on the transformed word, so the buffer has to hold the
    // transformed characters too; otherwise the returned length would cut the original spelling
    // (e.g. "centar" -> "centa" instead of "centr").
    writeBack(transformed, s);

    for (Pattern rule : wordPatterns) {
      final Matcher matcher = rule.matcher(transformed);
      if (matcher.matches()) {
        // Group 1 is anchored at the start of the word, so it is a prefix of the buffer content.
        final String candidate = matcher.group(1);
        if (candidate.length() > 1 && hasAVowel(candidate)) {
          return candidate.length();
        }
      }
    }
    return transformed.length();
  }

  /** Copies {@code value} to the start of {@code target} and returns the number of chars copied. */
  private static int writeBack(String value, char[] target) {
    value.getChars(0, value.length(), target, 0);
    return value.length();
  }

  /** Applies the first suffix transformation (in declaration order) that matches {@code word}. */
  private String transform(String word) {
    for (int i = 0; i < transformSuffixes.size(); i++) {
      final String suffix = transformSuffixes.get(i);
      if (word.endsWith(suffix)) {
        return word.substring(0, word.length() - suffix.length()) + transformReplacements.get(i);
      }
    }
    return word;
  }

  /** Returns whether {@code word} contains a vowel or a syllabic "r". */
  private static boolean hasAVowel(String word) {
    return VOWEL.matcher(capitalizeSyllabicR(word)).find();
  }

  /** Marks an "r" that has no neighbouring vowel as {@code R}. */
  private static String capitalizeSyllabicR(String word) {
    return SYLLABIC_R.matcher(word).replaceAll("$1R$2");
  }

  /** Registers a suffix pattern: {@code start} is the kept stem, {@code end} the stripped ending. */
  private void addRule(String start, String end) {
    wordPatterns.add(Pattern.compile("^(" + start + ")(" + end + ")$"));
  }

  /** Registers a suffix transformation applied before the suffix patterns. */
  private void addTransformation(String suffix, String replacement) {
    transformSuffixes.add(suffix);
    transformReplacements.add(replacement);
  }

  /** (Re)builds all rule tables. Called once from the constructor. */
  protected void initRules() {
    // Start from empty tables so that calling this method again does not duplicate rules.
    exactMatches.clear();
    wordPatterns.clear();
    stopset.clear();
    transformSuffixes.clear();
    transformReplacements.clear();

    // Irregular forms that the suffix patterns cannot handle.
    exactMatches.put("zao", "zli");
    exactMatches.put("zla", "zli");
    exactMatches.put("zlo", "zli");
    exactMatches.put("dobar", "dobro");
    exactMatches.put("dobro", "dobro");
    exactMatches.put("dobra", "dobro");

    // Suffix patterns; the first one that yields an acceptable stem wins.
    addRule(
        ".+(s|š)k",
        "ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u");
    addRule(".+(s|š)tv", "ima|om|o|a|u");
    addRule(".+(t|m|p|r|g)anij", "ama|ima|om|a|u|e|i|");
    addRule(".+an", "inom|ina|inu|ine|ima|in|om|u|i|a|e|");
    addRule(".+in", "ima|ama|om|a|e|i|u|o|");
    addRule(".+on", "ovima|ova|ove|ovi|ima|om|a|e|i|u|");
    addRule(
        ".+n",
        "ijima|ijega|ijemu|ijeg|ijem|ijim|ijih|ijoj|iji|ije|ija|iju|ima|ome|omu|oga|oj|om|ih|im|og|o|e|a|u|i|");
    addRule(".+(a|e|u)ć", "oga|ome|omu|ega|emu|ima|oj|ih|om|eg|em|og|uh|im|e|a");
    addRule(".+ugov", "ima|i|e|a");
    addRule(".+ug", "ama|om|a|e|i|u|o");
    addRule(".+log", "ama|om|a|u|e|");
    addRule(".+[^eo]g", "ovima|ama|ovi|ove|ova|om|a|e|i|u|o|");
    addRule(".+(rrar|ott|ss|ll)i", "jem|ja|ju|o|");
    addRule(".+uj", "ući|emo|ete|mo|em|eš|e|u|");
    addRule(".+(c|č|ć|đ|l|r)aj", "evima|evi|eva|eve|ama|ima|em|a|e|i|u|");
    addRule(".+(b|c|d|l|n|m|ž|g|f|p|r|s|t|z)ij", "ima|ama|om|a|e|i|u|o|");
    addRule(".+[^z]nal", "ima|ama|om|a|e|i|u|o|");
    addRule(".+ijal", "ima|ama|om|a|e|i|u|o|");
    addRule(".+ozil", "ima|om|a|e|u|i|");
    addRule(".+olov", "ima|i|a|e");
    addRule(".+ol", "ima|om|a|u|e|i|");
    addRule(".+lem", "ama|ima|om|a|e|i|u|o|");
    addRule(".+ram", "ama|om|a|e|i|u|o");
    addRule(".+(a|d|e|o)r", "ama|ima|om|u|a|e|i|");
    addRule(".+(e|i)s", "ima|om|e|a|u");
    addRule(".+(t|n|j|k|j|t|b|g|v)aš", "ama|ima|om|em|a|u|i|e|");
    addRule(".+(e|i)š", "ima|ama|om|em|i|e|a|u|");
    addRule(".+ikat", "ima|om|a|e|i|u|o|");
    addRule(".+lat", "ima|om|a|e|i|u|o|");
    addRule(".+et", "ama|ima|om|a|e|i|u|o|");
    addRule(".+(e|i|k|o)st", "ima|ama|om|a|e|i|u|o|");
    addRule(".+išt", "ima|em|a|e|u");
    addRule(".+ova", "smo|ste|hu|ti|še|li|la|le|lo|t|h|o");
    addRule(
        ".+(a|e|i)v",
        "ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|ama|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o|");
    addRule(
        ".+[^dkml]ov",
        "ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o|");
    addRule(".+(m|l)ov", "ima|om|a|u|e|i|");
    addRule(
        ".+el",
        "ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o|");
    addRule(
        ".+(a|e|š)nj",
        "ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|ega|emu|eg|em|im|ih|oj|om|og|a|e|i|o|u");
    addRule(".+čin", "ama|ome|omu|oga|ima|og|om|im|ih|oj|a|u|i|o|e|");
    addRule(".+roši", "vši|smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o");
    addRule(
        ".+oš",
        "ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|");
    addRule(
        ".+(e|o)vit",
        "ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u|");
    addRule(
        ".+ast",
        "ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u|");
    addRule(
        ".+k",
        "ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o|");
    addRule(
        ".+(e|a|i|u)va",
        "jući|smo|ste|jmo|jte|ju|la|le|li|lo|mo|na|ne|ni|no|te|ti|še|hu|h|j|m|n|o|t|v|š|");
    addRule(
        ".+ir",
        "ujemo|ujete|ujući|ajući|ivat|ujem|uješ|ujmo|ujte|avši|asmo|aste|ati|amo|ate|aju|aše|ahu|ala|alo|ali|ale|uje|uju|uj|al|an|am|aš|at|ah|ao");
    addRule(
        ".+ač",
        "ismo|iste|iti|imo|ite|iše|eći|ila|ilo|ili|ile|ena|eno|eni|ene|io|im|iš|it|ih|en|i|e");
    addRule(
        ".+ača", "vši|smo|ste|smo|ste|hu|ti|mo|te|še|la|lo|li|le|ju|na|no|ni|ne|o|m|š|t|h|n");
    addRule(
        ".+n",
        "uvši|usmo|uste|ući|imo|ite|emo|ete|ula|ulo|ule|uli|uto|uti|uta|em|eš|uo|ut|e|u|i");
    addRule(".+ni", "vši|smo|ste|ti|mo|te|mo|te|la|lo|le|li|m|š|o");
    addRule(
        ".+((a|r|i|p|e|u)st|[^o]g|ik|uc|oj|aj|lj|ak|ck|čk|šk|uk|nj|im|ar|at|et|št|it|ot|ut|zn|zv)a",
        "jući|vši|smo|ste|jmo|jte|jem|mo|te|je|ju|ti|še|hu|la|li|le|lo|na|no|ni|ne|t|h|o|j|n|m|š");
    addRule(
        ".+ur",
        "ajući|asmo|aste|ajmo|ajte|amo|ate|aju|ati|aše|ahu|ala|ali|ale|alo|ana|ano|ani|ane|al|at|ah|ao|aj|an|am|aš");
    addRule(
        ".+(a|i|o)staj",
        "asmo|aste|ahu|ati|emo|ete|aše|ali|ući|ala|alo|ale|mo|ao|em|eš|at|ah|te|e|u|");
    addRule(".+(b|c|č|ć|d|e|f|g|j|k|n|r|t|u|v)a", "lama|lima|lom|lu|li|la|le|lo|l");
    addRule(".+(t|č|j|ž|š)aj", "evima|evi|eva|eve|ama|ima|em|a|e|i|u|");
    addRule(
        ".+([^o]m|ič|nč|uč|b|c|ć|d|đ|h|j|k|l|n|p|r|s|š|v|z|ž)a",
        "jući|vši|smo|ste|jmo|jte|mo|te|ju|ti|še|hu|la|li|le|lo|na|no|ni|ne|t|h|o|j|n|m|š");
    addRule(
        ".+(a|i|o)sta",
        "dosmo|doste|doše|nemo|demo|nete|dete|nimo|nite|nila|vši|nem|dem|neš|deš|doh|de|ti|ne|nu|du|la|li|lo|le|t|o");
    addRule(
        ".+ta", "smo|ste|jmo|jte|vši|ti|mo|te|ju|še|la|lo|le|li|na|no|ni|ne|n|j|o|m|š|t|h");
    addRule(".+inj", "asmo|aste|ati|emo|ete|ali|ala|alo|ale|aše|ahu|em|eš|at|ah|ao");
    addRule(".+as", "temo|tete|timo|tite|tući|tem|teš|tao|te|li|ti|la|lo|le");
    addRule(
        ".+(elj|ulj|tit|ac|ič|od|oj|et|av|ov)i",
        "vši|eći|smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o");
    addRule(
        ".+(tit|jeb|ar|ed|uš|ič)i",
        "jemo|jete|jem|ješ|smo|ste|jmo|jte|vši|mo|še|te|ti|ju|je|la|lo|li|le|t|m|š|h|j|o");
    addRule(
        ".+(b|č|d|l|m|p|r|s|š|ž)i",
        "jemo|jete|jem|ješ|smo|ste|jmo|jte|vši|mo|lu|še|te|ti|ju|je|la|lo|li|le|t|m|š|h|j|o");
    addRule(
        ".+luč",
        "ujete|ujući|ujemo|ujem|uješ|ismo|iste|ujmo|ujte|uje|uju|iše|iti|imo|ite|ila|ilo|ili|ile|ena|eno|eni|ene|uj|io|en|im|iš|it|ih|e|i");
    addRule(".+jeti", "smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o");
    addRule(".+e", "lama|lima|lom|lu|li|la|le|lo|l");
    addRule(".+i", "lama|lima|lom|lu|li|la|le|lo|l");
    addRule(
        ".+at",
        "ijega|ijemu|ijima|ijeg|ijem|ijih|ijim|ima|oga|ome|omu|iji|ije|ija|iju|oj|og|om|im|ih|a|u|i|e|o|");
    addRule(".+et", "avši|ući|emo|imo|em|eš|e|u|i");
    addRule(
        ".+",
        "ajući|alima|alom|avši|asmo|aste|ajmo|ajte|ivši|amo|ate|aju|ati|aše|ahu|ali|ala|ale|alo|ana|ano|ani|ane|am|aš|at|ah|ao|aj|an");
    addRule(
        ".+",
        "anje|enje|anja|enja|enom|enoj|enog|enim|enih|anom|anoj|anog|anim|anih|eno|ovi|ova|oga|ima|ove|enu|anu|ena|ama");
    addRule(
        ".+",
        "nijega|nijemu|nijima|nijeg|nijem|nijim|nijih|nima|niji|nije|nija|niju|noj|nom|nog|nim|nih|an|na|nu|ni|ne|no");
    addRule(".+", "om|og|im|ih|em|oj|an|u|o|i|e|a");

    // Stop words: auxiliary and modal verb forms that are never stemmed.
    stopset.addAll(
        Arrays.asList(
            "biti", "jesam", "budem", "sam", "jesi", "budeš", "si", "jesmo", "budemo", "smo",
            "jeste", "budete", "ste", "jesu", "budu", "su", "bih", "bijah", "bjeh", "bijaše",
            "bi", "bje", "bješe", "bijasmo", "bismo", "bjesmo", "bijaste", "biste", "bjeste",
            "bijahu", "biše", "bjehu", "bio", "bili", "budimo", "budite", "bila", "bilo", "bile",
            "ću", "ćeš", "će", "ćemo", "ćete", "želim", "želiš", "želi", "želimo", "želite",
            "žele", "moram", "moraš", "mora", "moramo", "morate", "moraju", "trebam", "trebaš",
            "treba", "trebamo", "trebate", "trebaju", "mogu", "možeš", "može", "možemo",
            "možete"));

    // Suffix transformations, tried in this order; the first matching suffix is rewritten.
    addTransformation("lozi", "loga");
    addTransformation("lozima", "loga");
    addTransformation("pjesi", "pjeh");
    addTransformation("pjesima", "pjeh");
    addTransformation("vojci", "vojka");
    addTransformation("bojci", "bojka");
    addTransformation("jaci", "jak");
    addTransformation("jacima", "jak");
    addTransformation("čajan", "čajni");
    addTransformation("ijeran", "ijerni");
    addTransformation("laran", "larni");
    addTransformation("ijesan", "ijesni");
    addTransformation("anjac", "anjca");
    addTransformation("ajac", "ajca");
    addTransformation("ajaca", "ajca");
    addTransformation("ljaca", "ljca");
    addTransformation("ljac", "ljca");
    addTransformation("ejac", "ejca");
    addTransformation("ejaca", "ejca");
    addTransformation("ojac", "ojca");
    addTransformation("ojaca", "ojca");
    addTransformation("ajaka", "ajka");
    addTransformation("ojaka", "ojka");
    addTransformation("šaca", "šca");
    addTransformation("šac", "šca");
    addTransformation("inzima", "ing");
    addTransformation("inzi", "ing");
    addTransformation("tvenici", "tvenik");
    addTransformation("tetici", "tetika");
    addTransformation("teticima", "tetika");
    addTransformation("nstava", "nstva");
    addTransformation("nicima", "nik");
    addTransformation("ticima", "tik");
    addTransformation("zicima", "zik");
    addTransformation("snici", "snik");
    addTransformation("kuse", "kusi");
    addTransformation("kusan", "kusni");
    addTransformation("kustava", "kustva");
    addTransformation("dušan", "dušni");
    addTransformation("antan", "antni");
    addTransformation("bilan", "bilni");
    addTransformation("tilan", "tilni");
    addTransformation("avilan", "avilni");
    addTransformation("silan", "silni");
    addTransformation("gilan", "gilni");
    addTransformation("rilan", "rilni");
    addTransformation("nilan", "nilni");
    addTransformation("alan", "alni");
    addTransformation("ozan", "ozni");
    addTransformation("rave", "ravi");
    addTransformation("stavan", "stavni");
    addTransformation("pravan", "pravni");
    addTransformation("tivan", "tivni");
    addTransformation("sivan", "sivni");
    addTransformation("atan", "atni");
    addTransformation("cenata", "centa");
    addTransformation("denata", "denta");
    addTransformation("genata", "genta");
    addTransformation("lenata", "lenta");
    addTransformation("menata", "menta");
    addTransformation("jenata", "jenta");
    addTransformation("venata", "venta");
    addTransformation("tetan", "tetni");
    addTransformation("pletan", "pletni");
    addTransformation("šave", "šavi");
    addTransformation("manata", "manta");
    addTransformation("tanata", "tanta");
    addTransformation("lanata", "lanta");
    addTransformation("sanata", "santa");
    addTransformation("ačak", "ačka");
    addTransformation("ačaka", "ačka");
    addTransformation("ušak", "uška");
    addTransformation("atak", "atka");
    addTransformation("ataka", "atka");
    addTransformation("atci", "atka");
    addTransformation("atcima", "atka");
    addTransformation("etak", "etka");
    addTransformation("etaka", "etka");
    addTransformation("itak", "itka");
    addTransformation("itaka", "itka");
    addTransformation("itci", "itka");
    addTransformation("otak", "otka");
    addTransformation("otaka", "otka");
    addTransformation("utak", "utka");
    addTransformation("utaka", "utka");
    addTransformation("utci", "utka");
    addTransformation("utcima", "utka");
    addTransformation("eskan", "eskna");
    addTransformation("tičan", "tični");
    addTransformation("ojsci", "ojska");
    addTransformation("esama", "esma");
    addTransformation("metara", "metra");
    addTransformation("centar", "centra");
    addTransformation("centara", "centra");
    addTransformation("istara", "istra");
    addTransformation("istar", "istra");
    addTransformation("ošću", "osti");
    addTransformation("daba", "dba");
    addTransformation("čcima", "čka");
    addTransformation("čci", "čka");
    addTransformation("mac", "mca");
    addTransformation("maca", "mca");
    addTransformation("naca", "nca");
    addTransformation("nac", "nca");
    addTransformation("voljan", "voljni");
    addTransformation("anaka", "anki");
    addTransformation("vac", "vca");
    addTransformation("vaca", "vca");
    addTransformation("saca", "sca");
    addTransformation("sac", "sca");
    addTransformation("raca", "rca");
    addTransformation("rac", "rca");
    addTransformation("aoca", "alca");
    addTransformation("alaca", "alca");
    addTransformation("alac", "alca");
    addTransformation("elaca", "elca");
    addTransformation("elac", "elca");
    addTransformation("olaca", "olca");
    addTransformation("olac", "olca");
    addTransformation("olce", "olca");
    addTransformation("njac", "njca");
    addTransformation("njaca", "njca");
    addTransformation("ekata", "ekta");
    addTransformation("ekat", "ekta");
    addTransformation("izam", "izma");
    addTransformation("izama", "izma");
    addTransformation("jebe", "jebi");
    addTransformation("baci", "baci");
    addTransformation("ašan", "ašni");
  }
}
a/src/test/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemmerTest.java b/src/test/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemmerTest.java new file mode 100644 index 0000000..72c75f6 --- /dev/null +++ b/src/test/java/cz/monitora/elasticsearch/analyzer/croatian/CroatianStemmerTest.java @@ -0,0 +1,60 @@ +package cz.monitora.elasticsearch.analyzer.croatian; + +import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Arrays; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class CroatianStemmerTest { + + @ParameterizedTest + @MethodSource("provideData") + public void test_stem(String val, String exp) { + final CroatianStemmer stemmer = new CroatianStemmer(); + char[] ch = val.toCharArray(); + assertEquals(exp, new String(Arrays.copyOfRange(ch, 0, stemmer.stem(ch, ch.length)))); + } + + private static Stream provideData() { + /* + In [11]: for one in ElasticSynonym.objects.filter(language='hr'): + ...: for s in one.synonyms: + ...: print(f'Arguments.of("{s.lower()}", "{one.word.lower()}"),') + ...: print() + */ + + return Stream.of( + Arguments.of("zlatko", "zlatk"), + Arguments.of("zlatka", "zlatk"), + Arguments.of("zlatku", "zlatk"), + Arguments.of("zlatkom", "zlatk"), + Arguments.of("hrkaća", "hrkać"), + Arguments.of("hrkaću", "hrkać"), + Arguments.of("hrkaćem", "hrkać"), + Arguments.of("ivana", "ivan"), + Arguments.of("ivanu", "ivan"), + Arguments.of("ivanom", "ivan"), + Arguments.of("bešlića", "bešlić"), + Arguments.of("bešliću", "bešlić"), + Arguments.of("bešlićem", "bešlić"), + Arguments.of("zorana", "zoran"), + Arguments.of("zoranu", "zoran"), + Arguments.of("zoranom", "zoran"), + Arguments.of("milanoviću", "milanović"), + Arguments.of("milanovićem", "milanović"), + Arguments.of("milanovića", "milanović"), 
+ Arguments.of("dobroslava", "dobroslav"), + Arguments.of("dobroslavu", "dobroslav"), + Arguments.of("dobroslavom", "dobroslav"), + Arguments.of("dobar", "dobro"), + Arguments.of("dobro", "dobro"), + Arguments.of("dobra", "dobro"), + Arguments.of("zao", "zli"), + Arguments.of("zla", "zli"), + Arguments.of("zlo", "zli")); + } +}