monitora-media · DavidJiricek · Dec 12, 2025 · Nov 25, 2025 · Dec 3, 2025 · Dec 3, 2025
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -12,13 +12,13 @@ jobs:
                 es_version: [8.15.2]
         steps:
             - name: Checkout project sources
-              uses: actions/checkout@v2
-            - uses: actions/setup-java@v3
+              uses: actions/checkout@v4
+            - uses: actions/setup-java@v4
               with:
                   distribution: temurin
-                  java-version: 17
+                  java-version: 21
             - name: Setup Gradle
-              uses: gradle/gradle-build-action@v2
+              uses: gradle/actions/setup-gradle@v4
 
             - name: Run release build with Gradle Wrapper
               run: ./gradlew build -Pplugin.version=${{ github.ref_name }} -Pelasticsearch.version=${{ matrix.es_version }}

diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 ES_VERSION ?= 8.15.2
-JAVA_HOME ?= /usr/lib/jvm/java-17-openjdk-amd64
+JAVA_HOME ?= /usr/lib/jvm/java-21-openjdk-amd64
 
 .PHONY: all build
 

diff --git a/README.md b/README.md
@@ -15,14 +15,20 @@ in the [release](https://github.com/monitora-media/es-utils/releases/latest).
     ES_VERSION=8.15.2
     ./gradlew build -Pelasticsearch.version=$ES_VERSION
 
-    export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+    export JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64
     export PATH=$JAVA_HOME/bin:$PATH
-    JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 PATH=$JAVA_HOME/bin:$PATH ./gradlew build -Pelasticsearch.version=8.15.2
+    JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64 PATH=$JAVA_HOME/bin:$PATH ./gradlew build -Pelasticsearch.version=$ES_VERSION
 
 
 ## Testing
 
-    gradle test --info --tests "Croatian*"
+Unit tests:
+
+    ./gradlew test --info --tests "Croatian*"
+
+Performance tests:
+
+    ./gradlew performanceTest
 
 ## Install
 
@@ -35,9 +41,9 @@ in the [release](https://github.com/monitora-media/es-utils/releases/latest).
 Analysis filter that converts to lowercase but keeps the originally-cased token in the stream as
 well.
 
-### Czech and Slovak stemmers
+### Czech, Slovak, Croatian and Slovenian stemmers
 
-Specialized stemmers
+Specialized stemmers for Slavic languages
 
 ### Example index settings
 
@@ -55,6 +61,12 @@ Specialized stemmers
         "mslovak_stem": {
             "type": "monitora_slovak_stem",
             "with_asciifold": "true"
+        },
+        "mslovenian_stem": {
+            "type": "monitora_slovenian_stem"
+        },
+        "mcroatian_stem": {
+            "type": "monitora_croatian_stem"
         }
     },
     "analyzer": {

diff --git a/build.gradle b/build.gradle
@@ -18,8 +18,9 @@ repositories {
 }
 
 dependencies {
-    testImplementation(platform('org.junit:junit-bom:5.9.2'))
+    testImplementation(platform('org.junit:junit-bom:5.11.4'))
 	testImplementation('org.junit.jupiter:junit-jupiter')
+	testRuntimeOnly('org.junit.platform:junit-platform-launcher')
 }
 
 test {
@@ -31,8 +32,33 @@ test {
     }
     testLogging {
         events "passed", "skipped", "failed"
+        showStandardStreams = true
+    }
+    useJUnitPlatform {
+        excludeTags 'performance'
+    }
+}
+
+// Separate task for running performance tests
+task performanceTest(type: Test) {
+    description = 'Runs performance analysis tests for stemmers'
+    group = 'verification'
+
+    testClassesDirs = sourceSets.test.output.classesDirs
+    classpath = sourceSets.test.runtimeClasspath
+
+    filter {
+        include "**/*PerformanceTest.class"
+    }
+
+    testLogging {
+        events "passed", "skipped", "failed"
+        showStandardStreams = true
+    }
+
+    useJUnitPlatform {
+        includeTags 'performance'
     }
-    useJUnitPlatform {}
 }
 
 compileJava {
@@ -46,8 +72,8 @@ group = "cz.monitora.elasticsearch"
 version = "${property("plugin.version")}-${property("elasticsearch.version")}"
 
 esplugin {
-  name 'monitora_utils'
-  description 'Utils for Elasticsearch'
-  classname 'cz.monitora.elasticsearch.MonitoraESPlugin'
-  licenseFile rootProject.file('LICENSE')
+  name = 'monitora_utils'
+  description = 'Utils for Elasticsearch'
+  classname = 'cz.monitora.elasticsearch.MonitoraESPlugin'
+  licenseFile = rootProject.file('LICENSE')
 }
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-9.0-bin.zip
 networkTimeout=10000
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
diff --git a/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java b/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java
@@ -4,6 +4,7 @@
 import cz.monitora.elasticsearch.analyzer.czech.CzechStemFilterFactory;
 import cz.monitora.elasticsearch.analyzer.lowercase.LowerCaseTokenFilterFactory;
 import cz.monitora.elasticsearch.analyzer.slovak.SlovakStemFilterFactory;
+import cz.monitora.elasticsearch.analyzer.slovenian.SlovenianStemFilterFactory;
 import java.util.HashMap;
 import java.util.Map;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -22,6 +23,7 @@ public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getToken
     extra.put("monitora_czech_stem", CzechStemFilterFactory::new);
     extra.put("monitora_slovak_stem", SlovakStemFilterFactory::new);
     extra.put("monitora_croatian_stem", CroatianStemFilterFactory::new);
+    extra.put("monitora_slovenian_stem", SlovenianStemFilterFactory::new);
     return extra;
   }
 }
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilter.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilter.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package cz.monitora.elasticsearch.analyzer.slovenian;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link SlovenianStemmer} to stem Slovenian words.
+ *
+ * <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
+ * TokenStream}.
+ *
+ * <p><b>NOTE</b>: Input is expected to be lowercased but to keep its diacritical marks.
+ *
+ * @see SetKeywordMarkerFilter
+ */
+public final class SlovenianStemFilter extends TokenFilter {
+  private final SlovenianStemmer stemmer = new SlovenianStemmer();
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public SlovenianStemFilter(TokenStream input) { // input: the upstream token stream to wrap
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException { // true while the upstream yields tokens
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword()) { // tokens flagged as keywords pass through unstemmed
+        final int newlen = stemmer.stem(termAttr.buffer(), termAttr.length());
+        termAttr.setLength(newlen); // shorten the term to the length the stemmer returned
+      }
+      return true;
+    } else {
+      return false; // upstream reported no further token
+    }
+  }
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilterFactory.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilterFactory.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package cz.monitora.elasticsearch.analyzer.slovenian;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+
+public class SlovenianStemFilterFactory extends AbstractTokenFilterFactory {
+  // Token filter factory whose create() wraps a stream in a SlovenianStemFilter.
+  public SlovenianStemFilterFactory(
+      IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    super(name, settings); // indexSettings and env are accepted but not used here
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) { // builds a new filter around input on each call
+    return new SlovenianStemFilter(input);
+  }
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmer.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmer.java
@@ -0,0 +1,102 @@
+/* MIT License
+ *
+ * Copyright (c) 2025
+ * Port of a Snowball-style Slovenian stemmer (conservative).
+ * Based on community implementations and Snowball design principles.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software ...
+ */
+
+package cz.monitora.elasticsearch.analyzer.slovenian;
+
+import java.text.Normalizer;
+import java.util.Locale;
+import java.util.Arrays;
+import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
+import org.apache.lucene.analysis.CharArraySet;
+
+/** Slovenian suffix stripper; {@code stem} returns a new length and never edits the chars. */
+public final class SlovenianStemmer {
+  private static final String[] suffixes2 = { // 2-char endings, removed whole
+    "ih","im","om","am","em","ov","ev","in","mi","eh","ah"
+  };
+  private static final String[] suffixes3 = { // 3-char endings, removed whole
+    "ega","emu","ima","imi","ami","oma","ama","ove","ova","ovs","ina","ino","ini","ine"
+  }; // NOTE(review): "ovs" is unlike the other endings here (typo for "ovi"?) - confirm
+  private static final String[] suffix3_remove2 = { // 3-char endings: 'r' kept, last 2 removed
+    "rja", "rje", "rju", "rjo"
+  };
+  private static final String[] suffix4_remove3 = { // 4-char ending: 'r' kept, last 3 removed
+    "rjem"
+  };
+  //private static final CharArraySet dont_stem = new CharArraySet(
+  //  Arrays.asList("skupina", "telekom"),
+  //  false
+  //);
+
+  public int stem(char[] s, int len) { // returns the stemmed length; s itself is never written
+    //if (dont_stem.contains(s, 0, len)) return len;
+
+    int r1 = calculateR1(s, len); // a suffix is stripped only if it lies entirely in s[r1..len)
+    if (r1 >= len) return len; // empty R1: leave the word untouched
+
+    if (len - 4 >= r1) {
+      for (String suf : suffix4_remove3) {
+        if (endsWith(s, len, suf)) {
+          return len - suf.length() + 1; // +1 keeps the suffix's leading 'r'
+        }
+      }
+    }
+
+    if (len - 3 >= r1) {
+      for (String suf : suffixes3) {
+        if (endsWith(s, len, suf)) {
+          return len - suf.length();
+        }
+      }
+      for (String suf : suffix3_remove2) {
+        if (endsWith(s, len, suf)) {
+          return len - suf.length() + 1; // +1 keeps the suffix's leading 'r'
+        }
+      }
+    }
+
+    if (len - 2 >= r1) {
+      for (String suf : suffixes2) {
+        if (endsWith(s, len, suf)) {
+          // guard against stems under 2 chars (cannot trigger here: r1 is at least 2)
+          if (len - suf.length() >= 2) return len - suf.length();
+        }
+      }
+    }
+
+    if (len - 1 >= r1) { // last rule: drop a single trailing vowel
+      char last = s[len - 1];
+      if (isVowel(last)) {
+        if (len - 1 >= 2) return len - 1; // always true at this point, since r1 is at least 2
+      }
+    }
+
+    return len; // no rule matched
+  }
+
+  // Returns the start index of R1: one past the first non-vowel that follows a vowel, else len.
+  private int calculateR1(char[] s, int len) {
+    boolean foundVowel = false;
+    for (int i = 0; i < len; i++) {
+      if (isVowel(s[i])) {
+        foundVowel = true;
+      } else if (foundVowel) {
+        return i + 1;
+      }
+    }
+    return len;
+  }
+
+  // Conservative vowel test for Slovenian. Note: syllabic 'r' isn't treated as vowel here.
+  private boolean isVowel(char c) {
+    // only plain lowercase a/e/i/o/u count; accented vowels and č/š/ž are treated as non-vowels
+    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
+  }
+}