Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ jobs:
es_version: [8.15.2]
steps:
- name: Checkout project sources
uses: actions/checkout@v2
- uses: actions/setup-java@v3
uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: temurin
java-version: 17
java-version: 21
- name: Setup Gradle
uses: gradle/gradle-build-action@v2
uses: gradle/actions/setup-gradle@v4

- name: Run release build with Gradle Wrapper
run: ./gradlew build -Pplugin.version=${{ github.ref_name }} -Pelasticsearch.version=${{ matrix.es_version }}
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ES_VERSION ?= 8.15.2
JAVA_HOME ?= /usr/lib/jvm/java-17-openjdk-amd64
JAVA_HOME ?= /usr/lib/jvm/java-21-openjdk-amd64

.PHONY: all build

Expand Down
22 changes: 17 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,20 @@ in the [release](https://github.com/monitora-media/es-utils/releases/latest).
ES_VERSION=8.15.2
./gradlew build -Pelasticsearch.version=$ES_VERSION

export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
export JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64
export PATH=$JAVA_HOME/bin:$PATH
JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 PATH=$JAVA_HOME/bin:$PATH ./gradlew build -Pelasticsearch.version=8.15.2
JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64 PATH=$JAVA_HOME/bin:$PATH ./gradlew build -Pelasticsearch.version=$ES_VERSION


## Testing

gradle test --info --tests "Croatian*"
Unit tests:

./gradlew test --info --tests "Croatian*"

Performance tests:

./gradlew performanceTest

## Install

Expand All @@ -35,9 +41,9 @@ in the [release](https://github.com/monitora-media/es-utils/releases/latest).
Analysis filter that converts to lowercase but keeps the originally-cased token in the stream as
well.

### Czech and Slovak stemmers
### Czech, Slovak, Croatian and Slovenian stemmers

Specialized stemmers
Specialized stemmers for Slavic languages

### Example index settings

Expand All @@ -55,6 +61,12 @@ Specialized stemmers
"mslovak_stem": {
"type": "monitora_slovak_stem",
"with_asciifold": "true"
},
"mslovenian_stem": {
"type": "monitora_slovenian_stem"
},
"mcroatian_stem": {
"type": "monitora_croatian_stem"
}
},
"analyzer": {
Expand Down
38 changes: 32 additions & 6 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ repositories {
}

dependencies {
testImplementation(platform('org.junit:junit-bom:5.9.2'))
testImplementation(platform('org.junit:junit-bom:5.11.4'))
testImplementation('org.junit.jupiter:junit-jupiter')
testRuntimeOnly('org.junit.platform:junit-platform-launcher')
}

test {
Expand All @@ -31,8 +32,33 @@ test {
}
testLogging {
events "passed", "skipped", "failed"
showStandardStreams = true
}
useJUnitPlatform {
excludeTags 'performance'
}
}

// Separate task for running performance tests
task performanceTest(type: Test) {
description = 'Runs performance analysis tests for stemmers'
group = 'verification'

testClassesDirs = sourceSets.test.output.classesDirs
classpath = sourceSets.test.runtimeClasspath

filter {
include "**/*PerformanceTest.class"
}

testLogging {
events "passed", "skipped", "failed"
showStandardStreams = true
}

useJUnitPlatform {
includeTags 'performance'
}
useJUnitPlatform {}
}

compileJava {
Expand All @@ -46,8 +72,8 @@ group = "cz.monitora.elasticsearch"
version = "${property("plugin.version")}-${property("elasticsearch.version")}"

esplugin {
name 'monitora_utils'
description 'Utils for Elasticsearch'
classname 'cz.monitora.elasticsearch.MonitoraESPlugin'
licenseFile rootProject.file('LICENSE')
name = 'monitora_utils'
description = 'Utils for Elasticsearch'
classname = 'cz.monitora.elasticsearch.MonitoraESPlugin'
licenseFile = rootProject.file('LICENSE')
}
2 changes: 1 addition & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-9.0-bin.zip
networkTimeout=10000
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
2 changes: 2 additions & 0 deletions src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import cz.monitora.elasticsearch.analyzer.czech.CzechStemFilterFactory;
import cz.monitora.elasticsearch.analyzer.lowercase.LowerCaseTokenFilterFactory;
import cz.monitora.elasticsearch.analyzer.slovak.SlovakStemFilterFactory;
import cz.monitora.elasticsearch.analyzer.slovenian.SlovenianStemFilterFactory;
import java.util.HashMap;
import java.util.Map;
import org.elasticsearch.index.analysis.TokenFilterFactory;
Expand All @@ -22,6 +23,7 @@ public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getToken
extra.put("monitora_czech_stem", CzechStemFilterFactory::new);
extra.put("monitora_slovak_stem", SlovakStemFilterFactory::new);
extra.put("monitora_croatian_stem", CroatianStemFilterFactory::new);
extra.put("monitora_slovenian_stem", SlovenianStemFilterFactory::new);
return extra;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cz.monitora.elasticsearch.analyzer.slovenian;

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

/**
 * Token filter that reduces Slovenian terms to their stems by delegating to {@link
 * SlovenianStemmer}.
 *
 * <p>Tokens whose {@link KeywordAttribute} is set pass through unchanged. To protect terms from
 * stemming, put a {@link SetKeywordMarkerFilter} (or any custom {@link TokenFilter} that sets the
 * {@link KeywordAttribute}) in front of this {@link TokenStream}.
 *
 * <p><b>NOTE</b>: Input is expected to be in lowercase, but with diacritical marks
 *
 * @see SetKeywordMarkerFilter
 */
public final class SlovenianStemFilter extends TokenFilter {
  private final SlovenianStemmer slovenianStemmer = new SlovenianStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  /**
   * Wraps the given stream.
   *
   * @param input upstream token stream whose terms are stemmed
   */
  public SlovenianStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances to the next token and, unless it is marked as a keyword, truncates its term to the
   * length returned by the stemmer.
   *
   * @return {@code true} if a token was produced, {@code false} once the input is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAtt.isKeyword()) {
      // Keyword-marked tokens are left exactly as they arrived.
      return true;
    }
    // The stemmer only reports a new length; the term buffer is shortened in place.
    final char[] buffer = termAtt.buffer();
    final int stemmedLength = slovenianStemmer.stem(buffer, termAtt.length());
    termAtt.setLength(stemmedLength);
    return true;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cz.monitora.elasticsearch.analyzer.slovenian;

import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory that produces {@link SlovenianStemFilter} instances.
 *
 * <p>MonitoraESPlugin registers this factory under the filter type {@code
 * monitora_slovenian_stem}.
 */
public class SlovenianStemFilterFactory extends AbstractTokenFilterFactory {

/**
 * Creates the factory. Only {@code name} and {@code settings} are forwarded to the superclass;
 * {@code indexSettings} and {@code env} are accepted but not used by this constructor.
 *
 * @param indexSettings index-level settings (unused here)
 * @param env node environment (unused here)
 * @param name name of the filter, passed to the superclass
 * @param settings filter settings, passed to the superclass; this constructor reads none of them
 *     itself
 */
public SlovenianStemFilterFactory(
IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(name, settings);
}

/**
 * Wraps the given stream in a new {@link SlovenianStemFilter}.
 *
 * @param input token stream to be stemmed
 * @return a new filter reading from {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
return new SlovenianStemFilter(input);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/* MIT License
*
* Copyright (c) 2025
* Port of a Snowball-style Slovenian stemmer (conservative).
* Based on community implementations and Snowball design principles.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software ...
*/

package cz.monitora.elasticsearch.analyzer.slovenian;

import java.text.Normalizer;
import java.util.Locale;
import java.util.Arrays;
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
import org.apache.lucene.analysis.CharArraySet;


public final class SlovenianStemmer {
private static final String[] suffixes2 = {
"ih","im","om","am","em","ov","ev","in","mi","eh","ah"
};
private static final String[] suffixes3 = {
"ega","emu","ima","imi","ami","oma","ama","ove","ova","ovs","ina","ino","ini","ine"
};
private static final String[] suffix3_remove2 = {
"rja", "rje", "rju", "rjo"
};
private static final String[] suffix4_remove3 = {
"rjem"
};
//private static final CharArraySet dont_stem = new CharArraySet(
// Arrays.asList("skupina", "telekom"),
// false
//);

public int stem(char[] s, int len) {
//if (dont_stem.contains(s, 0, len)) return len;

int r1 = calculateR1(s, len);
if (r1 >= len) return len;

if (len - 4 >= r1) {
for (String suf : suffix4_remove3) {
if (endsWith(s, len, suf)) {
return len - suf.length() + 1;
}
}
}

if (len - 3 >= r1) {
for (String suf : suffixes3) {
if (endsWith(s, len, suf)) {
return len - suf.length();
}
}
for (String suf : suffix3_remove2) {
if (endsWith(s, len, suf)) {
return len - suf.length() + 1;
}
}
}

if (len - 2 >= r1) {
for (String suf : suffixes2) {
if (endsWith(s, len, suf)) {
// protect very short stems
if (len - suf.length() >= 2) return len - suf.length();
}
}
}

if (len - 1 >= r1) {
char last = s[len - 1];
if (isVowel(last)) {
if (len - 1 >= 2) return len - 1;
}
}

return len;
}

// R1: first region after the first non-vowel following a vowel.
private int calculateR1(char[] s, int len) {
boolean foundVowel = false;
for (int i = 0; i < len; i++) {
if (isVowel(s[i])) {
foundVowel = true;
} else if (foundVowel) {
return i + 1;
}
}
return len;
}

// Conservative vowel test for Slovenian. Note: syllabic 'r' isn't treated as vowel here.
private boolean isVowel(char c) {
// includes Slovene-specific vowels (č/š/ž are consonants, preserved)
return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
}
}
Loading