diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 09a29cd..d42dc8a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,13 +12,13 @@ jobs: es_version: [8.15.2] steps: - name: Checkout project sources - uses: actions/checkout@v2 - - uses: actions/setup-java@v3 + uses: actions/checkout@v4 + - uses: actions/setup-java@v4 with: distribution: temurin - java-version: 17 + java-version: 21 - name: Setup Gradle - uses: gradle/gradle-build-action@v2 + uses: gradle/actions/setup-gradle@v4 - name: Run release build with Gradle Wrapper run: ./gradlew build -Pplugin.version=${{ github.ref_name }} -Pelasticsearch.version=${{ matrix.es_version }} diff --git a/Makefile b/Makefile index 0b44ffc..2837b7c 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ ES_VERSION ?= 8.15.2 -JAVA_HOME ?= /usr/lib/jvm/java-17-openjdk-amd64 +JAVA_HOME ?= /usr/lib/jvm/java-21-openjdk-amd64 .PHONY: all build diff --git a/README.md b/README.md index d4e9b23..f9cca96 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,20 @@ in the [release](https://github.com/monitora-media/es-utils/releases/latest). ES_VERSION=8.15.2 ./gradlew build -Pelasticsearch.version=$ES_VERSION - export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + export JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64 export PATH=$JAVA_HOME/bin:$PATH - JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 PATH=$JAVA_HOME/bin:$PATH ./gradlew build -Pelasticsearch.version=8.15.2 + JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64 PATH=$JAVA_HOME/bin:$PATH ./gradlew build -Pelasticsearch.version=$ES_VERSION ## Testing - gradle test --info --tests "Croatian*" +Unit tests: + + ./gradlew test --info --tests "Croatian*" + +Performance tests: + + ./gradlew performanceTest ## Install @@ -35,9 +41,9 @@ in the [release](https://github.com/monitora-media/es-utils/releases/latest). Analysis filter that converts to lowercase but keeps the originally-cased token in the stream as well. 
-### Czech and Slovak stemmers +### Czech, Slovak, Croatian and Slovenian stemmers -Specialized stemmers +Specialized stemmers for Slavic languages ### Example index settings @@ -55,6 +61,12 @@ Specialized stemmers "mslovak_stem": { "type": "monitora_slovak_stem", "with_asciifold": "true" + }, + "mslovenian_stem": { + "type": "monitora_slovenian_stem" + }, + "mcroatian_stem": { + "type": "monitora_croatian_stem" } }, "analyzer": { diff --git a/build.gradle b/build.gradle index 1c97f62..a77a76c 100644 --- a/build.gradle +++ b/build.gradle @@ -18,8 +18,9 @@ repositories { } dependencies { - testImplementation(platform('org.junit:junit-bom:5.9.2')) + testImplementation(platform('org.junit:junit-bom:5.11.4')) testImplementation('org.junit.jupiter:junit-jupiter') + testRuntimeOnly('org.junit.platform:junit-platform-launcher') } test { @@ -31,8 +32,33 @@ test { } testLogging { events "passed", "skipped", "failed" + showStandardStreams = true + } + useJUnitPlatform { + excludeTags 'performance' + } +} + +// Separate task for running performance tests +task performanceTest(type: Test) { + description = 'Runs performance analysis tests for stemmers' + group = 'verification' + + testClassesDirs = sourceSets.test.output.classesDirs + classpath = sourceSets.test.runtimeClasspath + + filter { + include "**/*PerformanceTest.class" + } + + testLogging { + events "passed", "skipped", "failed" + showStandardStreams = true + } + + useJUnitPlatform { + includeTags 'performance' } - useJUnitPlatform {} } compileJava { @@ -46,8 +72,8 @@ group = "cz.monitora.elasticsearch" version = "${property("plugin.version")}-${property("elasticsearch.version")}" esplugin { - name 'monitora_utils' - description 'Utils for Elasticsearch' - classname 'cz.monitora.elasticsearch.MonitoraESPlugin' - licenseFile rootProject.file('LICENSE') + name = 'monitora_utils' + description = 'Utils for Elasticsearch' + classname = 'cz.monitora.elasticsearch.MonitoraESPlugin' + licenseFile = 
rootProject.file('LICENSE') } diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index f398c33..a67ba05 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-9.0-bin.zip networkTimeout=10000 zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java b/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java index 84a3b8b..4ca3c89 100644 --- a/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java +++ b/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java @@ -4,6 +4,7 @@ import cz.monitora.elasticsearch.analyzer.czech.CzechStemFilterFactory; import cz.monitora.elasticsearch.analyzer.lowercase.LowerCaseTokenFilterFactory; import cz.monitora.elasticsearch.analyzer.slovak.SlovakStemFilterFactory; +import cz.monitora.elasticsearch.analyzer.slovenian.SlovenianStemFilterFactory; import java.util.HashMap; import java.util.Map; import org.elasticsearch.index.analysis.TokenFilterFactory; @@ -22,6 +23,7 @@ public Map> getToken extra.put("monitora_czech_stem", CzechStemFilterFactory::new); extra.put("monitora_slovak_stem", SlovakStemFilterFactory::new); extra.put("monitora_croatian_stem", CroatianStemFilterFactory::new); + extra.put("monitora_slovenian_stem", SlovenianStemFilterFactory::new); return extra; } } diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilter.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilter.java new file mode 100644 index 0000000..f1763fa --- /dev/null +++ b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilter.java @@ -0,0 +1,58 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package cz.monitora.elasticsearch.analyzer.slovenian;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link SlovenianStemmer} to stem Slovenian words.
+ *
+ * <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
+ * TokenStream}.
+ *
+ * <p><b>NOTE</b>: Input is expected to be in lowercase, but with diacritical marks
+ *
+ * @see SetKeywordMarkerFilter
+ */
+public final class SlovenianStemFilter extends TokenFilter {
+  private final SlovenianStemmer stemmer = new SlovenianStemmer();
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public SlovenianStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    // Terms flagged as keywords are passed through untouched.
+    if (!keywordAttr.isKeyword()) {
+      // The stemmer only reports the stem length; shortening the term drops the suffix.
+      termAttr.setLength(stemmer.stem(termAttr.buffer(), termAttr.length()));
+    }
+    return true;
+  }
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilterFactory.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilterFactory.java
new file mode 100644
index 0000000..1a4d6d6
--- /dev/null
+++ b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemFilterFactory.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package cz.monitora.elasticsearch.analyzer.slovenian;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+
+/**
+ * Token filter factory that wraps a token stream in a {@link SlovenianStemFilter}.
+ *
+ * <p>The filter takes no options, so nothing is read from the settings in this class.
+ */
+public class SlovenianStemFilterFactory extends AbstractTokenFilterFactory {
+
+  /**
+   * Creates the factory. Only {@code name} and {@code settings} are used (passed to the
+   * superclass); {@code indexSettings} and {@code env} are accepted but ignored.
+   *
+   * @param indexSettings unused
+   * @param env unused
+   * @param name filter name, forwarded to the superclass
+   * @param settings filter settings, forwarded to the superclass
+   */
+  public SlovenianStemFilterFactory(
+      IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    super(name, settings);
+  }
+
+  /**
+   * Wraps {@code input} in a new {@link SlovenianStemFilter}.
+   *
+   * @param input the upstream token stream
+   * @return a new stemming filter reading from {@code input}
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new SlovenianStemFilter(input);
+  }
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmer.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmer.java
new file mode 100644
index 0000000..3bbadb3
--- /dev/null
+++ b/src/main/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmer.java
@@ -0,0 +1,102 @@
+/* MIT License
+ *
+ * Copyright (c) 2025
+ * Port of a Snowball-style Slovenian stemmer (conservative).
+ * Based on community implementations and Snowball design principles.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software ...
+ */
+
+package cz.monitora.elasticsearch.analyzer.slovenian;
+
+import java.text.Normalizer;
+import java.util.Locale;
+import java.util.Arrays;
+import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
+import org.apache.lucene.analysis.CharArraySet;
+
+
+/**
+ * Conservative suffix-stripping stemmer for Slovenian.
+ *
+ * <p>At most one ending is removed per call, and only when the ending lies entirely inside R1
+ * (see {@link #calculateR1}). Input is expected to be lowercase: only the plain vowels a, e, i,
+ * o, u are recognised, every other character counts as a non-vowel.
+ */
+public final class SlovenianStemmer {
+    /** Two-letter endings that are removed completely. */
+    private static final String[] SUFFIXES_2 = {
+        "ih", "im", "om", "am", "em", "ov", "ev", "in", "mi", "eh", "ah"
+    };
+
+    /** Three-letter endings that are removed completely. */
+    private static final String[] SUFFIXES_3 = {
+        "ega", "emu", "ima", "imi", "ami", "oma", "ama", "ove", "ova", "ovs", "ina", "ino", "ini",
+        "ine"
+    };
+
+    /** Three-letter endings of which only the last two letters are removed (the "r" stays). */
+    private static final String[] SUFFIXES_3_KEEP_R = {"rja", "rje", "rju", "rjo"};
+
+    /** Four-letter endings of which only the last three letters are removed (the "r" stays). */
+    private static final String[] SUFFIXES_4_KEEP_R = {"rjem"};
+
+    /**
+     * Computes the stem length for the first {@code len} characters of {@code s}.
+     *
+     * <p>The buffer is not modified; the caller is expected to truncate the term to the returned
+     * length. Candidates are tried longest first: the four-letter group, the three-letter groups,
+     * the two-letter group and finally a single trailing vowel.
+     *
+     * @param s buffer holding the term
+     * @param len number of valid characters in {@code s}
+     * @return the length of the stem, never greater than {@code len}
+     */
+    public int stem(char[] s, int len) {
+        // R1 never starts before index 2, so an ending that fits inside R1 always leaves a stem
+        // of at least two characters. When the word has no R1 (r1 == len) every test below fails
+        // and the word is returned unchanged.
+        final int r1 = calculateR1(s, len);
+
+        if (len - 4 >= r1 && endsWithAny(s, len, SUFFIXES_4_KEEP_R)) {
+            return len - 3;
+        }
+
+        if (len - 3 >= r1) {
+            if (endsWithAny(s, len, SUFFIXES_3)) {
+                return len - 3;
+            }
+            if (endsWithAny(s, len, SUFFIXES_3_KEEP_R)) {
+                return len - 2;
+            }
+        }
+
+        if (len - 2 >= r1 && endsWithAny(s, len, SUFFIXES_2)) {
+            return len - 2;
+        }
+
+        if (len - 1 >= r1 && isVowel(s[len - 1])) {
+            return len - 1;
+        }
+
+        return len;
+    }
+
+    /** Returns true when the first {@code len} characters of {@code s} end with any candidate. */
+    private static boolean endsWithAny(char[] s, int len, String[] candidates) {
+        for (String candidate : candidates) {
+            if (endsWith(s, len, candidate)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Returns the start index of R1: the position right after the first non-vowel that follows a
+     * vowel. Returns {@code len} when there is no such position.
+     */
+    private static int calculateR1(char[] s, int len) {
+        boolean seenVowel = false;
+        for (int i = 0; i < len; i++) {
+            if (isVowel(s[i])) {
+                seenVowel = true;
+            } else if (seenVowel) {
+                return i + 1;
+            }
+        }
+        return len;
+    }
+
+    /**
+     * Conservative vowel test: only lowercase a, e, i, o, u count. Syllabic 'r' is not treated as
+     * a vowel here, and č/š/ž are consonants.
+     */
+    private static boolean isVowel(char c) {
+        return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
+    }
+}
diff --git a/src/test/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmerPerformanceTest.java b/src/test/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmerPerformanceTest.java
new file mode 100644
index 0000000..5ffd9f9
--- /dev/null
+++ b/src/test/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmerPerformanceTest.java
@@ -0,0 +1,268 @@
+package cz.monitora.elasticsearch.analyzer.slovenian;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Performance analysis test for Slovenian stemmer.
+ * This test evaluates the stemmer's precision across comprehensive datasets
+ * and provides detailed failure reports.
+ *
+ * Evaluation metric: All inflected forms within a group should stem to the same string.
+ * The stemmed result does not have to match the base form (Map key), it just needs to be
+ * consistent across all forms in the Set. Success = all forms produce identical stem.
+ *
+ * Run manually (not part of regular test suite) to analyze stemmer quality.
+ * Use: ./gradlew performanceTest
+ */
+public class SlovenianStemmerPerformanceTest {
+    /** Outcome of running the stemmer over one dataset. */
+    static class DatasetResult {
+        final String name;
+        final int totalEntries;
+        final int successfulGroupings;
+        final List<FailureReport> failures;
+
+        DatasetResult(String name, int totalEntries, int successfulGroupings, List<FailureReport> failures) {
+            this.name = name;
+            this.totalEntries = totalEntries;
+            this.successfulGroupings = successfulGroupings;
+            this.failures = failures;
+        }
+
+        /** Share of groups whose forms all produced the same stem, in percent (0 when empty). */
+        double getPrecision() {
+            return totalEntries > 0 ? (double) successfulGroupings / totalEntries * 100 : 0;
+        }
+    }
+
+    /** One group whose inflected forms produced more than one distinct stem. */
+    static class FailureReport {
+        final String expectedBase;
+        final Set<String> uniqueStems;
+        final Map<String, String> formToStem;
+
+        FailureReport(String expectedBase, Set<String> uniqueStems, Map<String, String> formToStem) {
+            this.expectedBase = expectedBase;
+            this.uniqueStems = uniqueStems;
+            this.formToStem = formToStem;
+        }
+    }
+
+    public static final Map<String, Set<String>> slovenian_keywords_dataset = Map.ofEntries(
+        // --- Companies & Brands ---
+        Map.entry("nomago", Set.of("nomago", "nomaga", "nomagu", "nomagom")),
+        Map.entry("nomago bikes", Set.of("nomago bikes", "nomaga bikes", "nomagu bikes")),
+        Map.entry("Nomago travel", Set.of("nomago travel", "nomaga travel", "nomagu travel")),
+        Map.entry("nova ljubljanska banka", Set.of("nove ljubljanske banke", "novi ljubljanski banki", "novo ljubljansko banko")),
+        Map.entry("halcom ca", Set.of("halcom ca", "halcoma ca", "halcomu ca", "halcomom ca")),
+        Map.entry("telekom slovenija", Set.of("telekom slovenije", "telekoma slovenije", "telekomu slovenije", "telekomom slovenije")),
+        Map.entry("lidl slovenija", Set.of("lidl slovenija", "lidla slovenija", "lidlu slovenija", "lidlom slovenija")),
+        Map.entry("skupina triglav", Set.of("skupine triglav", "skupini triglav", "skupino triglav")),
+        Map.entry("triglav lab", Set.of("triglav lab", "triglav laba", "triglav labu", "triglav labom")),
+        Map.entry("skupina gen", Set.of("skupine gen", "skupini gen", "skupino gen")),
+        Map.entry("agencija taktik", Set.of("agencija taktik", "agencije taktik", "agenciji taktik", "agencijo taktik")),
+
+        // --- Personal Names (Male) ---
+        Map.entry("Miha tavčar", Set.of("miha tavčar", "mihe tavčarja", "mihi tavčarju", "miho tavčarjem")),
+        Map.entry("Blaž Brodnjak", Set.of("blaž brodnjak", "blaža brodnjaka", "blažu brodnjaku", "blažem brodnjakom")),
+        Map.entry("Archibald Kremser", Set.of("archibald kremser", "archibalda kremserja", "archibaldu kremserju", "archibaldom kremserjem")),
+        Map.entry("Andreas Burkhardt", Set.of("andreas burkhardt", "andreasa burkhardta", "andreasu burkhardtu", "andreasom burkhardtom")),
+        Map.entry("Andrej Lasič", Set.of("andrej lasič", "andreja lasiča", "andreju lasiču", "andrejem lasičem")),
+        Map.entry("Antonio Argir", Set.of("antonio argir", "antonia argirja", "antoniu argirju", "antoniem argirjem")),
+        Map.entry("Reinhard Höll", Set.of("reinhard höll", "reinharda hölla", "reinhardu höllu", "reinhardom höllom")),
+        Map.entry("gregor pelhan", Set.of("gregor pelhan", "gregorja pelhana", "gregorju pelhanu", "gregorjem pelhanom")),
+        Map.entry("Aleksandar Spremić", Set.of("aleksandar spremić", "aleksandra spremića", "aleksandru spremiću", "aleksandrom spremićem")),
+        Map.entry("tadej pogačar", Set.of("tadej pogačar", "tadeja pogačarja", "tadeju pogačarju", "tadejem pogačarjem")),
+        Map.entry("luka dončič", Set.of("luka dončič", "luke dončiča", "luki dončiču", "luko dončičem")),
+
+        // --- Personal Names (Female) ---
+        Map.entry("Hedvika Usenik", Set.of("hedvika usenik", "hedvike usenik", "hedviki usenik", "hedviko usenik")),
+        Map.entry("andreja pongračič", Set.of("andreja pongračič", "andreje pongračič", "andreji pongračič", "andrejo pongračič")),
+        Map.entry("saška rihtaršič", Set.of("saška rihtaršič", "saške rihtaršič", "saški rihtaršič", "saško rihtaršič")),
+        Map.entry("Irena Ilešič Čujovič", Set.of("irena ilešič čujovič", "irene ilešič čujovič", "ireni ilešič čujovič", "ireno ilešič čujovič")),
+
+        // --- Institutions & Political ---
+        Map.entry("socialni demokrati", Set.of("socialni demokrati", "socialnih demokratov", "socialnim demokratom", "socialne demokrate", "socialnimi demokrati")),
+        Map.entry("nova slovenija", Set.of("nova slovenija", "nove slovenije", "novi sloveniji", "novo slovenijo")),
+        Map.entry("slovenska demokratska stranka", Set.of("slovenska demokratska stranka", "slovenske demokratske stranke", "slovenski demokratski stranki", "slovensko demokratsko stranko")),
+        Map.entry("mestna občina kranj", Set.of("mestna občina kranj", "mestne občine kranj", "mestni občini kranj", "mestno občino kranj")),
+        Map.entry("mok", Set.of("mok", "moka", "moku", "mokom")),
+        Map.entry("zavod za zdravstveno zavarovanje", Set.of("zavod za zdravstveno zavarovanje", "zavoda za zdravstveno zavarovanje", "zavodu za zdravstveno zavarovanje", "zavodom za zdravstveno zavarovanje")),
+        Map.entry("Nuklearna elektrarna Krško", Set.of("nuklearna elektrarna krško", "nuklearne elektrarne krško", "nuklearni elektrarni krško", "nuklearno elektrarno krško")),
+        Map.entry("Savske elektrarne", Set.of("savske elektrarne", "savskih elektrarn", "savskim elektrarnam", "savskimi elektrarnami")),
+        Map.entry("Termoelektrarna Brestanica", Set.of("termoelektrarna brestanica", "termoelektrarne brestanica", "termoelektrarni brestanica", "termoelektrarno brestanica")),
+        Map.entry("mestno gledališče ljubljansko", Set.of("mestno gledališče ljubljansko", "mestnega gledališča ljubljanskega", "mestnemu gledališču ljubljanskemu", "mestnim gledališčem ljubljanskim")),
+
+        // --- General Services & Concepts ---
+        Map.entry("avtobusni prevozi", Set.of("avtobusni prevozi", "avtobusnih prevozov", "avtobusnim prevozom", "avtobusne prevoze", "avtobusnimi prevozi")),
+        Map.entry("avtobusne vozovnice", Set.of("avtobusne vozovnice", "avtobusnih vozovnic", "avtobusnim vozovnicam", "avtobusnimi vozovnicami")),
+        Map.entry("osebni kredit", Set.of("osebni kredit", "osebnega kredita", "osebnemu kreditu", "osebni krediti", "osebnih kreditov", "osebnim kreditom")),
+        Map.entry("stanovanjski kredit", Set.of("stanovanjski kredit", "stanovanjskega kredita", "stanovanjskemu kreditu", "stanovanjski krediti", "stanovanjskim kreditom")),
+        Map.entry("kvalificirano digitalno potrdilo", Set.of("kvalificirano digitalno potrdilo", "kvalificiranega digitalnega potrdila", "kvalificiranemu digitalnemu potrdilu", "kvalificiranim digitalnim potrdilom", "kvalificirana digitalna potrdila")),
+        Map.entry("zdravstveno zavarovanje", Set.of("zdravstveno zavarovanje", "zdravstvenega zavarovanja", "zdravstvenemu zavarovanju", "zdravstvenim zavarovanjem")),
+        Map.entry("hemofilia", Set.of("hemofilia", "hemofilije", "hemofiliji", "hemofilijo")),
+        Map.entry("Semaglutid", Set.of("semaglutid", "semaglutida", "semaglutidu", "semaglutidom")),
+        Map.entry("piščancem prijazna reja", Set.of("piščancem prijazna reja", "piščancem prijazne reje", "piščancem prijazni reji", "piščancem prijazno rejo")),
+        Map.entry("italijanska moda", Set.of("italijanska moda", "italijanske mode", "italijanski modi", "italijansko modo")),
+        Map.entry("Mesec italijanske mode", Set.of("mesec italijanske mode", "meseca italijanske mode", "mesecu italijanske mode", "mesecem italijanske mode")),
+
+        // --- Phrases & Local ---
+        Map.entry("podkast iz kranja", Set.of("podkast iz kranja", "podkasta iz kranja", "podkastu iz kranja", "podkastom iz kranja", "podkasti iz kranja")),
+        Map.entry("kranjske novice", Set.of("kranjske novice", "kranjskih novic", "kranjskim novicam", "kranjskimi novicami")),
+        Map.entry("nori na poli", Set.of("nori na poli", "norih na poli", "norim na poli", "nore na poli", "norimi na poli")));
+
+    public static final Map<String, Set<String>> slovenian_common_words_dataset = Map.ofEntries(
+        // -- NOUNS --
+        Map.entry("hiš", Set.of("hiša", "hiše", "hiši", "hišo", "hišah", "hišama")),
+        Map.entry("knjig", Set.of("knjiga", "knjigami")),
+        Map.entry("mest", Set.of("mesto", "mestu")),
+
+        // -- MASCULINE & IRREGULAR --
+        Map.entry("človek", Set.of("človek", "človeka")),
+        Map.entry("ljud", Set.of("ljudje", "ljudi", "ljudmi")),
+
+        // -- ADJECTIVES --
+        Map.entry("dobr", Set.of("dobri", "dobrega", "dobremu")),
+        Map.entry("lepš", Set.of("lepši")),
+
+        // -- POSSESSIVES --
+        Map.entry("očet", Set.of("očetov", "očetova")),
+        Map.entry("mater", Set.of("materin", "materina")),
+
+        // -- VERBS --
+        Map.entry("delal", Set.of("delal", "delala", "delali")),
+
+        // -- SHORT WORDS --
+        Map.entry("most", Set.of("most")),
+        Map.entry("tri", Set.of("tri")),
+        Map.entry("pes", Set.of("pes")),
+        Map.entry("vse", Set.of("vse")),
+
+        // -- NAMES --
+        Map.entry("gregor", Set.of("gregor","gregorja", "gregorju", "gregorjem"))
+    );
+
+    /**
+     * Runs both datasets through the stemmer and prints per-dataset precision, overall precision
+     * and every failing group to standard output. The method makes no assertions.
+     */
+    @Test
+    @Tag("performance")
+    public void analyze_stemmer_performance() {
+        final SlovenianStemmer stemmer = new SlovenianStemmer();
+
+        System.out.println("================================================================================");
+        System.out.println("Slovenian Stemmer Performance Analysis");
+        System.out.println("================================================================================\n");
+
+        Map<String, DatasetResult> datasetResults = new LinkedHashMap<>();
+
+        // Analyze common words dataset
+        DatasetResult commonWordsResult = analyzeDataset(
+            stemmer,
+            "slovenian_common_words_dataset",
+            slovenian_common_words_dataset
+        );
+        datasetResults.put("Common Words", commonWordsResult);
+
+        // Analyze keywords dataset
+        DatasetResult keywordsResult = analyzeDataset(
+            stemmer,
+            "slovenian_keywords_dataset",
+            slovenian_keywords_dataset
+        );
+        datasetResults.put("Keywords", keywordsResult);
+
+        // Print per-dataset results
+        for (Map.Entry<String, DatasetResult> entry : datasetResults.entrySet()) {
+            DatasetResult result = entry.getValue();
+            System.out.printf("Dataset: %s%n", entry.getKey());
+            System.out.printf(" Total entries: %d%n", result.totalEntries);
+            System.out.printf(" Successful groupings: %d%n", result.successfulGroupings);
+            System.out.printf(" Precision: %.1f%%%n%n", result.getPrecision());
+        }
+
+        // Calculate overall precision
+        int totalEntries = datasetResults.values().stream().mapToInt(r -> r.totalEntries).sum();
+        int totalSuccesses = datasetResults.values().stream().mapToInt(r -> r.successfulGroupings).sum();
+        double overallPrecision = totalEntries > 0 ? (double) totalSuccesses / totalEntries * 100 : 0;
+
+        System.out.printf("Overall Precision: %.1f%% (%d/%d)%n%n", overallPrecision, totalSuccesses, totalEntries);
+
+        // Print failures
+        System.out.println("================================================================================");
+        System.out.println("FAILURES");
+        System.out.println("================================================================================\n");
+
+        int failureCount = 0;
+        for (Map.Entry<String, DatasetResult> entry : datasetResults.entrySet()) {
+            if (!entry.getValue().failures.isEmpty()) {
+                System.out.printf("=== %s ===%n%n", entry.getKey());
+                for (FailureReport failure : entry.getValue().failures) {
+                    failureCount++;
+                    System.out.printf("[%d] Expected: \"%s\"%n", failureCount, failure.expectedBase);
+                    System.out.printf(" Got %d different stems: %s%n", failure.uniqueStems.size(), failure.uniqueStems);
+                    System.out.println(" Examples:");
+                    for (Map.Entry<String, String> e : failure.formToStem.entrySet()) {
+                        System.out.printf(" \"%s\" → \"%s\"%n", e.getKey(), e.getValue());
+                    }
+                    System.out.println();
+                }
+            }
+        }
+
+        if (failureCount == 0) {
+            System.out.println("No failures! All word forms stem correctly. ✓");
+        }
+    }
+
+    /**
+     * Stems every form of every group in {@code dataset}. A group counts as successful when all
+     * of its forms yield the same stemmed string; otherwise a {@link FailureReport} is recorded.
+     */
+    private DatasetResult analyzeDataset(
+        SlovenianStemmer stemmer,
+        String datasetName,
+        Map<String, Set<String>> dataset
+    ) {
+        int totalEntries = 0;
+        int successfulGroupings = 0;
+        List<FailureReport> failures = new ArrayList<>();
+
+        for (Map.Entry<String, Set<String>> entry : dataset.entrySet()) {
+            totalEntries++;
+            String expectedBase = entry.getKey();
+            Set<String> inflectedForms = entry.getValue();
+
+            // Stem all forms, track unique stems
+            Set<String> uniqueStems = new HashSet<>();
+            Map<String, String> formToStem = new LinkedHashMap<>();
+
+            for (String form : inflectedForms) {
+                String stemmed = stemTokens(stemmer, form);
+                uniqueStems.add(stemmed);
+                formToStem.put(form, stemmed);
+            }
+
+            if (uniqueStems.size() == 1) {
+                successfulGroupings++;
+            } else {
+                failures.add(new FailureReport(expectedBase, uniqueStems, formToStem));
+            }
+        }
+
+        return new DatasetResult(datasetName, totalEntries, successfulGroupings, failures);
+    }
+
+    /** Splits {@code text} on whitespace, stems each token and rejoins with single spaces. */
+    private String stemTokens(SlovenianStemmer stemmer, String text) {
+        String[] tokens = text.trim().split("\\s+");
+        StringBuilder result = new StringBuilder();
+        for (int i = 0; i < tokens.length; i++) {
+            char[] chars = tokens[i].toCharArray();
+            int newLen = stemmer.stem(chars, chars.length);
+            if (i > 0) result.append(" ");
+            result.append(new String(chars, 0, newLen));
+        }
+        return result.toString();
+    }
+}
diff --git a/src/test/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmerTest.java b/src/test/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmerTest.java
new file mode 100644
index 0000000..80922c9
--- /dev/null
+++ b/src/test/java/cz/monitora/elasticsearch/analyzer/slovenian/SlovenianStemmerTest.java
@@ -0,0 +1,67 @@
+package cz.monitora.elasticsearch.analyzer.slovenian;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.Arrays;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+
+public class SlovenianStemmerTest {
+
+    @ParameterizedTest
+    @CsvSource({
+        // hiš - 
house
+        "hiša, hiš",
+        "hiše, hiš",
+        "hiši, hiš",
+        "hišo, hiš",
+        "hišah, hiš",
+        "hišama, hiš",
+        // knjig - book
+        "knjiga, knjig",
+        "knjigami, knjig",
+        // mest - city
+        "mesto, mest",
+        "mestu, mest",
+        // človek - person/human
+        "človek, človek",
+        "človeka, človek",
+        // ljud - people
+        "ljudje, ljudj",
+        "ljudi, ljud",
+        "ljudmi, ljud",
+        // dobr - good
+        "dobri, dobr",
+        "dobrega, dobr",
+        "dobremu, dobr",
+        // lepš - beautiful/nicer
+        "lepši, lepš",
+        // očet - father's (possessive)
+        "očetov, očet",
+        "očetova, očet",
+        // mater - mother's (possessive)
+        "materin, mater",
+        "materina, mater",
+        // delal - worked
+        "delal, delal",
+        "delala, delal",
+        "delali, delal",
+        // most - bridge
+        "most, most",
+        // tri - three
+        "tri, tri",
+        // pes - dog
+        "pes, pes",
+        // vse - all/everything
+        "vse, vse",
+        // gregor - name Gregor
+        "gregorja, gregor",
+        "gregorju, gregor",
+        "gregorjem, gregor"
+    })
+    public void test_stem(String input, String expectedStem) {
+        // stem() only returns the stem length, so the stem is cut out of the input buffer here.
+        final char[] buffer = input.toCharArray();
+        final int stemLength = new SlovenianStemmer().stem(buffer, buffer.length);
+        final String actualStem = new String(Arrays.copyOfRange(buffer, 0, stemLength));
+        assertEquals(expectedStem, actualStem);
+    }
+}