[BUGS#1251] refactor: FindMatches

- add internal search method to handle normal and segmented search conditions, also use for test purpose - drop threshold arguments for CalcMatchStatistics usage - FindMatchesThread.finderSearch to take threshold argument for testing. - update search() callers accordingly. Signed-off-by: Hiroshi Miura <[email protected]>
omegat-org · Dec 18, 2024 · 8779852 · 8779852
1 parent 6d6aad4
commit 8779852
Show file tree

Hide file tree

Showing 6 changed files with 152 additions and 99 deletions.
diff --git a/src/org/omegat/core/statistics/CalcMatchStatistics.java b/src/org/omegat/core/statistics/CalcMatchStatistics.java
@@ -51,7 +51,6 @@
 import org.omegat.core.threads.LongProcessThread;
 import org.omegat.util.OConsts;
 import org.omegat.util.OStrings;
-import org.omegat.util.Preferences;
 import org.omegat.util.StringUtil;
 import org.omegat.util.Token;
 import org.omegat.util.gui.TextUtil;
@@ -108,19 +107,15 @@ public class CalcMatchStatistics extends LongProcessThread {
     private final IProject project;
 
     public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
-        this(Core.getProject(), Core.getSegmenter(), callback, perFile,
-                Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
-                OConsts.FUZZY_MATCH_THRESHOLD));
+        this(Core.getProject(), Core.getSegmenter(), callback, perFile);
     }
 
-    public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback,
-                               boolean perFile, int threshold) {
+    public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, boolean perFile) {
         this.project = project;
         this.callback = callback;
         this.perFile = perFile;
         finder = ThreadLocal.withInitial(
-                () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true,
-                        false, false, threshold));
+                () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, -1));
     }
 
     @Override
@@ -313,7 +308,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
     int calcMaxSimilarity(SourceTextEntry ste) {
         String srcNoXmlTags = removeXmlTags(ste);
         FindMatches localFinder = finder.get();
-        List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
+        List<NearString> nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted);
         final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
         int maxSimilarity = 0;
         CACHE: for (NearString near : nears) {

diff --git a/src/org/omegat/core/statistics/FindMatches.java b/src/org/omegat/core/statistics/FindMatches.java
@@ -63,20 +63,19 @@
 
 /**
  * Class to find matches by specified criteria.
- *
+ * <p>
  * Since we can use stemmers to prepare tokens, we should use 3-pass comparison
  * of similarity. Similarity will be calculated in 3 steps:
- *
- * 1. Split original segment into word-only tokens using stemmer (with stop
- * words list), then compare tokens.
- *
- * 2. Split original segment into word-only tokens without stemmer, then compare
- * tokens.
- *
- * 3. Split original segment into not-only-words tokens (including numbers and
- * tags) without stemmer, then compare tokens.
- *
- * This class is not thread safe ! Must be used in the one thread only.
+ * <ol>
+ * <li>Split the original segment into word-only tokens using stemmer (with stop
+ * words list), then compare tokens.</li>
+ * <li>Split the original segment into word-only tokens without a stemmer,
+ * then compare tokens.</li>
+ * <li>Split the original segment into not-only-words tokens (including numbers
+ * and tags) without a stemmer, then compare tokens.</li>
+ * </ol>
+ * <p>
+ * This class is not thread-safe! An instance must be used from one thread only.
  *
  * @author Maxym Mykhalchuk
  * @author Alex Buloichik ([email protected])
@@ -127,46 +126,92 @@ public class FindMatches {
     /** Tokens for original string, includes numbers and tags. */
     private Token[] strTokensAll;
 
-    // This finder used for search separate segment matches
-    private FindMatches separateSegmentMatcher;
-
     private final int fuzzyMatchThreshold;
 
-    private final boolean applyThreshold;
-
     private final Segmenter segmenter;
 
-    /**
-     * @param searchExactlyTheSame
-     *            allows to search similarities with the same text as source
-     *            segment. This mode used only for separate sentence match in
-     *            paragraph project, i.e. where source is just part of current
-     *            source.
-     */
+    @Deprecated(since = "6.1.0")
     public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
             boolean searchExactlyTheSame) {
-        this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true,
-                Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
-                        OConsts.FUZZY_MATCH_THRESHOLD));
+        this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault(
+                Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
     }
 
-    public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch,
-            boolean searchExactlyTheSame, boolean applyThreshold, int threshold) {
+    /**
+     * Creates a finder that searches translation memories for fuzzy matches.
+     *
+     * @param project
+     *            OmegaT project.
+     * @param segmenter
+     *            used when running a segmentation search.
+     * @param maxCount
+     *            limit the maximum count of the results.
+     * @param searchExactlyTheSame
+     *            allows searching similarities with the same text as a source
+     *            segment. This mode is used only for separate sentence match
+     *            in a paragraph project, i.e., where a source is just part of
+     *            the current source.
+     * @param threshold
+     *            minimum similarity score for a match; 0 or less disables filtering.
+     */
+    public FindMatches(IProject project, Segmenter segmenter, int maxCount,
+            boolean searchExactlyTheSame, int threshold) {
         this.project = project;
         this.segmenter = segmenter;
         this.tok = project.getSourceTokenizer();
         this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
         this.maxCount = maxCount;
         this.searchExactlyTheSame = searchExactlyTheSame;
-        if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
-            separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold);
-        }
         this.fuzzyMatchThreshold = threshold;
-        this.applyThreshold = applyThreshold;
     }
 
-    public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
-            IStopped stop) throws StoppedException {
+    @Deprecated(since = "6.1.0")
+    public List<NearString> search(final String searchText, final boolean requiresTranslation,
+            final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
+        return search(searchText, fillSimilarityData, stop);
+    }
+
+    /**
+     * Search Translation memories.
+     *
+     * @param searchText
+     *        source text to find matches for.
+     * @param fillSimilarityData
+     *        fill similarity data into the result of NearString objects.
+     * @param stop
+     *        IStopped callback object to indicate cancel operation.
+     * @return
+     *        List of NearString objects, which hold matched translation entry.
+     * @throws StoppedException
+     *        raised when stopped during a search process.
+     */
+    public List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop)
+            throws StoppedException {
+        return search(searchText, fillSimilarityData, stop,
+                !project.getProjectProperties().isSentenceSegmentingEnabled());
+    }
+
+    /**
+     * Search Translation memories.
+     * <p>
+     * Internal method to handle search conditions.
+     * It is accessible as package-private for testing.
+     *
+     * @param searchText
+     *        source text to find matches for.
+     * @param fillSimilarityData
+     *        fill similarity data into the result of NearString objects.
+     * @param stop
+     *        IStopped callback object to indicate cancel operation.
+     * @param runSeparateSegmentMatch
+     *        when true, also split the text into segments and search each one.
+     * @return
+     *        List of NearString objects.
+     * @throws StoppedException
+     *        when the search process is stopped.
+     */
+    List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop,
+                            boolean runSeparateSegmentMatch) throws StoppedException {
         result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
         srcText = searchText;
         removedText = "";
@@ -196,7 +241,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
                     // skip original==original entry comparison
                     return;
                 }
-                if (requiresTranslation && trans.translation == null) {
+                if (trans.translation == null) {
                     return;
                 }
                 String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
@@ -211,7 +256,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
                 // skip original==original entry comparison
                 return;
             }
-            if (requiresTranslation && trans.translation == null) {
+            if (trans.translation == null) {
                 return;
             }
             String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
@@ -225,7 +270,6 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
          */
         int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES,
                 Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT);
-        // travel by translation memories
         for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
             int penalty = 0;
             Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
@@ -235,11 +279,11 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
             for (ITMXEntry tmen : en.getValue().getEntries()) {
                 checkStopped(stop);
                 if (tmen.getSourceText() == null) {
-                    // Not all TMX entries have a source; in that case there can
-                    // be no meaningful match, so skip.
+                    // Not all TMX entries have a source; skip such
+                    // entries because no meaningful match is possible.
                     continue;
                 }
-                if (requiresTranslation && tmen.getTranslationText() == null) {
+                if (tmen.getTranslationText() == null) {
                     continue;
                 }
                 int tmenPenalty = penalty;
@@ -260,7 +304,9 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
                         ste.isSourceTranslationFuzzy(), 0);
             }
         }
-        if (separateSegmentMatcher != null) {
+        if (runSeparateSegmentMatch) {
+            FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true,
+                    fuzzyMatchThreshold);
             // split paragraph even when segmentation disabled, then find
             // matches for every segment
             List<StringBuilder> spaces = new ArrayList<>();
@@ -273,9 +319,10 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
                 List<String> ftrans = new ArrayList<>(segments.size());
                 // multiple segments
                 for (String onesrc : segments) {
-                    // find match for a separate segment
-                    List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
-                            false, stop);
+                    // find match for a separate segment.
+                    // WARN: the 4th argument (runSeparateSegmentMatch)
+                    // must be `false` to avoid infinite recursion.
+                    List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, false, stop, false);
                     if (!segmentMatch.isEmpty()
                             && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
                         fsrc.add(segmentMatch.get(0).source);
@@ -385,7 +432,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
         }
 
         // BUGS#1236 - stat display does not use threshold config check
-        if (applyThreshold && similarityStem < fuzzyMatchThreshold
+        if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold
                 && similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) {
             return;
         }

diff --git a/src/org/omegat/gui/matches/FindMatchesThread.java b/src/org/omegat/gui/matches/FindMatchesThread.java
@@ -33,13 +33,16 @@
 import java.util.List;
 import java.util.logging.Logger;
 
+import org.omegat.core.Core;
 import org.omegat.core.data.IProject;
 import org.omegat.core.data.SourceTextEntry;
 import org.omegat.core.events.IStopped;
 import org.omegat.core.matching.NearString;
+import org.omegat.core.segmentation.Segmenter;
 import org.omegat.core.statistics.FindMatches;
 import org.omegat.gui.common.EntryInfoSearchThread;
 import org.omegat.util.OConsts;
+import org.omegat.util.Preferences;
 
 /**
  * Find matches in separate thread then show a result in the matches' pane.
@@ -82,7 +85,9 @@ protected List<NearString> search() throws Exception {
         long before = System.currentTimeMillis();
 
         try {
-            List<NearString> result = finderSearch(project, processedEntry.getSrcText(), this::isEntryChanged);
+            List<NearString> result = finderSearch(project, Core.getSegmenter(), processedEntry.getSrcText(),
+                    this::isEntryChanged, Preferences.getPreferenceDefault(
+                            Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
             LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before));
             return result;
         } catch (FindMatches.StoppedException ex) {
@@ -97,8 +102,9 @@ protected List<NearString> search() throws Exception {
      * @param isEntryChanged stop and raise StopException when it returns true.
      * @return result as a list of NearString.
      */
-    protected static List<NearString> finderSearch(IProject project, String srcText, IStopped isEntryChanged) {
-        FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
-        return finder.search(srcText, true, true, isEntryChanged);
+    protected static List<NearString> finderSearch(IProject project, Segmenter segmenter, String srcText,
+                                                   IStopped isEntryChanged, int threshold) {
+        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold);
+        return finder.search(srcText, true, isEntryChanged);
     }
 }
diff --git a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java
@@ -25,6 +25,7 @@
 
 package org.omegat.core.statistics;
 
+import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
@@ -35,6 +36,7 @@
 
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.omegat.core.Core;
@@ -48,6 +50,7 @@
 import org.omegat.core.data.ProtectedPart;
 import org.omegat.core.data.SourceTextEntry;
 import org.omegat.core.data.TMXEntry;
+import org.omegat.core.segmentation.SRX;
 import org.omegat.core.segmentation.Segmenter;
 import org.omegat.filters2.FilterContext;
 import org.omegat.filters2.IFilter;
@@ -64,13 +67,17 @@
 
 public class CalcMatchStatisticsTest {
 
+    @BeforeClass
+    public static void setup() throws IOException {
+        TestPreferencesInitializer.init();
+    }
+
     @Test
     public void testCalcMatchStatics() throws Exception {
         TestProject project = new TestProject(new ProjectPropertiesTest());
         IStatsConsumer callback = new TestStatsConsumer();
-        Segmenter segmenter = new Segmenter(Preferences.getSRX());
-        CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter,
-                callback, 30);
+        Segmenter segmenter = new Segmenter(SRX.getDefault());
+        CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback);
         calcMatchStatistics.start();
         try {
             calcMatchStatistics.join();
@@ -123,7 +130,8 @@ public void testCalcMatchStatics() throws Exception {
         Assert.assertEquals("5699", result[7][4]);
 
         // change threshold
-        calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70);
+        Preferences.setPreference(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, 70);
+        calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback);
         calcMatchStatistics.start();
         try {
             calcMatchStatistics.join();
@@ -362,9 +370,8 @@ static class CalcMatchStatisticsMock extends CalcMatchStatistics {
         private MatchStatCounts result;
         private final IStatsConsumer callback;
 
-        CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback,
-                                int threshold) {
-            super(project, segmenter, callback, false, threshold);
+        CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback) {
+            super(project, segmenter, callback, false);
             this.project = project;
             this.callback = callback;
         }