Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: FindMatches class #1222

Merged
merged 5 commits into from
Dec 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/org/omegat/core/statistics/CalcMatchStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ public class CalcMatchStatistics extends LongProcessThread {
public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
this(Core.getProject(), Core.getSegmenter(), callback, perFile,
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD));
OConsts.FUZZY_MATCH_THRESHOLD));
}

public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback,
Expand All @@ -119,8 +119,7 @@ public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer
this.callback = callback;
this.perFile = perFile;
finder = ThreadLocal.withInitial(
() -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true,
false, false, threshold));
() -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold));
}

@Override
Expand Down Expand Up @@ -313,7 +312,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
int calcMaxSimilarity(SourceTextEntry ste) {
String srcNoXmlTags = removeXmlTags(ste);
FindMatches localFinder = finder.get();
List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
List<NearString> nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted);
final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
int maxSimilarity = 0;
CACHE: for (NearString near : nears) {
Expand Down
85 changes: 51 additions & 34 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -126,27 +126,15 @@ public class FindMatches {
/** Tokens for original string, includes numbers and tags. */
private Token[] strTokensAll;

// This finder used for search separate segment matches
private FindMatches separateSegmentMatcher;

private final int fuzzyMatchThreshold;

private final boolean applyThreshold;

private final Segmenter segmenter;

/**
* @param searchExactlyTheSame
* allows to search similarities with the same text as source
* segment. This mode used only for separate sentence match in
* paragraph project, i.e. where source is just part of current
* source.
*/
@Deprecated(since = "6.1.0")
public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame) {
this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true,
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD));
this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault(
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
}

/**
Expand All @@ -166,19 +154,21 @@ public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentM
* @param threshold
* threshold to use.
*/
public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame, boolean applyThreshold, int threshold) {
public FindMatches(IProject project, Segmenter segmenter, int maxCount,
boolean searchExactlyTheSame, int threshold) {
this.project = project;
this.segmenter = segmenter;
this.tok = project.getSourceTokenizer();
this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
this.maxCount = maxCount;
this.searchExactlyTheSame = searchExactlyTheSame;
if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold);
}
this.fuzzyMatchThreshold = threshold;
this.applyThreshold = applyThreshold;
}

/**
 * Search translation memories (legacy four-argument signature).
 * <p>
 * This overload only delegates to
 * {@link #search(String, boolean, IStopped)}; the
 * {@code requiresTranslation} argument is not forwarded.
 *
 * @param searchText
 *            text to search, passed through to the delegate.
 * @param requiresTranslation
 *            ignored by this overload.
 * @param fillSimilarityData
 *            passed through to the delegate unchanged.
 * @param stop
 *            IStopped callback, passed through to the delegate.
 * @return the list returned by the delegate.
 * @throws StoppedException
 *             propagated from the delegate.
 * @deprecated use {@link #search(String, boolean, IStopped)} instead.
 */
@Deprecated(since = "6.1.0")
public List<NearString> search(final String searchText, final boolean requiresTranslation,
final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
// requiresTranslation is intentionally dropped here.
return search(searchText, fillSimilarityData, stop);
}

/**
Expand All @@ -195,8 +185,33 @@ public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean
* @throws StoppedException
* raised when stopped during a search process.
*/
public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
IStopped stop) throws StoppedException {
public List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop)
        throws StoppedException {
    // The separate-segment pass is requested only when the project does
    // not have sentence segmenting enabled.
    final boolean sentenceSegmenting = project.getProjectProperties().isSentenceSegmentingEnabled();
    return search(searchText, fillSimilarityData, stop, !sentenceSegmenting);
}

/**
* Search Translation memories.
* <p>
* Internal method to handle search conditions.
* It is accessible as package-private for testing.
*
* @param searchText
* target segment or term to search.
* @param fillSimilarityData
* fill similarity data into the result of NearString objects.
* @param stop
* IStopped callback object to indicate cancel operation.
* @param runSeparateSegmentMatch
* when true, also split the text into segments and search matches for each segment.
* @return
* List of NearString objects.
* @throws StoppedException
* when the process is stopped during the search.
*/
List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop,
boolean runSeparateSegmentMatch) throws StoppedException {
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
srcText = searchText;
removedText = "";
Expand Down Expand Up @@ -226,7 +241,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
if (trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
Expand All @@ -241,7 +256,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
if (trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
Expand All @@ -255,7 +270,6 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
*/
int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES,
Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT);
// travel by translation memories
for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
int penalty = 0;
Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
Expand All @@ -265,11 +279,11 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
for (ITMXEntry tmen : en.getValue().getEntries()) {
checkStopped(stop);
if (tmen.getSourceText() == null) {
// Not all TMX entries have a source; in that case there can
// be no meaningful match, so skip.
// Not all TMX entries have a source; skip such entries
// because no meaningful match is possible.
continue;
}
if (requiresTranslation && tmen.getTranslationText() == null) {
if (tmen.getTranslationText() == null) {
continue;
}
int tmenPenalty = penalty;
Expand All @@ -290,7 +304,9 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
ste.isSourceTranslationFuzzy(), 0);
}
}
if (separateSegmentMatcher != null) {
if (runSeparateSegmentMatch) {
FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true,
fuzzyMatchThreshold);
// split paragraph even when segmentation disabled, then find
// matches for every segment
List<StringBuilder> spaces = new ArrayList<>();
Expand All @@ -303,9 +319,10 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
List<String> ftrans = new ArrayList<>(segments.size());
// multiple segments
for (String onesrc : segments) {
// find match for a separate segment
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
false, stop);
// find match for a separate segment.
// WARN: the 4th (last) argument, runSeparateSegmentMatch,
// must be `false` to avoid endless recursion.
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, false, stop, false);
if (!segmentMatch.isEmpty()
&& segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
fsrc.add(segmentMatch.get(0).source);
Expand Down Expand Up @@ -415,7 +432,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
}

// BUGS#1236 - stat display does not use threshold config check
if (applyThreshold && similarityStem < fuzzyMatchThreshold
if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold
&& similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) {
return;
}
Expand Down
32 changes: 26 additions & 6 deletions src/org/omegat/gui/matches/FindMatchesThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
2008 Alex Buloichik
2012 Thomas Cordonnier, Martin Fleurke
2013 Aaron Madlon-Kay
2024 Hiroshi Miura
Home page: https://www.omegat.org/
Support center: https://omegat.org/support
Expand All @@ -32,17 +33,22 @@
import java.util.List;
import java.util.logging.Logger;

import org.omegat.core.Core;
import org.omegat.core.data.IProject;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.NearString;
import org.omegat.core.segmentation.Segmenter;
import org.omegat.core.statistics.FindMatches;
import org.omegat.gui.common.EntryInfoSearchThread;
import org.omegat.util.OConsts;
import org.omegat.util.Preferences;

/**
* Find matches in separate thread then show result in the matches pane.
* Find matches in a separate thread, then show the result in the matches pane.
*
* @author Alex Buloichik ([email protected])
* @author Hiroshi Miura
*/
public class FindMatchesThread extends EntryInfoSearchThread<List<NearString>> {
private static final Logger LOGGER = Logger.getLogger(FindMatchesThread.class.getName());
Expand All @@ -52,9 +58,9 @@ public class FindMatchesThread extends EntryInfoSearchThread<List<NearString>> {

/**
* Entry which is processed currently.
*
* If entry in controller was changed, it means user has moved to another entry, and there is no sense to
* continue.
* <p>
* If entry in controller was changed, it means the user has moved to
* another entry, and there is no sense to continue.
*/
private final SourceTextEntry processedEntry;

Expand All @@ -79,12 +85,26 @@ protected List<NearString> search() throws Exception {
long before = System.currentTimeMillis();

try {
FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
List<NearString> result = finder.search(processedEntry.getSrcText(), true, true, this::isEntryChanged);
List<NearString> result = finderSearch(project, Core.getSegmenter(), processedEntry.getSrcText(),
this::isEntryChanged, Preferences.getPreferenceDefault(
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before));
return result;
} catch (FindMatches.StoppedException ex) {
throw new EntryChangedException();
}
}

/**
 * Run a match lookup for one source text using a freshly created
 * {@link FindMatches} (static so that tests can call it directly).
 *
 * @param project OmegaT project.
 * @param segmenter segmenter handed to the finder.
 * @param srcText source text to look for.
 * @param isEntryChanged stop callback handed to the finder's search; the
 *        calling {@code search()} catches {@code FindMatches.StoppedException}.
 * @param threshold fuzzy match threshold handed to the finder.
 * @return result as a list of NearString.
 */
protected static List<NearString> finderSearch(IProject project, Segmenter segmenter, String srcText,
        IStopped isEntryChanged, int threshold) {
    // Constructor: maxCount = OConsts.MAX_NEAR_STRINGS, searchExactlyTheSame = false.
    // search(...): fillSimilarityData = true.
    return new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold)
            .search(srcText, true, isEntryChanged);
}
}
16 changes: 16 additions & 0 deletions test/data/tmx/penalty-010/segment_1.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx PUBLIC "-//LISA OSCAR:1998//DTD for Translation Memory eXchange//EN" "tmx14.dtd">

<tmx version="1.4">
<header creationtoolversion="0.1" adminlang="en" segtype="paragraph" creationdate="20230930T155211Z" datatype="unknown" srclang="ja" creationtool="txt2tmx" o-tmf="TextEdit"></header>
<body>
<tu>
<tuv xml:lang="fr">
<seg>weird behavior</seg>
</tuv>
<tuv xml:lang="ja">
<seg>地力の搾取と浪費が現われる。(1)</seg>
</tuv>
</tu>
</body>
</tmx>
46 changes: 46 additions & 0 deletions test/data/tmx/test-multiple-entries.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx SYSTEM "tmx14.dtd">
<tmx version="1.4">
<header datatype="plaintext" srclang="en-US" adminlang="EN-US" o-tmf="OmegaT TMX" segtype="sentence"
creationtoolversion="test" creationtool="test"/>
<body>
<!-- Default translations -->
<tu>
<tuv lang="en-US">
<seg>Other</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200523T143256Z">
<seg>Altre</seg>
</tuv>
</tu>
<tu>
<tuv lang="en-US">
<seg>For installation on Linux.</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131725Z" creationid="id" creationdate="20200526T131725Z">
<seg>Per l’installazioni nant’à i sistemi Linux.</seg>
</tuv>
</tu>
<tu>
<tuv lang="en-US">
<seg>For installation on other operating systems (such as FreeBSD and Solaris).</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131840Z" creationid="id"
creationdate="20200526T131840Z">
<seg>Per l’installazioni nant’à d’altri sistemi (cum’è FreeBSD è Solaris).</seg>
</tuv>
</tu>
<!-- Alternative translations -->
<tu>
<prop type="file">website/download.html</prop>
<prop type="prev">For installation on Linux.</prop>
<prop type="next">For installation on other operating systems (such as FreeBSD and Solaris).&lt;br0/></prop>
<tuv lang="en-US">
<seg>Other</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131742Z" creationid="id" creationdate="20200526T131742Z">
<seg>Altri</seg>
</tuv>
</tu>
</body>
</tmx>
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.segmentation.SRX;
import org.omegat.core.segmentation.Segmenter;
import org.omegat.filters2.FilterContext;
import org.omegat.filters2.IFilter;
Expand All @@ -68,7 +69,7 @@ public class CalcMatchStatisticsTest {
public void testCalcMatchStatics() throws Exception {
TestProject project = new TestProject(new ProjectPropertiesTest());
IStatsConsumer callback = new TestStatsConsumer();
Segmenter segmenter = new Segmenter(Preferences.getSRX());
Segmenter segmenter = new Segmenter(SRX.getDefault());
CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter,
callback, 30);
calcMatchStatistics.start();
Expand Down Expand Up @@ -123,7 +124,7 @@ public void testCalcMatchStatics() throws Exception {
Assert.assertEquals("5699", result[7][4]);

// change threshold
calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70);
calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, -1);
calcMatchStatistics.start();
try {
calcMatchStatistics.join();
Expand Down
Loading
Loading