Skip to content

Commit

Permalink
[BUGS#1248,1251] refactor FindMatches
Browse files Browse the repository at this point in the history
- Split the FindMatches#search method into searchNormal and searchSegmented, and remove the boolean allowSeparateSegmentMatch argument from the constructor.
- Refactor FindMatchesThread#search to call searchNormal then searchSegmented if necessary.
The searchSegmented method skips external TMs.

Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr committed Feb 17, 2024
1 parent c2472af commit 4665a8e
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 95 deletions.
2 changes: 1 addition & 1 deletion config/checkstyle/suppressions.xml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
<!-- util/Preferences -->
<suppress files="Preferences\.java" checks="LineLength" lines="197"/>
<!-- core/stat -->
<suppress checks="(ParameterNumber|MethodLength)" files="FindMatches\.java" lines="164,350,459"/>
<suppress checks="ParameterNumber" files="FindMatches\.java"/>
<!-- util/xml -->
<suppress checks="(EmptyBlock|MethodLength)" files="XMLStreamReader\.java"/>
<!-- util -->
Expand Down
4 changes: 2 additions & 2 deletions src/org/omegat/core/statistics/CalcMatchStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ public class CalcMatchStatistics extends LongProcessThread {
private final ThreadLocal<ISimilarityCalculator> distanceCalculator = ThreadLocal
.withInitial(LevenshteinDistance::new);
private final ThreadLocal<FindMatches> finder = ThreadLocal.withInitial(
() -> new FindMatches(Core.getProject(), OConsts.MAX_NEAR_STRINGS, true, false, false));
() -> new FindMatches(Core.getProject(), OConsts.MAX_NEAR_STRINGS, false, false));
private final StringBuilder textForLog = new StringBuilder();

public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
Expand Down Expand Up @@ -299,7 +299,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
int calcMaxSimilarity(SourceTextEntry ste) {
String srcNoXmlTags = removeXmlTags(ste);
FindMatches localFinder = finder.get();
List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
List<NearString> nears = localFinder.search(srcNoXmlTags, false, false, this::isInterrupted);
final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
int maxSimilarity = 0;
CACHE: for (NearString near : nears) {
Expand Down
202 changes: 111 additions & 91 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
2008 Alex Buloichik
2012 Thomas Cordonnier, Martin Fleurke
2013 Aaron Madlon-Kay, Alex Buloichik
2024 Hiroshi Miura
Home page: https://www.omegat.org/
Support center: https://omegat.org/support
Expand Down Expand Up @@ -42,8 +43,6 @@
import org.omegat.core.data.ExternalTMFactory;
import org.omegat.core.data.ExternalTMX;
import org.omegat.core.data.IProject;
import org.omegat.core.data.IProject.DefaultTranslationsIterator;
import org.omegat.core.data.IProject.MultipleTranslationsIterator;
import org.omegat.core.data.ITMXEntry;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
Expand All @@ -64,31 +63,30 @@

/**
* Class to find matches by specified criteria.
*
* <p>
* Since we can use stemmers to prepare tokens, we should use 3-pass comparison
* of similarity. Similarity will be calculated in 3 steps:
*
* 1. Split original segment into word-only tokens using stemmer (with stop
* words list), then compare tokens.
*
* 2. Split original segment into word-only tokens without stemmer, then compare
* tokens.
*
* 3. Split original segment into not-only-words tokens (including numbers and
* tags) without stemmer, then compare tokens.
*
* This class is not thread safe ! Must be used in the one thread only.
* <ol>
* <li>Split the original segment into word-only tokens using stemmer (with stop
* words list), then compare tokens.</li>
* <li>Split the original segment into word-only tokens without a stemmer, then compare
* tokens.</li>
* <li>Split the original segment into not-only-words tokens (including numbers and
* tags) without a stemmer, then compare tokens.</li>
* </ol>
* This class is not thread safe! An instance must be used from a single thread only.
*
* @author Maxym Mykhalchuk
* @author Alex Buloichik ([email protected])
* @author Martin Fleurke
* @author Aaron Madlon-Kay
* @author Hiroshi Miura
*/
public class FindMatches {

/**
* According to gettext source code, PO fuzzies are created above 60%
* https://sourceforge.net/p/omegat/feature-requests/1258/
* According to gettext source code, PO fuzzy items are created above 60%
* <a href="https://sourceforge.net/p/omegat/feature-requests/1258/">RFE#1258</a>
*/
static final int PENALTY_FOR_FUZZY = 40;
private static final int PENALTY_FOR_REMOVED = 5;
Expand Down Expand Up @@ -127,46 +125,53 @@ public class FindMatches {
/** Tokens for original string, includes numbers and tags. */
private Token[] strTokensAll;

// This finder used for search separate segment matches
private FindMatches separateSegmentMatcher;

private final int fuzzyMatchThreshold;
private int fuzzyMatchThreshold = 0;

private final boolean applyThreshold;

/**
* @param searchExactlyTheSame
* allows to search similarities with the same text as source
* segment. This mode used only for separate sentence match in
* paragraph project, i.e. where source is just part of current
* source.
*/
/** Constructor(deprecated). */
@Deprecated
public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame) {
this(project, maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true);
boolean searchExactlyTheSame, boolean applyThreshold) {
this(project, maxCount, searchExactlyTheSame, applyThreshold);
}

public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame, boolean applyThreshold) {
/**
* Constructor.
* @param project OmegaT project.
* @param maxCount limit the maximum count of the results.
* @param searchExactlyTheSame
* allows searching similarities with the same text as source
* segment. This mode is used only for separate sentence match in
* a paragraph project, i.e. where a source is just part of the
* current source.
* @param applyThreshold
* Cut off the result by a custom threshold. It is useful
* when results are used only for the display.
*/
public FindMatches(IProject project, int maxCount, boolean searchExactlyTheSame, boolean applyThreshold) {
this.project = project;
this.tok = project.getSourceTokenizer();
this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
this.maxCount = maxCount;
this.searchExactlyTheSame = searchExactlyTheSame;
if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
separateSegmentMatcher = new FindMatches(project, 1, false, true);
}
this.fuzzyMatchThreshold = Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD);
this.result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
this.applyThreshold = applyThreshold;
}

public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
IStopped stop) throws StoppedException {
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
return searchNormal(searchText, requiresTranslation, fillSimilarityData, false, stop);
}

private void init(String searchText) {
result.clear();
srcText = searchText;
removedText = "";
if (applyThreshold) {
fuzzyMatchThreshold = Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD);
}

// remove part that is to be removed according to user settings.
// Rationale: it might be a big string influencing the 'editing
Expand All @@ -187,10 +192,16 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
strTokensNoStem = tokenizeNoStem(srcText);
strTokensAll = tokenizeAll(srcText);
/* HP: includes non - word tokens */
}

private List<NearString> searchNormal(String searchText, boolean requiresTranslation, boolean isFillSimilarityData,
boolean skipExternal, IStopped stop) throws StoppedException {
init(searchText);

// travel by project entries, including orphaned
if (project.getProjectProperties().isSupportDefaultTranslations()) {
project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {
project.iterateByDefaultTranslations(new IProject.DefaultTranslationsIterator() {
@Override
public void iterate(String source, TMXEntry trans) {
checkStopped(stop);
if (!searchExactlyTheSame && source.equals(searchText)) {
Expand All @@ -207,7 +218,8 @@ public void iterate(String source, TMXEntry trans) {
}
});
}
project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {
project.iterateByMultipleTranslations(new IProject.MultipleTranslationsIterator() {
@Override
public void iterate(EntryKey source, TMXEntry trans) {
checkStopped(stop);
if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
Expand All @@ -224,6 +236,14 @@ public void iterate(EntryKey source, TMXEntry trans) {
}
});

if (!skipExternal) {
travelExternalTMs(requiresTranslation, stop);
}
finish(isFillSimilarityData, stop);
return result;
}

private void travelExternalTMs(boolean requiresTranslation, IStopped stop) {
/*
* Penalty applied for fuzzy matches in another language (if no match in
* the target language was found).
Expand Down Expand Up @@ -259,7 +279,46 @@ public void iterate(EntryKey source, TMXEntry trans) {
tmen.getCreationDate(), tmen.getChanger(), tmen.getChangeDate(), tmen.getProperties());
}
}
}

/**
 * Searches for a match assembled from per-segment matches of a paragraph.
 * <p>
 * The search text is split into segments with the project's segmenter. When
 * it yields more than one segment, each segment is looked up on its own with
 * a dedicated single-result finder that skips external TMs. The sources and
 * translations of the segments whose best stem score reaches
 * {@code SUBSEGMENT_MATCH_THRESHOLD} are glued back together and offered to
 * {@code processEntry} as a single TM entry; segments without a good enough
 * match contribute an empty string.
 *
 * @param searchText
 *            paragraph text to search matches for
 * @param stop
 *            callback used by the per-segment search to check for
 *            interruption
 * @return the result list of this finder; it is empty when the text yields
 *         one segment or less
 * @throws StoppedException
 *             if the search was stopped through {@code stop}
 */
public List<NearString> searchSegmented(String searchText, IStopped stop) throws StoppedException {
    // Finder used only for the individual segments: keeps one result and
    // allows entries whose source equals the segment text.
    FindMatches perSegmentFinder = new FindMatches(project, 1, true, true);
    init(searchText);

    // Split the paragraph even when segmentation is disabled in the project.
    List<StringBuilder> spaces = new ArrayList<>();
    List<Rule> brules = new ArrayList<>();
    Language sourceLang = project.getProjectProperties().getSourceLanguage();
    Language targetLang = project.getProjectProperties().getTargetLanguage();
    List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);

    // Nothing to assemble unless the text really consists of several segments.
    if (segments.size() <= 1) {
        return result;
    }

    int segmentCount = segments.size();
    List<String> matchedSources = new ArrayList<>(segmentCount);
    List<String> matchedTranslations = new ArrayList<>(segmentCount);
    for (int idx = 0; idx < segmentCount; idx++) {
        String segment = segments.get(idx);
        // Best match for this single segment, external TMs skipped.
        List<NearString> candidates = perSegmentFinder.searchNormal(segment, true, false, true, stop);
        NearString best = candidates.isEmpty() ? null : candidates.get(0);
        boolean goodEnough = best != null && best.scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD;
        matchedSources.add(goodEnough ? best.source : "");
        matchedTranslations.add(goodEnough ? best.translation : "");
    }

    // Glue the found sources, then the found translations, back into paragraphs.
    String gluedSource = Core.getSegmenter().glue(sourceLang, sourceLang, matchedSources, spaces, brules);
    String gluedTranslation = Core.getSegmenter().glue(sourceLang, targetLang, matchedTranslations, spaces,
            brules);
    processEntry(null, gluedSource, gluedTranslation, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "",
            0, null);
    return result;
}

private void finish(boolean fillSimilarityData, IStopped stop) {
// travel by all entries for check source file translations
for (SourceTextEntry ste : project.getAllEntries()) {
checkStopped(stop);
Expand All @@ -269,58 +328,19 @@ public void iterate(EntryKey source, TMXEntry trans) {
"", 0, "", 0, null);
}
}

if (separateSegmentMatcher != null) {
// split paragraph even when segmentation disabled, then find
// matches for every segment
List<StringBuilder> spaces = new ArrayList<StringBuilder>();
List<Rule> brules = new ArrayList<Rule>();
Language sourceLang = project.getProjectProperties().getSourceLanguage();
Language targetLang = project.getProjectProperties().getTargetLanguage();
List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
if (segments.size() > 1) {
List<String> fsrc = new ArrayList<String>(segments.size());
List<String> ftrans = new ArrayList<String>(segments.size());
// multiple segments
for (short i = 0; i < segments.size(); i++) {
String onesrc = segments.get(i);

// find match for separate segment
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
false, stop);
if (!segmentMatch.isEmpty()
&& segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
fsrc.add(segmentMatch.get(0).source);
ftrans.add(segmentMatch.get(0).translation);
} else {
fsrc.add("");
ftrans.add("");
}
}
// glue found sources
String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
// glue found translations
String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "",
0, null);
}
}

if (fillSimilarityData) {
// fill similarity data only for result
// fill similarity data only for a result.
for (NearString near : result) {
// fix for bug 1586397
byte[] similarityData = FuzzyMatcher.buildSimilarityData(strTokensAll,
tokenizeAll(near.source));
near.attr = similarityData;
}
}

return result;
}

/**
* Compare one entry with original entry.
* Compare one entry with the original entry.
*
* @param key
* entry to compare
Expand Down Expand Up @@ -381,7 +401,7 @@ protected void processEntry(EntryKey key, String source, String translation,
similarityStem -= realPenaltyForRemoved;

// check if we have chance by first percentage only
if (!haveChanceToAdd(similarityStem, Integer.MAX_VALUE, Integer.MAX_VALUE)) {
if (noChanceToAdd(similarityStem, Integer.MAX_VALUE, Integer.MAX_VALUE)) {
return;
}

Expand All @@ -396,7 +416,7 @@ protected void processEntry(EntryKey key, String source, String translation,
similarityNoStem -= realPenaltyForRemoved;

// check if we have chance by first and second percentages
if (!haveChanceToAdd(similarityStem, similarityNoStem, Integer.MAX_VALUE)) {
if (noChanceToAdd(similarityStem, similarityNoStem, Integer.MAX_VALUE)) {
return;
}

Expand All @@ -411,7 +431,7 @@ protected void processEntry(EntryKey key, String source, String translation,
simAdjusted -= realPenaltyForRemoved;

// check if we have chance by first, second and third percentages
if (!haveChanceToAdd(similarityStem, similarityNoStem, simAdjusted)) {
if (noChanceToAdd(similarityStem, similarityNoStem, simAdjusted)) {
return;
}

Expand All @@ -437,9 +457,9 @@ protected void processEntry(EntryKey key, String source, String translation,
* exactly similarity
* @return true if there is no chance to add the entry to the result
*/
protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final int simExactly) {
private boolean noChanceToAdd(int simStem, int simNoStem, int simExactly) {
if (result.size() < maxCount) {
return true;
return false;
}
NearString st = result.get(result.size() - 1);
int chance = Integer.compare(st.scores[0].score, simStem);
Expand All @@ -449,7 +469,7 @@ protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final
if (chance == 0) {
chance = Integer.compare(st.scores[0].adjustedScore, simExactly);
}
return chance != 1;
return chance == 1;
}

/**
Expand Down Expand Up @@ -508,9 +528,9 @@ protected void addNearString(final EntryKey key, final String source, final Stri
/*
* Methods for tokenize strings with caching.
*/
Map<String, Token[]> tokenizeStemCache = new HashMap<String, Token[]>();
Map<String, Token[]> tokenizeNoStemCache = new HashMap<String, Token[]>();
Map<String, Token[]> tokenizeAllCache = new HashMap<String, Token[]>();
Map<String, Token[]> tokenizeStemCache = new HashMap<>();
Map<String, Token[]> tokenizeNoStemCache = new HashMap<>();
Map<String, Token[]> tokenizeAllCache = new HashMap<>();

public Token[] tokenizeStem(String str) {
Token[] tokens = tokenizeStemCache.get(str);
Expand Down Expand Up @@ -552,8 +572,8 @@ protected void checkStopped(IStopped stop) throws StoppedException {
}

/**
* Process will throw this exception if it stopped.All callers must catch it
* and just skip.
* The Process will throw this exception if it stopped. All callers must
* catch it and just skip.
*/
@SuppressWarnings("serial")
public static class StoppedException extends RuntimeException {
Expand Down
Loading

0 comments on commit 4665a8e

Please sign in to comment.