-
-
Notifications
You must be signed in to change notification settings - Fork 114
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- add internal search method to handle normal and segmented search conditions, also use for test purpose - drop threshold arguments for CalcMatchStatistics usage - FindMatchesThread.finderSearch to take threshold argument for testing. - update search() callers accordingly. Signed-off-by: Hiroshi Miura <[email protected]>
- Loading branch information
Showing
6 changed files
with
152 additions
and
99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,20 +63,19 @@ | |
|
||
/** | ||
* Class to find matches by specified criteria. | ||
* | ||
* <p> | ||
* Since we can use stemmers to prepare tokens, we should use 3-pass comparison | ||
* of similarity. Similarity will be calculated in 3 steps: | ||
* | ||
* 1. Split original segment into word-only tokens using stemmer (with stop | ||
* words list), then compare tokens. | ||
* | ||
* 2. Split original segment into word-only tokens without stemmer, then compare | ||
* tokens. | ||
* | ||
* 3. Split original segment into not-only-words tokens (including numbers and | ||
* tags) without stemmer, then compare tokens. | ||
* | ||
* This class is not thread safe ! Must be used in the one thread only. | ||
* <ol> | ||
* <li>Split the original segment into word-only tokens using stemmer (with stop | ||
* words list), then compare tokens.</li> | ||
* <li>Split the original segment into word-only tokens without a stemmer, | ||
* then compare tokens.</li> | ||
* <li>Split the original segment into not-only-words tokens (including numbers | ||
* and tags) without a stemmer, then compare tokens.</li> | ||
* </ol> | ||
* <p> | ||
* This class is not thread-safe! It must be used from a single thread only. | ||
* | ||
* @author Maxym Mykhalchuk | ||
* @author Alex Buloichik ([email protected]) | ||
|
@@ -127,46 +126,92 @@ public class FindMatches { | |
/** Tokens for original string, includes numbers and tags. */ | ||
private Token[] strTokensAll; | ||
|
||
// This finder used for search separate segment matches | ||
private FindMatches separateSegmentMatcher; | ||
|
||
private final int fuzzyMatchThreshold; | ||
|
||
private final boolean applyThreshold; | ||
|
||
private final Segmenter segmenter; | ||
|
||
/** | ||
* @param searchExactlyTheSame | ||
* allows to search similarities with the same text as source | ||
* segment. This mode used only for separate sentence match in | ||
* paragraph project, i.e. where source is just part of current | ||
* source. | ||
*/ | ||
@Deprecated(since = "6.1.0") | ||
public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch, | ||
boolean searchExactlyTheSame) { | ||
this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true, | ||
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, | ||
OConsts.FUZZY_MATCH_THRESHOLD)); | ||
this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault( | ||
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD)); | ||
} | ||
|
||
public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch, | ||
boolean searchExactlyTheSame, boolean applyThreshold, int threshold) { | ||
/** | ||
* Creates a FindMatches instance for finding fuzzy matches in translation memories. | ||
* | ||
* @param project | ||
* OmegaT project. | ||
* @param segmenter | ||
* used when running a segmentation search. | ||
* @param maxCount | ||
* limit the maximum count of the results. | ||
* @param searchExactlyTheSame | ||
* allows searching similarities with the same text as a source | ||
* segment. This mode is used only for separate sentence match | ||
* in a paragraph project, i.e., where a source is just part of | ||
* the current source. | ||
* @param threshold | ||
* threshold to use. | ||
*/ | ||
public FindMatches(IProject project, Segmenter segmenter, int maxCount, | ||
boolean searchExactlyTheSame, int threshold) { | ||
this.project = project; | ||
this.segmenter = segmenter; | ||
this.tok = project.getSourceTokenizer(); | ||
this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale(); | ||
this.maxCount = maxCount; | ||
this.searchExactlyTheSame = searchExactlyTheSame; | ||
if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) { | ||
separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold); | ||
} | ||
this.fuzzyMatchThreshold = threshold; | ||
this.applyThreshold = applyThreshold; | ||
} | ||
|
||
public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData, | ||
IStopped stop) throws StoppedException { | ||
@Deprecated(since = "6.1.0") | ||
public List<NearString> search(final String searchText, final boolean requiresTranslation, | ||
final boolean fillSimilarityData, final IStopped stop) throws StoppedException { | ||
return search(searchText, fillSimilarityData, stop); | ||
} | ||
|
||
/** | ||
* Search Translation memories. | ||
* | ||
* @param searchText | ||
* target segment or term to search. | ||
* @param fillSimilarityData | ||
* fill similarity data into the result of NearString objects. | ||
* @param stop | ||
* IStopped callback object to indicate cancel operation. | ||
* @return | ||
* List of NearString objects, which hold matched translation entry. | ||
* @throws StoppedException | ||
* raised when stopped during a search process. | ||
*/ | ||
public List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop) | ||
throws StoppedException { | ||
return search(searchText, fillSimilarityData, stop, | ||
!project.getProjectProperties().isSentenceSegmentingEnabled()); | ||
} | ||
|
||
/** | ||
* Search Translation memories. | ||
* <p> | ||
* Internal method to handle search conditions. | ||
* It is accessible as package-private for testing. | ||
* | ||
* @param searchText | ||
* target segment or term to search. | ||
* @param fillSimilarityData | ||
* fill similarity data into the result of NearString objects. | ||
* @param stop | ||
* IStopped callback object to indicate cancel operation. | ||
* @param runSeparateSegmentMatch | ||
* whether to also search for matches of each separate segment. | ||
* @return | ||
* List of NearString objects. | ||
* @throws StoppedException | ||
* when the process is stopped during a search. | ||
*/ | ||
List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop, | ||
boolean runSeparateSegmentMatch) throws StoppedException { | ||
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1); | ||
srcText = searchText; | ||
removedText = ""; | ||
|
@@ -196,7 +241,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b | |
// skip original==original entry comparison | ||
return; | ||
} | ||
if (requiresTranslation && trans.translation == null) { | ||
if (trans.translation == null) { | ||
return; | ||
} | ||
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null; | ||
|
@@ -211,7 +256,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b | |
// skip original==original entry comparison | ||
return; | ||
} | ||
if (requiresTranslation && trans.translation == null) { | ||
if (trans.translation == null) { | ||
return; | ||
} | ||
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null; | ||
|
@@ -225,7 +270,6 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b | |
*/ | ||
int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES, | ||
Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT); | ||
// travel by translation memories | ||
for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) { | ||
int penalty = 0; | ||
Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey()); | ||
|
@@ -235,11 +279,11 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b | |
for (ITMXEntry tmen : en.getValue().getEntries()) { | ||
checkStopped(stop); | ||
if (tmen.getSourceText() == null) { | ||
// Not all TMX entries have a source; in that case there can | ||
// be no meaningful match, so skip. | ||
// Not all TMX entries have a source; skip such entries | ||
// because no meaningful match is possible. | ||
continue; | ||
} | ||
if (requiresTranslation && tmen.getTranslationText() == null) { | ||
if (tmen.getTranslationText() == null) { | ||
continue; | ||
} | ||
int tmenPenalty = penalty; | ||
|
@@ -260,7 +304,9 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b | |
ste.isSourceTranslationFuzzy(), 0); | ||
} | ||
} | ||
if (separateSegmentMatcher != null) { | ||
if (runSeparateSegmentMatch) { | ||
FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true, | ||
fuzzyMatchThreshold); | ||
// split paragraph even when segmentation disabled, then find | ||
// matches for every segment | ||
List<StringBuilder> spaces = new ArrayList<>(); | ||
|
@@ -273,9 +319,10 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b | |
List<String> ftrans = new ArrayList<>(segments.size()); | ||
// multiple segments | ||
for (String onesrc : segments) { | ||
// find match for a separate segment | ||
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation, | ||
false, stop); | ||
// find match for a separate segment. | ||
// WARN: the 5th argument should be | ||
// `false` to avoid an infinite-loop. | ||
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, false, stop, false, false); | ||
if (!segmentMatch.isEmpty() | ||
&& segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) { | ||
fsrc.add(segmentMatch.get(0).source); | ||
|
@@ -385,7 +432,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName, | |
} | ||
|
||
// BUGS#1236 - stat display does not use threshold config check | ||
if (applyThreshold && similarityStem < fuzzyMatchThreshold | ||
if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold | ||
&& similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) { | ||
return; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.