omegat-org · miurahr · Feb 16, 2024 · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/config/checkstyle/suppressions.xml b/config/checkstyle/suppressions.xml
@@ -139,8 +139,8 @@
          lines="90,91,92,96,100,104,105,107,264,275,276,284,286,293,294,308,309,359,361,449,495,605,674,710"/>
 
     <!-- matching -->
-    <suppress files="NearString\.java" checks="TypeName" lines="51-55"/>
-    <suppress files="NearString\.java" checks="ParameterNumber" lines="85-150"/>
+    <suppress files="NearString\.java" checks="TypeName" lines="50-70"/>
+    <suppress files="NearString\.java" checks="ParameterNumber"/>
     <suppress files="LevenshteinDistance\.java" checks="LocalVariableName" lines="147,159"/>
 
     <!-- Aligner, AlignFilePickerController, AlignPanelController -->

diff --git a/doc_src/en/Dialogs_ProjectProperties.xml b/doc_src/en/Dialogs_ProjectProperties.xml
@@ -127,7 +127,11 @@
 		  mid-translation may force OmegaT to upgrade old translation memories
 		  that did not use sentence segmentation, but not vice versa. However,
 		  OmegaT will attempt to create fuzzy matches for paragraphs by
-		  combining existing sentence translations.</para>
+		  combining existing sentence translations. See <link
+		  linkend="dialog.preferences.tm.matches.paragraph.from.segmented.tmx"
+		  endterm="dialog.preferences.tm.matches.paragraph.from.segmented.tmx.title"/> to enable/disable
+		  the feature.
+		  </para>
 
 		  <para>If you change the segmentation while translating, you will have
 		  to reload the project for the new segmentation to take effect. This

diff --git a/doc_src/en/HowTo_UseTM.xml b/doc_src/en/HowTo_UseTM.xml
@@ -165,6 +165,24 @@
 	  one of its subfolders and the translated data will be immediately be
 	  available for matching purposes.</para>
 	  </listitem>
+
+          <listitem>
+            <para>If you have previously worked on projects that use <link
+			    linkend="dialogs.project.properties.options.segmentation"
+			    endterm="dialogs.project.properties.options.segmentation.title"/>,
+		    and the current project has been switched to paragraph segmentation, you can enable the preference
+		    option <link
+	    linkend="dialog.preferences.tm.matches.paragraph.from.segmented.tmx"
+	    endterm="dialog.preferences.tm.matches.paragraph.from.segmented.tmx.title"/> that lets OmegaT find matches
+		    in a special way. In addition to normal fuzzy matching, OmegaT tries to segment the source text into
+		    sentences, performs fuzzy matching against the segmented TMX entries, and then constructs a paragraph from
+		    the results. A suggested paragraph may therefore be one that does not exist in any translation memory, which is why
+		    the origin of the match is shown as empty.
+		    Please refer to <link linkend="dialog.preferences.segmentation.setup.type"
+				    endterm="dialog.preferences.segmentation.setup.type.title"/> for the pros and
+		    cons of each segmentation configuration.
+            </para>
+          </listitem>
 	</itemizedlist>
 
     <section id="how.to.tm.read.and.write">

diff --git a/doc_src/en/OmegaT_Preferences.xml b/doc_src/en/OmegaT_Preferences.xml
@@ -1476,6 +1476,15 @@ ${filePath}></programlisting></para>
 		  </table>
 		</listitem>
 	  </varlistentry>
+          <varlistentry id="dialog.preferences.tm.matches.paragraph.from.segmented.tmx">
+		  <term id="dialog.preferences.tm.matches.paragraph.from.segmented.tmx.title">
+			  <option>Paragraph from segment matches of segmented TMX on a non-segment project</option></term>
+            <listitem><para>Enables paragraph matches that are constructed from sentence-level matches: the source text is
+		    split into sentences, each sentence is searched against segmented TMX entries, and the results are combined,
+		    even when the project is non-segmented (uses paragraph segmentation).</para>
+            </listitem>
+
+          </varlistentry>
 	</variablelist>
   </section>
 

@@ -1817,6 +1817,7 @@ EXT_TMX_SORT_KEY_SCORE=Stemming, no tags and no numbers
 EXT_TMX_SORT_KEY_SCORE_NO_STEM=No tags and no numbers
 EXT_TMX_SORT_KEY_ADJUSTED_SCORE=Full text, including tags and numbers
 EXT_TMX_FUZZY_THRESHOLD_KEY=Minimal threshold to show a fuzzy match:
+PARAGRAPH_MATCH_FROM_SEGMENT_TMX=Paragraph from segment matches of segmented TMX on a non-segment project
 PREFS_TITLE_TM_MATCHES=TM Matches
 
 # ViewOptions
@@ -2980,3 +2981,8 @@ DICTIONARY_LOAD_FILE=Loaded dictionary from '{0}': {1} ms
 DICTIONARY_LOAD_ERROR=Error load dictionary from '{0}': {1} 
 DICTIONARY_MANAGER_ERROR_SAVE_IGNORE=Error saving ignore words"
 EDITOR_CONTROLLER_EXCEPTION=bad location exception when changing case
+
+MATCHES_COMES_FROM_TM=From TM
+MATCHES_COMES_FROM_FILES=Files
+MATCHES_COMES_FROM_MEMORY=From Project
+MATCHES_COMES_FROM_TM_SUBSEG=Sub-segmented match
diff --git a/src/org/omegat/core/matching/NearString.java b/src/org/omegat/core/matching/NearString.java
@@ -49,50 +49,95 @@
  */
 public class NearString {
     public enum MATCH_SOURCE {
-        MEMORY, TM, FILES
-    };
+        /** From project memory */
+        MEMORY,
+        /** From external TM in project tm/ folder */
+        TM,
+        /** From files */
+        FILES,
+        /** From sub-segmented match */
+        TM_SUBSEG
+    }
 
     public enum SORT_KEY {
-        SCORE, SCORE_NO_STEM, ADJUSTED_SCORE
+        /** normal score */
+        SCORE,
+        /** score without stemming */
+        SCORE_NO_STEM,
+        /** adjusted score */
+        ADJUSTED_SCORE
     }
 
-    public NearString(EntryKey key, ITMXEntry entry, MATCH_SOURCE comesFrom, boolean fuzzyMark,
-                      Scores scores, byte[] nearData, String projName) {
-        this(key, entry.getSourceText(), entry.getTranslationText(), comesFrom, fuzzyMark, scores,
-                nearData, projName, entry.getCreator(), entry.getCreationDate(), entry.getChanger(),
+    /**
+     * Constructor.
+     * 
+     * @param key
+     *            entry key
+     * @param entry
+     *            the TMX entry that has source text, translation text, creator,
+     *            creation date, changer, change date and properties.
+     * @param comesFrom
+     *            origin
+     * @param fuzzyMark
+     *            fuzzy or not
+     * @param scores
+     *            score values
+     * @param nearData
+     *            similarity data.
+     * @param projName
+     *            project name.
+     */
+    public NearString(EntryKey key, ITMXEntry entry, MATCH_SOURCE comesFrom, boolean fuzzyMark, Scores scores,
+            byte[] nearData, String projName) {
+        this(key, entry.getSourceText(), entry.getTranslationText(), comesFrom, fuzzyMark, scores, nearData,
+                projName, entry.getCreator(), entry.getCreationDate(), entry.getChanger(),
                 entry.getChangeDate(), entry.getProperties());
     }
 
     /**
      * Constructor, backward compatible.
-     * @param key entry key
-     * @param source source text
-     * @param translation translation text
-     * @param comesFrom origin
-     * @param fuzzyMark fuzzy or not
+     * 
+     * @param key
+     *            entry key
+     * @param source
+     *            source text
+     * @param translation
+     *            translation text
+     * @param comesFrom
+     *            origin
+     * @param fuzzyMark
+     *            fuzzy or not
      * @param nearScore
      * @param nearScoreNoStem
      * @param adjustedScore
-     * @param nearData similarity data.
-     * @param projName project name.
-     * @param creator creator name
-     * @param creationDate creation date
-     * @param changer changer name
-     * @param changedDate changer date
-     * @param props properties of entry.
+     * @param nearData
+     *            similarity data.
+     * @param projName
+     *            project name.
+     * @param creator
+     *            creator name
+     * @param creationDate
+     *            creation date
+     * @param changer
+     *            changer name
+     * @param changedDate
+     *            change date
+     * @param props
+     *            properties of entry.
      */
     @Deprecated
     public NearString(EntryKey key, String source, String translation, MATCH_SOURCE comesFrom,
-            boolean fuzzyMark, int nearScore, int nearScoreNoStem, int adjustedScore,
-            byte[] nearData, String projName, String creator, long creationDate,
-            String changer, long changedDate, List<TMXProp> props) {
-        this(key, source, translation, comesFrom, fuzzyMark, new Scores(nearScore, nearScoreNoStem,
-                adjustedScore), nearData, projName, creator, creationDate, changer, changedDate, props);
+            boolean fuzzyMark, int nearScore, int nearScoreNoStem, int adjustedScore, byte[] nearData,
+            String projName, String creator, long creationDate, String changer, long changedDate,
+            List<TMXProp> props) {
+        this(key, source, translation, comesFrom, fuzzyMark,
+                new Scores(nearScore, nearScoreNoStem, adjustedScore), nearData, projName, creator,
+                creationDate, changer, changedDate, props);
     }
 
     private NearString(EntryKey key, String source, String translation, MATCH_SOURCE comesFrom,
-                      boolean fuzzyMark, Scores scores, byte[] nearData, String projName, String creator,
-                      long creationDate, String changer, long changedDate, List<TMXProp> props) {
+            boolean fuzzyMark, Scores scores, byte[] nearData, String projName, String creator,
+            long creationDate, String changer, long changedDate, List<TMXProp> props) {
         this.key = key;
         this.source = source;
         this.translation = translation;
@@ -110,18 +155,27 @@ private NearString(EntryKey key, String source, String translation, MATCH_SOURCE
 
     /**
      * Merge NearString object.
-     * @param ns NearString to merge.
-     * @param key entry key.
-     * @param entry TMXEntry entry
-     * @param comesFrom origin
-     * @param fuzzyMark fuzzy or not
-     * @param scores similarity score
-     * @param nearData similarity data
-     * @param projName project name
+     * 
+     * @param ns
+     *            NearString to merge.
+     * @param key
+     *            entry key.
+     * @param entry
+     *            TMXEntry entry
+     * @param comesFrom
+     *            origin
+     * @param fuzzyMark
+     *            fuzzy or not
+     * @param scores
+     *            similarity score
+     * @param nearData
+     *            similarity data
+     * @param projName
+     *            project name
      * @return NearString merged.
      */
     public static NearString merge(NearString ns, EntryKey key, ITMXEntry entry, MATCH_SOURCE comesFrom,
-                                   boolean fuzzyMark, Scores scores, byte[] nearData, String projName) {
+            boolean fuzzyMark, Scores scores, byte[] nearData, String projName) {
 
         List<String> projs = new ArrayList<>();
         List<Scores> mergedScores = new ArrayList<>();
@@ -134,9 +188,8 @@ public static NearString merge(NearString ns, EntryKey key, ITMXEntry entry, MAT
             projs.add(0, projName);
             mergedScores.add(0, merged.scores[0]);
         } else {
-            merged = new NearString(ns.key, ns.source, ns.translation, ns.comesFrom, ns.fuzzyMark,
-                    scores, ns.attr, null, ns.creator, ns.creationDate, ns.changer,
-                    ns.changedDate, ns.props);
+            merged = new NearString(ns.key, ns.source, ns.translation, ns.comesFrom, ns.fuzzyMark, scores,
+                    ns.attr, null, ns.creator, ns.creationDate, ns.changer, ns.changedDate, ns.props);
             projs.add(projName);
             mergedScores.add(merged.scores[0]);
         }
@@ -146,10 +199,11 @@ public static NearString merge(NearString ns, EntryKey key, ITMXEntry entry, MAT
     }
 
     @Deprecated
-    public static NearString merge(NearString ns, final EntryKey key, final String source, final String translation,
-            MATCH_SOURCE comesFrom, final boolean fuzzyMark, final int nearScore, final int nearScoreNoStem,
-            final int adjustedScore, final byte[] nearData, final String projName, final String creator,
-            final long creationDate, final String changer, final long changedDate, final List<TMXProp> props) {
+    public static NearString merge(NearString ns, final EntryKey key, final String source,
+            final String translation, MATCH_SOURCE comesFrom, final boolean fuzzyMark, final int nearScore,
+            final int nearScoreNoStem, final int adjustedScore, final byte[] nearData, final String projName,
+            final String creator, final long creationDate, final String changer, final long changedDate,
+            final List<TMXProp> props) {
 
         List<String> projs = new ArrayList<>();
         List<Scores> mergedScores = new ArrayList<>();
@@ -159,7 +213,8 @@ public static NearString merge(NearString ns, final EntryKey key, final String s
         NearString merged;
         if (nearScore > ns.scores[0].score) {
             merged = new NearString(key, source, translation, comesFrom, fuzzyMark, nearScore,
-                    nearScoreNoStem, adjustedScore, nearData, null, creator, creationDate, changer, changedDate, props);
+                    nearScoreNoStem, adjustedScore, nearData, null, creator, creationDate, changer,
+                    changedDate, props);
             projs.add(0, projName);
             mergedScores.add(0, merged.scores[0]);
         } else {
@@ -204,11 +259,17 @@ public static class Scores {
         public final int scoreNoStem;
         /** adjusted similarity score for match including all tokens */
         public final int adjustedScore;
+        public final int penalty;
 
         public Scores(int score, int scoreNoStem, int adjustedScore) {
+            this(score, scoreNoStem, adjustedScore, 0);
+        }
+
+        public Scores(int score, int scoreNoStem, int adjustedScore, int penalty) {
             this.score = score;
             this.scoreNoStem = scoreNoStem;
             this.adjustedScore = adjustedScore;
+            this.penalty = penalty;
         }
 
         public String toString() {

diff --git a/src/org/omegat/core/statistics/CalcMatchStatistics.java b/src/org/omegat/core/statistics/CalcMatchStatistics.java
@@ -51,7 +51,6 @@
 import org.omegat.core.threads.LongProcessThread;
 import org.omegat.util.OConsts;
 import org.omegat.util.OStrings;
-import org.omegat.util.Preferences;
 import org.omegat.util.StringUtil;
 import org.omegat.util.Token;
 import org.omegat.util.gui.TextUtil;
@@ -108,19 +107,15 @@ public class CalcMatchStatistics extends LongProcessThread {
     private final IProject project;
 
     public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
-        this(Core.getProject(), Core.getSegmenter(), callback, perFile,
-                Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
-                OConsts.FUZZY_MATCH_THRESHOLD));
+        this(Core.getProject(), Core.getSegmenter(), callback, perFile);
     }
 
-    public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback,
-                               boolean perFile, int threshold) {
+    public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, boolean perFile) {
         this.project = project;
         this.callback = callback;
         this.perFile = perFile;
         finder = ThreadLocal.withInitial(
-                () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true,
-                        false, false, threshold));
+                () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, -1));
     }
 
     @Override
@@ -313,7 +308,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
     int calcMaxSimilarity(SourceTextEntry ste) {
         String srcNoXmlTags = removeXmlTags(ste);
         FindMatches localFinder = finder.get();
-        List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
+        List<NearString> nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted);
         final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
         int maxSimilarity = 0;
         CACHE: for (NearString near : nears) {