diff --git a/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java b/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java index e90b345be..125c31b26 100644 --- a/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java +++ b/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java @@ -1,6 +1,7 @@ package org.jembi.jempi.linker.backend; import com.fasterxml.jackson.core.JsonProcessingException; +import org.apache.commons.codec.language.Soundex; import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.similarity.JaccardSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity; @@ -27,6 +28,7 @@ public final class LinkerProbabilistic { static final JaccardSimilarity JACCARD_SIMILARITY = new JaccardSimilarity(); static final JaroSimilarity JARO_SIMILARITY = new JaroSimilarity(); static final ExactSimilarity EXACT_SIMILARITY = new ExactSimilarity(); + static final SoundexSimilarity SOUNDEX_SIMILARITY = new SoundexSimilarity(); private static final int METRIC_MIN = 0; private static final int METRIC_MAX = 1; private static final int METRIC_SCORE = 2; @@ -36,15 +38,15 @@ public final class LinkerProbabilistic { private static final float MISSING_PENALTY = 0.925F; static List currentProbabilisticLinkFields = LINKER_CONFIG.probabilisticLinkFields .stream() - .map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u())) + .map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u())) .toList(); static List currentProbabilisticValidateFields = LINKER_CONFIG.probabilisticValidateFields .stream() - .map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u())) + .map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u())) .toList(); static List currentProbabilisticMatchFields = LINKER_CONFIG.probabilisticMatchNotificationFields .stream() - .map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u())) + .map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u())) .toList(); static List updatedProbabilisticLinkFields = null; @@ -65,18 +67,33 @@ static List toLinkProbabilisticFieldList( final var list = new ArrayList(); for (int i = 0; i < mu.size(); i++) { list.add(new ProbabilisticField( - getSimilarityFunction(probabilisticMetaData.get(i).similarityScore()), + getSimilarityFunction(SimilarityFunctionName.valueOf(probabilisticMetaData.get(i).similarityScore())), probabilisticMetaData.get(i).comparisonLevels(), mu.get(i).m(), mu.get(i).u())); } return list; } - static SimilarityScore getSimilarityFunction(final String func) { - if ("JARO_WINKLER_SIMILARITY".equals(func)) { - return JARO_WINKLER_SIMILARITY; - } else { - return JACCARD_SIMILARITY; + public enum SimilarityFunctionName { + JARO_WINKLER_SIMILARITY, + JARO_SIMILARITY, + JACCARD_SIMILARITY, + SOUNDEX_SIMILARITY, + EXACT_SIMILARITY + } + + static SimilarityScore getSimilarityFunction(final SimilarityFunctionName func) { + switch (func) { + case JARO_WINKLER_SIMILARITY: + return JARO_WINKLER_SIMILARITY; + case JARO_SIMILARITY: + return JARO_SIMILARITY; + case JACCARD_SIMILARITY: + return JACCARD_SIMILARITY; + case SOUNDEX_SIMILARITY: + return SOUNDEX_SIMILARITY; + default: + return EXACT_SIMILARITY; } } @@ -268,7 +285,7 @@ public Double apply( if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) { return 0.5; } - // assert - we have 2 non-empty strings + return StringUtils.equals(left, right) ? 1.0 : 0.0; @@ -276,6 +293,25 @@ public Double apply( } + static class SoundexSimilarity implements SimilarityScore { + + private final Soundex soundex = new Soundex(); + + @Override + public Double apply( + final CharSequence left, + final CharSequence right) { + if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) { + return 0.5; + } + + return StringUtils.equals(soundex.soundex(left.toString()), soundex.soundex(right.toString())) + ? 1.0 + : 0.0; + } + + } + static class JaroSimilarity implements SimilarityScore { @Override