From 438b3f91cbdcc26ed95b1b5ca4bd2a5eff4115da Mon Sep 17 00:00:00 2001 From: Matthew Erispe Date: Wed, 16 Oct 2024 13:01:25 +0200 Subject: [PATCH 1/2] implement soundex similarity --- .../linker/backend/LinkerProbabilistic.java | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java b/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java index e90b345be..c1bba655a 100644 --- a/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java +++ b/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java @@ -1,6 +1,7 @@ package org.jembi.jempi.linker.backend; import com.fasterxml.jackson.core.JsonProcessingException; +import org.apache.commons.codec.language.Soundex; import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.similarity.JaccardSimilarity; import org.apache.commons.text.similarity.JaroWinklerSimilarity; @@ -27,6 +28,7 @@ public final class LinkerProbabilistic { static final JaccardSimilarity JACCARD_SIMILARITY = new JaccardSimilarity(); static final JaroSimilarity JARO_SIMILARITY = new JaroSimilarity(); static final ExactSimilarity EXACT_SIMILARITY = new ExactSimilarity(); + static final SoundexSimilarity SOUNDEX_SIMILARITY = new SoundexSimilarity(); private static final int METRIC_MIN = 0; private static final int METRIC_MAX = 1; private static final int METRIC_SCORE = 2; @@ -73,10 +75,17 @@ static List toLinkProbabilisticFieldList( } static SimilarityScore getSimilarityFunction(final String func) { - if ("JARO_WINKLER_SIMILARITY".equals(func)) { - return JARO_WINKLER_SIMILARITY; - } else { - return JACCARD_SIMILARITY; + switch (func) { + case "JARO_WINKLER_SIMILARITY": + return JARO_WINKLER_SIMILARITY; + case "JARO_SIMILARITY": + return JARO_SIMILARITY; + case "JACCARD_SIMILARITY": + return JACCARD_SIMILARITY; + case "SOUNDEX_SIMILARITY": + return SOUNDEX_SIMILARITY; + default: + return EXACT_SIMILARITY; } } @@ -268,7 +277,7 @@ public Double apply( if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) { return 0.5; } - // assert - we have 2 non-empty strings + return StringUtils.equals(left, right) ? 1.0 : 0.0; @@ -276,6 +285,25 @@ public Double apply( } + static class SoundexSimilarity implements SimilarityScore { + + private final Soundex soundex = new Soundex(); + + @Override + public Double apply( + final CharSequence left, + final CharSequence right) { + if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) { + return 0.5; + } + + return StringUtils.equals(soundex.soundex((String) left), soundex.soundex((String) right)) + ? 1.0 + : 0.0; + } + + } + static class JaroSimilarity implements SimilarityScore { @Override From 9e5efbfea899a711d039b0474fde2a31b3738a56 Mon Sep 17 00:00:00 2001 From: Matthew Erispe Date: Wed, 16 Oct 2024 13:53:30 +0200 Subject: [PATCH 2/2] refactor --- .../linker/backend/LinkerProbabilistic.java | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java b/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java index c1bba655a..125c31b26 100644 --- a/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java +++ b/JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java @@ -38,15 +38,15 @@ public final class LinkerProbabilistic { private static final float MISSING_PENALTY = 0.925F; static List currentProbabilisticLinkFields = LINKER_CONFIG.probabilisticLinkFields .stream() - .map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u())) + .map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u())) .toList(); static List currentProbabilisticValidateFields = LINKER_CONFIG.probabilisticValidateFields .stream() - .map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u())) + .map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u())) .toList(); static List currentProbabilisticMatchFields = LINKER_CONFIG.probabilisticMatchNotificationFields .stream() - .map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u())) + .map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u())) .toList(); static List updatedProbabilisticLinkFields = null; @@ -67,22 +67,30 @@ static List toLinkProbabilisticFieldList( final var list = new ArrayList(); for (int i = 0; i < mu.size(); i++) { list.add(new ProbabilisticField( - getSimilarityFunction(probabilisticMetaData.get(i).similarityScore()), + getSimilarityFunction(SimilarityFunctionName.valueOf(probabilisticMetaData.get(i).similarityScore())), probabilisticMetaData.get(i).comparisonLevels(), mu.get(i).m(), mu.get(i).u())); } return list; } - static SimilarityScore getSimilarityFunction(final String func) { + public enum SimilarityFunctionName { + JARO_WINKLER_SIMILARITY, + JARO_SIMILARITY, + JACCARD_SIMILARITY, + SOUNDEX_SIMILARITY, + EXACT_SIMILARITY + } + + static SimilarityScore getSimilarityFunction(final SimilarityFunctionName func) { switch (func) { - case "JARO_WINKLER_SIMILARITY": + case JARO_WINKLER_SIMILARITY: return JARO_WINKLER_SIMILARITY; - case "JARO_SIMILARITY": + case JARO_SIMILARITY: return JARO_SIMILARITY; - case "JACCARD_SIMILARITY": + case JACCARD_SIMILARITY: return JACCARD_SIMILARITY; - case "SOUNDEX_SIMILARITY": + case SOUNDEX_SIMILARITY: return SOUNDEX_SIMILARITY; default: return EXACT_SIMILARITY; @@ -297,7 +305,7 @@ public Double apply( return 0.5; } - return StringUtils.equals(soundex.soundex((String) left), soundex.soundex((String) right)) + return StringUtils.equals(soundex.soundex(left.toString()), soundex.soundex(right.toString())) ? 1.0 : 0.0; }