Skip to content
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package org.jembi.jempi.linker.backend;

import com.fasterxml.jackson.core.JsonProcessingException;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.commons.text.similarity.SimilarityScore;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand All @@ -27,6 +29,8 @@ public final class LinkerProbabilistic {
// Shared similarity-function instances, created once when the class loads.
// getSimilarityFunction returns these (selected by SimilarityFunctionName).
static final JaccardSimilarity JACCARD_SIMILARITY = new JaccardSimilarity();
static final JaroSimilarity JARO_SIMILARITY = new JaroSimilarity();
static final ExactSimilarity EXACT_SIMILARITY = new ExactSimilarity();
static final SoundexSimilarity SOUNDEX_SIMILARITY = new SoundexSimilarity();
static final LevenshteinSimilarityPercentage LEVENSHTEIN_SIMILARITY_PERCENTAGE = new LevenshteinSimilarityPercentage();
// NOTE(review): the METRIC_* values look like positions (0, 1, 2) within a per-field
// metrics array; their use is not visible in this section - confirm against the callers.
private static final int METRIC_MIN = 0;
private static final int METRIC_MAX = 1;
private static final int METRIC_SCORE = 2;
Expand All @@ -36,15 +40,15 @@ public final class LinkerProbabilistic {
// NOTE(review): presumably a multiplier applied when a field value is missing; the use
// site is not visible in this section - confirm.
private static final float MISSING_PENALTY = 0.925F;
// Probabilistic field definitions for linking, built once from LINKER_CONFIG.probabilisticLinkFields.
// Each entry pairs a similarity function with the configured comparison levels and m/u values.
// NOTE(review): the two consecutive .map(...) lines in each initializer below look like the
// before/after lines of a diff (String lookup vs. SimilarityFunctionName.valueOf lookup).
// Only one of them should remain - confirm against the real file.
static List<ProbabilisticField> currentProbabilisticLinkFields = LINKER_CONFIG.probabilisticLinkFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
.toList();
// Same construction, sourced from LINKER_CONFIG.probabilisticValidateFields.
static List<ProbabilisticField> currentProbabilisticValidateFields = LINKER_CONFIG.probabilisticValidateFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
.toList();
// Same construction, sourced from LINKER_CONFIG.probabilisticMatchNotificationFields.
static List<ProbabilisticField> currentProbabilisticMatchFields = LINKER_CONFIG.probabilisticMatchNotificationFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
.toList();

// Starts out as null.
// NOTE(review): presumably assigned once updated m/u values are available - confirm.
static List<ProbabilisticField> updatedProbabilisticLinkFields = null;
Expand All @@ -65,18 +69,36 @@ static List<ProbabilisticField> toLinkProbabilisticFieldList(
final var list = new ArrayList<ProbabilisticField>();
for (int i = 0; i < mu.size(); i++) {
list.add(new ProbabilisticField(
getSimilarityFunction(probabilisticMetaData.get(i).similarityScore()),
getSimilarityFunction(SimilarityFunctionName.valueOf(probabilisticMetaData.get(i).similarityScore())),
probabilisticMetaData.get(i).comparisonLevels(),
mu.get(i).m(), mu.get(i).u()));
}
return list;
}

static SimilarityScore<Double> getSimilarityFunction(final String func) {
if ("JARO_WINKLER_SIMILARITY".equals(func)) {
return JARO_WINKLER_SIMILARITY;
} else {
return JACCARD_SIMILARITY;
/**
 * Names of the similarity functions that {@code getSimilarityFunction} can resolve.
 *
 * <p>The field configuration carries these names as strings that are parsed with
 * {@code SimilarityFunctionName.valueOf(...)}, so a constant's name must match the
 * configured text exactly (valueOf is case-sensitive and rejects unknown names).
 */
public enum SimilarityFunctionName {
JARO_WINKLER_SIMILARITY,
JARO_SIMILARITY,
JACCARD_SIMILARITY,
SOUNDEX_SIMILARITY,
EXACT_SIMILARITY,
LEVENSHTEIN_SIMILARITY_PERCENTAGE
}

/**
 * Resolves a similarity-function name to the shared instance that implements it.
 *
 * @param func name of the requested similarity function; must not be null
 * @return the shared instance for {@code func}; names without a dedicated branch
 *         (currently only {@code EXACT_SIMILARITY}) resolve to the exact-match function
 */
static SimilarityScore<Double> getSimilarityFunction(final SimilarityFunctionName func) {
    // Case labels name enum constants; the right-hand sides are the static instances
    // of the enclosing class that happen to share those names.
    return switch (func) {
        case JARO_WINKLER_SIMILARITY -> JARO_WINKLER_SIMILARITY;
        case JARO_SIMILARITY -> JARO_SIMILARITY;
        case JACCARD_SIMILARITY -> JACCARD_SIMILARITY;
        case SOUNDEX_SIMILARITY -> SOUNDEX_SIMILARITY;
        case LEVENSHTEIN_SIMILARITY_PERCENTAGE -> LEVENSHTEIN_SIMILARITY_PERCENTAGE;
        default -> EXACT_SIMILARITY;
    };
}

Expand Down Expand Up @@ -268,14 +290,57 @@ public Double apply(
if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
return 0.5;
}
// assert - we have 2 non-empty strings

return StringUtils.equals(left, right)
? 1.0
: 0.0;
}

}

/**
 * Phonetic similarity: two values match when their Soundex codes are equal.
 *
 * <p>Scores are binary: 1.0 for a match, 0.0 otherwise. An empty or null input on
 * either side scores 0.0.
 *
 * <p>When a value has no usable Soundex code (it contains no letters, or it contains a
 * letter outside the encoder's mapping such as an accented character), the comparison
 * falls back to exact string equality. Without the fallback, two different letter-free
 * values would both encode to the empty string and be reported as a match, and an
 * unmapped letter would abort the comparison with an exception.
 */
static class SoundexSimilarity implements SimilarityScore<Double> {

    private final Soundex soundex = new Soundex();

    /**
     * @param left  first value to compare
     * @param right second value to compare
     * @return 1.0 when both values share a Soundex code (or, if no code can be derived,
     *         when they are exactly equal); 0.0 otherwise
     */
    @Override
    public Double apply(
            final CharSequence left,
            final CharSequence right) {
        if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
            return 0.0;
        }

        final String leftCode = encodeOrNull(left);
        final String rightCode = encodeOrNull(right);
        if (leftCode == null || rightCode == null) {
            // No phonetic code for at least one side: only an exact match counts.
            return StringUtils.equals(left, right) ? 1.0 : 0.0;
        }

        return leftCode.equals(rightCode) ? 1.0 : 0.0;
    }

    /** Returns the Soundex code of {@code value}, or null when no non-empty code can be derived. */
    private String encodeOrNull(final CharSequence value) {
        try {
            final String code = soundex.soundex(value.toString());
            // The encoder strips non-letters first; an all-digit/punctuation value yields "".
            return StringUtils.isEmpty(code) ? null : code;
        } catch (IllegalArgumentException ignored) {
            // The encoder rejects letters it has no mapping for; it offers no pre-check,
            // so the exception is the only signal available.
            return null;
        }
    }

}

/**
 * Similarity derived from the Levenshtein edit distance, scaled by the length of the
 * longer input.
 *
 * <p>A distance of zero scores 1.0 and a distance equal to the longer length scores 0.0.
 * An empty or null input on either side scores 0.0.
 */
static class LevenshteinSimilarityPercentage implements SimilarityScore<Double> {

    private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

    /**
     * @param left  first value to compare
     * @param right second value to compare
     * @return the share of the longer input that did not need editing, as a fraction of 1
     */
    @Override
    public Double apply(
            final CharSequence left,
            final CharSequence right) {
        final boolean bothPresent = !StringUtils.isEmpty(left) && !StringUtils.isEmpty(right);
        if (!bothPresent) {
            return 0.0;
        }

        final double editDistance = levenshteinDistance.apply(left, right);
        final int longerLength = Math.max(left.length(), right.length());

        // Express the distance as a percentage of the longer input, then flip it so that
        // a higher number means "more alike" and scale back to a fraction.
        final double mismatchPercent = (editDistance / longerLength) * 100;
        return (100 - mismatchPercent) / 100.0;
    }

}

static class JaroSimilarity implements SimilarityScore<Double> {

@Override
Expand Down