diff --git a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java index 071e666..f8a31af 100644 --- a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java +++ b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java @@ -1,5 +1,6 @@ package com.optimaize.langdetect.text; +import java.lang.Character.UnicodeScript; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -15,6 +16,16 @@ */ public class RemoveMinorityScriptsTextFilter implements TextFilter { + private static final int INHERITED = UnicodeScript.INHERITED.ordinal(); + private static final int COMMON = UnicodeScript.COMMON.ordinal(); + private static final int UNKNOWN = UnicodeScript.UNKNOWN.ordinal(); + + private static final int[] SCRIPT_IDS = new int[65536]; + static { + for (int c = 0; c < SCRIPT_IDS.length; c++) { + SCRIPT_IDS[c] = Character.UnicodeScript.of(c).ordinal(); + } + } private final double threshold; /** @@ -34,14 +45,14 @@ private RemoveMinorityScriptsTextFilter(double threshold) { @Override public String filter(CharSequence text) { - Map counts = countByScript(text); + Map counts = countByScript(text); if (counts.size()<=1) { //nothing to do return text.toString(); } else { long most = findMost(counts); - Set toRemove = new HashSet<>(); - for (Map.Entry entry : counts.entrySet()) { + Set toRemove = new HashSet<>(); + for (Map.Entry entry : counts.entrySet()) { if (entry.getValue()==most) continue; double ratio = entry.getValue().doubleValue() / most; if (ratio <= threshold) { @@ -56,20 +67,20 @@ public String filter(CharSequence text) { } } - private String remove(CharSequence text, Set toRemove) { + private String remove(CharSequence text, Set toRemove) { StringBuilder remaining = new StringBuilder(); - Character.UnicodeScript last = null; + int last = -1; for (int i=0; i toRemove) return remaining.toString(); } - private long findMost(Map counts) { + private long findMost(Map counts) { long max = 0L; for (Long aLong : counts.values()) { if (aLong > max) max = aLong; @@ -86,37 +97,31 @@ private long findMost(Map counts) { return max; } - private Map countByScript(CharSequence text) { - Map counter = new HashMap<>(); - Character.UnicodeScript last = null; + private Map countByScript(CharSequence text) { + int last = -1; + long[] counter = new long[UnicodeScript.values().length]; for (int i=0; i= 0) { //really shouldn't be null + counter[last]++; + } + } else if(id != COMMON && id != UNKNOWN) { + counter[id]++; + last = id; } } - return counter; - } - private void increment(Map counter, Character.UnicodeScript unicodeScript) { - Long number = counter.get(unicodeScript); - if (number==null) { - counter.put(unicodeScript, 1L); - } else { - counter.put(unicodeScript, number+1); + + Map result = new HashMap<>(); + for (int i = 0; i < counter.length; i++) { + long value = counter[i]; + if (value > 0) { + result.put(i, value); + } } - } + return result; + } }