From 6cdaf760c62d09f5ab8bedd34be3d5853cdeee60 Mon Sep 17 00:00:00 2001 From: aoki Date: Fri, 1 Jul 2016 06:38:16 +0000 Subject: [PATCH 1/3] faster RemoveMinorityScriptsTextFilter --- .../text/RemoveMinorityScriptsTextFilter.java | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java index 071e666..29e17da 100644 --- a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java +++ b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java @@ -1,5 +1,6 @@ package com.optimaize.langdetect.text; +import java.lang.Character.UnicodeScript; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -15,6 +16,13 @@ */ public class RemoveMinorityScriptsTextFilter implements TextFilter { + private static final UnicodeScript[] SCRIPTS = new UnicodeScript[65535]; + static { + for (char c = 0; c < 65535; c++) { + SCRIPTS[c] = Character.UnicodeScript.of(c); + } + } + private final double threshold; /** @@ -61,7 +69,7 @@ private String remove(CharSequence text, Set toRemove) Character.UnicodeScript last = null; for (int i=0; i counts) { } private Map countByScript(CharSequence text) { - Map counter = new HashMap<>(); Character.UnicodeScript last = null; + long[] counter = new long[UnicodeScript.values().length]; for (int i=0; i countByScript(CharSequence text) { //don't count it break; default: - increment(counter, unicodeScript); + counter[unicodeScript.ordinal()]++; last = unicodeScript; } } - return counter; - } - private void increment(Map counter, Character.UnicodeScript unicodeScript) { - Long number = counter.get(unicodeScript); - if (number==null) { - counter.put(unicodeScript, 1L); - } else { - counter.put(unicodeScript, number+1); + + Map result = new HashMap<>(); + for (int i = 0; i < counter.length; i++) { + long value = counter[i]; + if(value > 0){ + result.put(UnicodeScript.values()[i], value); + } } - } + return result; + } } From 410fbb4a3017c11bd3f2ce92bec58296400299fe Mon Sep 17 00:00:00 2001 From: aoki Date: Fri, 1 Jul 2016 08:25:32 +0000 Subject: [PATCH 2/3] fix border value --- .../langdetect/text/RemoveMinorityScriptsTextFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java index 29e17da..1b48722 100644 --- a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java +++ b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java @@ -16,9 +16,9 @@ */ public class RemoveMinorityScriptsTextFilter implements TextFilter { - private static final UnicodeScript[] SCRIPTS = new UnicodeScript[65535]; + private static final UnicodeScript[] SCRIPTS = new UnicodeScript[65536]; static { - for (char c = 0; c < 65535; c++) { + for (int c = 0; c < SCRIPTS.length; c++) { SCRIPTS[c] = Character.UnicodeScript.of(c); } } From d3f048281f3aad269e83ff44ac0cc00410e86453 Mon Sep 17 00:00:00 2001 From: aoki Date: Mon, 4 Jul 2016 03:12:14 +0000 Subject: [PATCH 3/3] use Integer set --- .../text/RemoveMinorityScriptsTextFilter.java | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java index 1b48722..f8a31af 100644 --- a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java +++ b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java @@ -16,13 +16,16 @@ */ public class RemoveMinorityScriptsTextFilter implements TextFilter { - private static final UnicodeScript[] SCRIPTS = new UnicodeScript[65536]; + private static final int INHERITED = UnicodeScript.INHERITED.ordinal(); + private static final int COMMON = UnicodeScript.COMMON.ordinal(); + private static final int UNKNOWN = UnicodeScript.UNKNOWN.ordinal(); + + private static final int[] SCRIPT_IDS = new int[65536]; static { - for (int c = 0; c < SCRIPTS.length; c++) { - SCRIPTS[c] = Character.UnicodeScript.of(c); + for (int c = 0; c < SCRIPT_IDS.length; c++) { + SCRIPT_IDS[c] = Character.UnicodeScript.of(c).ordinal(); } } - private final double threshold; /** @@ -42,14 +45,14 @@ private RemoveMinorityScriptsTextFilter(double threshold) { @Override public String filter(CharSequence text) { - Map counts = countByScript(text); + Map counts = countByScript(text); if (counts.size()<=1) { //nothing to do return text.toString(); } else { long most = findMost(counts); - Set toRemove = new HashSet<>(); - for (Map.Entry entry : counts.entrySet()) { + Set toRemove = new HashSet<>(); + for (Map.Entry entry : counts.entrySet()) { if (entry.getValue()==most) continue; double ratio = entry.getValue().doubleValue() / most; if (ratio <= threshold) { @@ -64,20 +67,20 @@ public String filter(CharSequence text) { } } - private String remove(CharSequence text, Set toRemove) { + private String remove(CharSequence text, Set toRemove) { StringBuilder remaining = new StringBuilder(); - Character.UnicodeScript last = null; + int last = -1; for (int i=0; i toRemove) return remaining.toString(); } - private long findMost(Map counts) { + private long findMost(Map counts) { long max = 0L; for (Long aLong : counts.values()) { if (aLong > max) max = aLong; @@ -94,34 +97,28 @@ private long findMost(Map counts) { return max; } - private Map countByScript(CharSequence text) { - Character.UnicodeScript last = null; + private Map countByScript(CharSequence text) { + int last = -1; long[] counter = new long[UnicodeScript.values().length]; for (int i=0; i= 0) { //really shouldn't be null + counter[last]++; + } + } else if(id != COMMON && id != UNKNOWN) { + counter[id]++; + last = id; } } - Map result = new HashMap<>(); + Map result = new HashMap<>(); for (int i = 0; i < counter.length; i++) { long value = counter[i]; - if(value > 0){ - result.put(UnicodeScript.values()[i], value); + if (value > 0) { + result.put(i, value); } }