diff --git a/pom.xml b/pom.xml
index d30f4ed..7a193ac 100644
--- a/pom.xml
+++ b/pom.xml
@@ -230,6 +230,11 @@
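             <artifactId>guava</artifactId>
             <version>19.0</version>
         </dependency>
+        <dependency>
+            <groupId>org.nibor.autolink</groupId>
+            <artifactId>autolink</artifactId>
+            <version>0.6.0</version>
+        </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-api</artifactId>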
diff --git a/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java b/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java
index 6813089..a7ec3f7 100644
--- a/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java
+++ b/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java
@@ -13,10 +13,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package com.optimaize.langdetect.text;
-import java.util.regex.Pattern;
+import org.nibor.autolink.*;
+
+import java.util.EnumSet;
/**
* Removes URLs and email addresses from the text.
@@ -25,8 +26,10 @@
*/
public class UrlTextFilter implements TextFilter {
- private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
- private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
+    /** Extractor, built once, for the three link types this filter strips: URL, WWW and EMAIL. */
+    private static final LinkExtractor LINK_EXTRACTOR = LinkExtractor.builder()
+            .linkTypes(EnumSet.of(LinkType.URL, LinkType.WWW, LinkType.EMAIL))
+            .build();
private static final UrlTextFilter INSTANCE = new UrlTextFilter();
@@ -38,9 +41,29 @@ private UrlTextFilter() {
}
+    /**
+     * Replaces every link found in the text (URL, www link or email address) with a single space.
+     *
+     * @param originalText the text to remove links from
+     * @return the text with each extracted link replaced by one space
+     */
@Override
- public String filter(CharSequence text) {
- String modified = URL_REGEX.matcher(text).replaceAll(" ");
- return MAIL_REGEX.matcher(modified).replaceAll(" ");
+    public String filter(CharSequence originalText) {
+        return Autolink.renderLinks(originalText, LINK_EXTRACTOR.extractLinks(originalText), NullRenderer.INSTANCE);
+    }
+
+    /**
+     * Renderer that writes one space in place of each link and never the link text itself.
+     * It keeps no state and needs nothing from the enclosing filter, so it is a static nested class with one shared instance.
+     */
+    private static final class NullRenderer implements LinkRenderer {
+
+        private static final NullRenderer INSTANCE = new NullRenderer();
+
+        @Override
+        public void render(LinkSpan link, CharSequence text, StringBuilder sb) {
+            // The link span and source text are deliberately unused: only a separator is emitted.
+            sb.append(" ");
+        }
}
}
diff --git a/src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java b/src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java
new file mode 100644
index 0000000..e7a0c1e
--- /dev/null
+++ b/src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011 Fabian Kessler
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.optimaize.langdetect.text;
+
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+/**
+ * Checks that {@link UrlTextFilter} replaces a link embedded in a sentence with a single space
+ * and leaves a candidate that is not expected to be a link unchanged.
+ */
+public class UrlTextFilterTest {
+
+    private static final String TEXT_PREFIX = "just a short test ";
+    private static final String TEXT_SUFFIX = " here";
+
+    /**
+     * Candidate strings paired with whether the filter is expected to remove them as a URL.
+     */
+    @DataProvider(name = "exampleUrls")
+    public static Object[][] data() {
+        return new Object[][]{ //
+                {"http://foo.com/blah_blah", true}, //
+                {"http://foo.com:8080/blah_blah", true}, //
+                {"http://foo.com/blah_blah/", true}, //
+                {"http://foo-bar.com/blah_blah/", true}, //
+                {"http://foo.com/blah_blah_(wikipedia)", true}, //
+                {"http://foo.com/blah_blah_(wikipedia)_(again)", true}, //
+                {"http://www.example.com/wpstyle/?p=364", true}, //
+                {"https://www.example.com/foo/?bar=baz&inga=42&quux", true}, //
+                {"http://userid:password@example.com:8080", true}, //
+                {"http://userid:password@example.com:8080/", true}, //
+                {"http://userid@example.com", true}, //
+                {"http://userid@example.com/", true}, //
+                {"http://userid@example.com:8080", true}, //
+                {"http://userid@example.com:8080/", true}, //
+                {"http://userid:password@example.com", true}, //
+                {"http://foo.com/blah_(wikipedia)#cite-1", true}, //
+                {"http://foo.com/blah_(wikipedia)_blah#cite-1", true}, //
+                {"http://foo.com/(something)?after=parens", true}, //
+                {"http://code.google.com/events/#&product=browser", true}, //
+                {"https://foo.com/blah_blah", true}, //
+                {"ftp://foo.com/blah_blah", true}, //
+                {"http://.", true}, //
+                {"http://?", true}, //
+                {"http://#", true}, //
+                {"http://##", true}, //
+                {"//", false}, //
+                {"//a", false}, //
+                {"///a", false}, //
+                {"///", false}, //
+                {"h://test", true}, //
+                {"http://-error-.invalid/", true} //
+        };
+    }
+
+    /**
+     * Scheme-less www links and email addresses, plus a bare domain that is expected to stay in the text.
+     */
+    @DataProvider(name = "exampleWwwAndEmailLinks")
+    public static Object[][] wwwAndEmailData() {
+        return new Object[][]{ //
+                {"www.example.com", true}, //
+                {"user@example.com", true}, //
+                {"example.com", false} //
+        };
+    }
+
+    /** Runs the filter over each URL candidate from {@link #data()}. */
+    @Test(dataProvider = "exampleUrls")
+    public void checkUrl(String testUrl, boolean acceptUrl) {
+        assertFiltered(testUrl, acceptUrl);
+    }
+
+    /** Runs the filter over each candidate from {@link #wwwAndEmailData()}. */
+    @Test(dataProvider = "exampleWwwAndEmailLinks")
+    public void checkWwwAndEmail(String candidate, boolean acceptLink) {
+        assertFiltered(candidate, acceptLink);
+    }
+
+    /**
+     * Embeds the candidate in a fixed sentence, runs the filter and compares the result with the expected text.
+     *
+     * @param candidate     the string placed between the fixed prefix and suffix
+     * @param expectRemoved {@code true} if the filter should replace the candidate with a single space,
+     *                      {@code false} if it should leave the candidate unchanged
+     */
+    private static void assertFiltered(String candidate, boolean expectRemoved) {
+        String expected = TEXT_PREFIX + (expectRemoved ? " " : candidate) + TEXT_SUFFIX;
+
+        String actual = UrlTextFilter.getInstance().filter(TEXT_PREFIX + candidate + TEXT_SUFFIX);
+
+        assertEquals(actual, expected);
+    }
+}