optimaize · nbartels · Jul 8, 2017
diff --git a/pom.xml b/pom.xml
@@ -230,6 +230,11 @@
             <artifactId>guava</artifactId>
             <version>19.0</version>
         </dependency>
+        <dependency><!-- autolink: finds URLs, www links and e-mail addresses so that UrlTextFilter can strip them -->
+            <groupId>org.nibor.autolink</groupId>
+            <artifactId>autolink</artifactId>
+            <version>0.6.0</version>
+        </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-api</artifactId>

diff --git a/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java b/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java
@@ -13,10 +13,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package com.optimaize.langdetect.text;
 
-import java.util.regex.Pattern;
+import org.nibor.autolink.*;
+
+import java.util.EnumSet;
 
 /**
  * Removes URLs and email addresses from the text.
@@ -25,8 +26,9 @@
  */
 public class UrlTextFilter implements TextFilter {
 
-    private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
-    private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
+    private static final LinkExtractor linkExtractor = LinkExtractor.builder() // built once, reused by every filter() call
+            .linkTypes(EnumSet.of(LinkType.URL, LinkType.WWW, LinkType.EMAIL)) // the only link kinds that get extracted
+            .build();
 
     private static final UrlTextFilter INSTANCE = new UrlTextFilter();
 
@@ -38,9 +40,26 @@ private UrlTextFilter() {
     }
 
+    /**
+     * Replaces every link that {@code linkExtractor} finds in the given text with a single space.
+     *
+     * @param originalText the text to filter
+     * @return the text with each extracted link replaced by {@code " "}
+     */
     @Override
-    public String filter(CharSequence text) {
-        String modified = URL_REGEX.matcher(text).replaceAll(" ");
-        return MAIL_REGEX.matcher(modified).replaceAll(" ");
+    public String filter(CharSequence originalText) {
+        return Autolink.renderLinks(originalText, linkExtractor.extractLinks(originalText), new NullRenderer());
+    }
+
+    /**
+     * Renderer that writes a single space for every link instead of the link text.
+     * Declared static because it reads no state of the enclosing filter.
+     */
+    private static final class NullRenderer implements LinkRenderer {
+
+        @Override
+        public void render(LinkSpan link, CharSequence text, StringBuilder sb) {
+            sb.append(" ");
+        }
     }
 
 }
diff --git a/src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java b/src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011 Fabian Kessler
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.optimaize.langdetect.text;
+
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+/**
+ * Checks which URL-like strings {@link UrlTextFilter} replaces with a space.
+ */
+public class UrlTextFilterTest {
+
+    /**
+     * Supplies pairs of (candidate string, whether the filter is expected to replace it).
+     */
+    @DataProvider(name = "exampleUrls")
+    public static Object[][] data() {
+        return new Object[][]{ //
+                {"http://foo.com/blah_blah", true}, //
+                {"http://foo.com:8080/blah_blah", true}, //
+                {"http://foo.com/blah_blah/", true}, //
+                {"http://foo-bar.com/blah_blah/", true}, //
+                {"http://foo.com/blah_blah_(wikipedia)", true}, //
+                {"http://foo.com/blah_blah_(wikipedia)_(again)", true}, //
+                {"http://www.example.com/wpstyle/?p=364", true}, //
+                {"https://www.example.com/foo/?bar=baz&inga=42&quux", true}, //
+                {"http://userid:password@example.com:8080", true}, //
+                {"http://userid:password@example.com:8080/", true}, //
+                {"http://userid@example.com", true}, //
+                {"http://userid@example.com/", true}, //
+                {"http://userid@example.com:8080", true}, //
+                {"http://userid@example.com:8080/", true}, //
+                {"http://userid:password@example.com", true}, //
+                {"http://foo.com/blah_(wikipedia)#cite-1", true}, //
+                {"http://foo.com/blah_(wikipedia)_blah#cite-1", true}, //
+                {"http://foo.com/(something)?after=parens", true}, //
+                {"http://code.google.com/events/#&product=browser", true}, //
+                {"https://foo.com/blah_blah", true}, //
+                {"ftp://foo.com/blah_blah", true}, //
+                {"http://.", true}, //
+                {"http://?", true}, //
+                {"http://#", true}, //
+                {"http://##", true}, //
+                {"//", false}, //
+                {"//a", false}, //
+                {"///a", false}, //
+                {"///", false}, //
+                {"h://test", true}, //
+                {"http://-error-.invalid/", true} //
+        };
+    }
+
+    /**
+     * Embeds the candidate between fixed words and compares the filtered text with the expectation:
+     * a single space in place of the candidate when it should be removed, the candidate itself otherwise.
+     */
+    @Test(dataProvider = "exampleUrls")
+    public void checkUrl(String testUrl, boolean acceptUrl) {
+        String testTextPrefix = "just a short test ";
+        String testTextSuffix = " here";
+
+        String expectedResult = testTextPrefix + (acceptUrl ? " " : testUrl) + testTextSuffix;
+
+        UrlTextFilter filter = UrlTextFilter.getInstance();
+        String result = filter.filter(testTextPrefix + testUrl + testTextSuffix);
+
+        assertEquals(result, expectedResult);
+    }
+}