diff --git a/pom.xml b/pom.xml
index d30f4ed..7a193ac 100644
--- a/pom.xml
+++ b/pom.xml
@@ -230,6 +230,11 @@
             <artifactId>guava</artifactId>
             <version>19.0</version>
         </dependency>
+        <dependency>
+            <groupId>org.nibor.autolink</groupId>
+            <artifactId>autolink</artifactId>
+            <version>0.6.0</version>
+        </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-api</artifactId>
diff --git a/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java b/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java
index 6813089..a7ec3f7 100644
--- a/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java
+++ b/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java
@@ -13,10 +13,15 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package com.optimaize.langdetect.text;
 
-import java.util.regex.Pattern;
+import org.nibor.autolink.Autolink;
+import org.nibor.autolink.LinkExtractor;
+import org.nibor.autolink.LinkRenderer;
+import org.nibor.autolink.LinkSpan;
+import org.nibor.autolink.LinkType;
+
+import java.util.EnumSet;
 
 /**
  * Removes URLs and email addresses from the text.
@@ -25,8 +30,13 @@
  */
 public class UrlTextFilter implements TextFilter {
 
-    private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
-    private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
+    /** Finds the spans to blank out: link types URL, WWW and EMAIL. */
+    private static final LinkExtractor LINK_EXTRACTOR = LinkExtractor.builder()
+            .linkTypes(EnumSet.of(LinkType.URL, LinkType.WWW, LinkType.EMAIL))
+            .build();
+
+    /** The renderer keeps no state, so a single instance serves every call. */
+    private static final LinkRenderer NULL_RENDERER = new NullRenderer();
 
     private static final UrlTextFilter INSTANCE = new UrlTextFilter();
 
@@ -38,9 +48,26 @@ private UrlTextFilter() {
     }
 
+    /**
+     * Replaces every URL, www link and email address that the link extractor detects with a single space.
+     *
+     * @param originalText the text to clean
+     * @return the text with each detected link rendered as {@code " "}
+     */
     @Override
-    public String filter(CharSequence text) {
-        String modified = URL_REGEX.matcher(text).replaceAll(" ");
-        return MAIL_REGEX.matcher(modified).replaceAll(" ");
+    public String filter(CharSequence originalText) {
+        return Autolink.renderLinks(originalText, LINK_EXTRACTOR.extractLinks(originalText), NULL_RENDERER);
+    }
+
+    /**
+     * Renders each link as one space, dropping the link text.
+     * Declared static because it needs nothing from the enclosing filter.
+     */
+    private static final class NullRenderer implements LinkRenderer {
+
+        @Override
+        public void render(LinkSpan link, CharSequence text, StringBuilder sb) {
+            sb.append(" ");
+        }
     }
 
 }
diff --git a/src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java b/src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java
new file mode 100644
index 0000000..e7a0c1e
--- /dev/null
+++ b/src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2011 Fabian Kessler
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.optimaize.langdetect.text;
+
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+public class UrlTextFilterTest {
+
+    @DataProvider(name = "exampleUrls")
+    public static Object[][] data() {
+        return new Object[][]{ //
+                {"http://foo.com/blah_blah", true}, //
+                {"http://foo.com:8080/blah_blah", true}, //
+                {"http://foo.com/blah_blah/", true}, //
+                {"http://foo-bar.com/blah_blah/", true}, //
+                {"http://foo.com/blah_blah_(wikipedia)", true}, //
+                {"http://foo.com/blah_blah_(wikipedia)_(again)", true}, //
+                {"http://www.example.com/wpstyle/?p=364", true}, //
+                {"https://www.example.com/foo/?bar=baz&inga=42&quux", true}, //
+                {"http://userid:password@example.com:8080", true}, //
+                {"http://userid:password@example.com:8080/", true}, //
+                {"http://userid@example.com", true}, //
+                {"http://userid@example.com/", true}, //
+                {"http://userid@example.com:8080", true}, //
+                {"http://userid@example.com:8080/", true}, //
+                {"http://userid:password@example.com", true}, //
+                {"http://foo.com/blah_(wikipedia)#cite-1", true}, //
+                {"http://foo.com/blah_(wikipedia)_blah#cite-1", true}, //
+                {"http://foo.com/(something)?after=parens", true}, //
+                {"http://code.google.com/events/#&product=browser", true}, //
+                {"https://foo.com/blah_blah", true}, //
+                {"ftp://foo.com/blah_blah", true}, //
+                {"http://.", true}, //
+                {"http://?", true}, //
+                {"http://#", true}, //
+                {"http://##", true}, //
+                {"//", false}, //
+                {"//a", false}, //
+                {"///a", false}, //
+                {"///", false}, //
+                {"h://test", true}, //
+                {"http://-error-.invalid/", true} //
+        };
+    }
+
+    @Test(dataProvider = "exampleUrls")
+    public void checkUrl(String testUrl, Boolean acceptUrl) {
+        String testTextPrefix = "just a short test ";
+        String testTextSuffix = " here";
+
+        String expectedResult = testTextPrefix + (acceptUrl ? " " : testUrl) + testTextSuffix;
+
+        UrlTextFilter filter = UrlTextFilter.getInstance();
+        String result = filter.filter(testTextPrefix + testUrl + testTextSuffix);
+
+        assertEquals(result, expectedResult);
+    }
+}