Skip to content

Commit

Permalink
Issue optimaize#77 - better link detection for UrlTextFilter added
Browse files Browse the repository at this point in the history
  • Loading branch information
nbartels committed Jul 8, 2017
1 parent 1a322c4 commit ce6fa91
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 7 deletions.
5 changes: 5 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,11 @@
<artifactId>guava</artifactId>
<version>19.0</version>
</dependency>
<dependency><!-- used for url and email detection -->
<groupId>org.nibor.autolink</groupId>
<artifactId>autolink</artifactId>
<version>0.6.0</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
Expand Down
23 changes: 16 additions & 7 deletions src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.optimaize.langdetect.text;

import java.util.regex.Pattern;
import org.nibor.autolink.*;

import java.util.EnumSet;

/**
* Removes URLs and email addresses from the text.
Expand All @@ -25,8 +26,9 @@
*/
public class UrlTextFilter implements TextFilter {

private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
private static final LinkExtractor linkExtractor = LinkExtractor.builder()
.linkTypes(EnumSet.of(LinkType.URL, LinkType.WWW, LinkType.EMAIL))
.build();

private static final UrlTextFilter INSTANCE = new UrlTextFilter();

Expand All @@ -38,9 +40,16 @@ private UrlTextFilter() {
}

@Override
public String filter(CharSequence text) {
String modified = URL_REGEX.matcher(text).replaceAll(" ");
return MAIL_REGEX.matcher(modified).replaceAll(" ");
public String filter(CharSequence originalText) {
    // Step 1: find the spans the extractor recognises as links (it is configured for URL, WWW and EMAIL types).
    final Iterable<LinkSpan> detectedLinks = linkExtractor.extractLinks(originalText);
    // Step 2: rebuild the text, letting the renderer decide what is written in place of each span.
    final LinkRenderer blankingRenderer = new NullRenderer();
    return Autolink.renderLinks(originalText, detectedLinks, blankingRenderer);
}

private class NullRenderer implements LinkRenderer {

@Override
public void render(LinkSpan link, CharSequence text, StringBuilder sb) {
sb.append(" ");
}
}

}
74 changes: 74 additions & 0 deletions src/test/java/com/optimaize/langdetect/text/UrlTextFilterTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright 2011 Fabian Kessler
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.optimaize.langdetect.text;

import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import static org.testng.Assert.assertEquals;

/**
 * Data-driven test for {@link UrlTextFilter}: each sample token is embedded in a
 * short sentence, and the filtered sentence is compared against the expectation
 * that the token is either replaced by a single space or left untouched.
 */
public class UrlTextFilterTest {

    /** Text placed before the sample token; ends with a space so the token is a separate word. */
    private static final String TEXT_PREFIX = "just a short test ";

    /** Text placed after the sample token; starts with a space so the token is a separate word. */
    private static final String TEXT_SUFFIX = " here";

    /**
     * Supplies the sample tokens.
     *
     * @return rows of {@code {token, expectedToBeRemoved}}; {@code true} means the
     *         filter is expected to replace the token with a single space,
     *         {@code false} means the token must survive unchanged
     */
    @DataProvider(name = "exampleUrls")
    public static Object[][] data() {
        return new Object[][]{ //
                {"http://foo.com/blah_blah", true}, //
                {"http://foo.com:8080/blah_blah", true}, //
                {"http://foo.com/blah_blah/", true}, //
                {"http://foo-bar.com/blah_blah/", true}, //
                {"http://foo.com/blah_blah_(wikipedia)", true}, //
                {"http://foo.com/blah_blah_(wikipedia)_(again)", true}, //
                {"http://www.example.com/wpstyle/?p=364", true}, //
                {"https://www.example.com/foo/?bar=baz&inga=42&quux", true}, //
                // URLs carrying a user-info part (user or user:password before the host).
                {"http://userid:password@example.com:8080", true}, //
                {"http://userid:password@example.com:8080/", true}, //
                {"http://userid@example.com", true}, //
                {"http://userid@example.com/", true}, //
                {"http://userid@example.com:8080", true}, //
                {"http://userid@example.com:8080/", true}, //
                {"http://userid:password@example.com", true}, //
                {"http://foo.com/blah_(wikipedia)#cite-1", true}, //
                {"http://foo.com/blah_(wikipedia)_blah#cite-1", true}, //
                {"http://foo.com/(something)?after=parens", true}, //
                {"http://code.google.com/events/#&product=browser", true}, //
                {"https://foo.com/blah_blah", true}, //
                {"ftp://foo.com/blah_blah", true}, //
                {"http://.", true}, //
                {"http://?", true}, //
                {"http://#", true}, //
                {"http://##", true}, //
                // Tokens without a scheme must not be treated as links.
                {"//", false}, //
                {"//a", false}, //
                {"///a", false}, //
                {"///", false}, //
                {"h://test", true}, //
                {"http://-error-.invalid/", true} //
        };
    }

    /**
     * Runs the filter on one sample token surrounded by plain words.
     *
     * @param testUrl   the token under test
     * @param acceptUrl whether the filter is expected to replace the token with a space
     */
    @Test(dataProvider = "exampleUrls")
    public void checkUrl(String testUrl, Boolean acceptUrl) {
        // A removed link leaves exactly one space behind, in addition to the
        // spaces that already belong to the prefix and the suffix.
        String expectedMiddle = acceptUrl ? " " : testUrl;
        String expectedResult = TEXT_PREFIX + expectedMiddle + TEXT_SUFFIX;

        UrlTextFilter filter = UrlTextFilter.getInstance();
        String result = filter.filter(TEXT_PREFIX + testUrl + TEXT_SUFFIX);

        assertEquals(result, expectedResult);
    }
}

0 comments on commit ce6fa91

Please sign in to comment.