Skip to content

Commit 92b4d11

Browse files
authored
Merge pull request #8 from MTDdk/master
adding the possibility for extracting www-based URLs as well
2 parents 5c1c3e0 + eb10bf4 commit 92b4d11

File tree

10 files changed

+323
-163
lines changed

10 files changed

+323
-163
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99

1010
# mvn
1111
target/
12+
/bin/

src/main/java/org/nibor/autolink/LinkExtractor.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import org.nibor.autolink.internal.EmailScanner;
44
import org.nibor.autolink.internal.Scanner;
55
import org.nibor.autolink.internal.UrlScanner;
6+
import org.nibor.autolink.internal.WwwUrlScanner;
67

78
import java.util.*;
89

@@ -14,10 +15,12 @@
1415
public class LinkExtractor {
1516

1617
private final Scanner urlScanner;
18+
private final Scanner wwwScanner;
1719
private final Scanner emailScanner;
1820

19-
private LinkExtractor(UrlScanner urlScanner, EmailScanner emailScanner) {
21+
private LinkExtractor(UrlScanner urlScanner, WwwUrlScanner wwwScanner, EmailScanner emailScanner) {
2022
this.urlScanner = urlScanner;
23+
this.wwwScanner = wwwScanner;
2124
this.emailScanner = emailScanner;
2225
}
2326

@@ -46,6 +49,8 @@ private Scanner trigger(char c) {
4649
return urlScanner;
4750
case '@':
4851
return emailScanner;
52+
case 'w':
53+
return wwwScanner;
4954
}
5055
return null;
5156
}
@@ -88,8 +93,9 @@ public Builder emailDomainMustHaveDot(boolean emailDomainMustHaveDot) {
8893
*/
8994
public LinkExtractor build() {
9095
UrlScanner urlScanner = linkTypes.contains(LinkType.URL) ? new UrlScanner() : null;
96+
WwwUrlScanner wwwScanner = linkTypes.contains(LinkType.WWW) ? new WwwUrlScanner() : null;
9197
EmailScanner emailScanner = linkTypes.contains(LinkType.EMAIL) ? new EmailScanner(emailDomainMustHaveDot) : null;
92-
return new LinkExtractor(urlScanner, emailScanner);
98+
return new LinkExtractor(urlScanner, wwwScanner, emailScanner);
9399
}
94100
}
95101

src/main/java/org/nibor/autolink/LinkType.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,9 @@ public enum LinkType {
1111
/**
1212
* Email address such as {@code foo@example.com}
1313
*/
14-
EMAIL
14+
EMAIL,
15+
/**
16+
* URL such as {@code www.example.com}
17+
*/
18+
WWW
1519
}

src/main/java/org/nibor/autolink/internal/LinkSpanImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ public int getBeginIndex() {
2929
public int getEndIndex() {
3030
return endIndex;
3131
}
32-
32+
3333
@Override
3434
public String toString() {
3535
return "Link{type=" + getType() + ", beginIndex=" + beginIndex + ", endIndex=" + endIndex + "}";

src/main/java/org/nibor/autolink/internal/Scanners.java

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,160 @@ public static boolean isAlnum(char c) {
1717
public static boolean isNonAscii(char c) {
1818
return c >= 0x80;
1919
}
20+
21+
public static final int findUrlEnd(CharSequence input, int beginIndex) {
22+
int round = 0;
23+
int square = 0;
24+
int curly = 0;
25+
boolean doubleQuote = false;
26+
boolean singleQuote = false;
27+
int last = beginIndex;
28+
loop:
29+
for (int i = beginIndex; i < input.length(); i++) {
30+
char c = input.charAt(i);
31+
switch (c) {
32+
case '\u0000':
33+
case '\u0001':
34+
case '\u0002':
35+
case '\u0003':
36+
case '\u0004':
37+
case '\u0005':
38+
case '\u0006':
39+
case '\u0007':
40+
case '\u0008':
41+
case '\t':
42+
case '\n':
43+
case '\u000B':
44+
case '\f':
45+
case '\r':
46+
case '\u000E':
47+
case '\u000F':
48+
case '\u0010':
49+
case '\u0011':
50+
case '\u0012':
51+
case '\u0013':
52+
case '\u0014':
53+
case '\u0015':
54+
case '\u0016':
55+
case '\u0017':
56+
case '\u0018':
57+
case '\u0019':
58+
case '\u001A':
59+
case '\u001B':
60+
case '\u001C':
61+
case '\u001D':
62+
case '\u001E':
63+
case '\u001F':
64+
case ' ':
65+
case '<':
66+
case '>':
67+
case '\u007F':
68+
case '\u0080':
69+
case '\u0081':
70+
case '\u0082':
71+
case '\u0083':
72+
case '\u0084':
73+
case '\u0085':
74+
case '\u0086':
75+
case '\u0087':
76+
case '\u0088':
77+
case '\u0089':
78+
case '\u008A':
79+
case '\u008B':
80+
case '\u008C':
81+
case '\u008D':
82+
case '\u008E':
83+
case '\u008F':
84+
case '\u0090':
85+
case '\u0091':
86+
case '\u0092':
87+
case '\u0093':
88+
case '\u0094':
89+
case '\u0095':
90+
case '\u0096':
91+
case '\u0097':
92+
case '\u0098':
93+
case '\u0099':
94+
case '\u009A':
95+
case '\u009B':
96+
case '\u009C':
97+
case '\u009D':
98+
case '\u009E':
99+
case '\u009F':
100+
// These can never be part of a URL, so stop now. See RFC 3986 and RFC 3987.
101+
// Some characters are not in the above list, even though they are not in "unreserved" or "reserved":
102+
// '"', '\\', '^', '`', '{', '|', '}'
103+
// The reason for this is that other link detectors also allow them. Also see below, we require
104+
// the quotes and the braces to be balanced.
105+
break loop;
106+
case '?':
107+
case '!':
108+
case '.':
109+
case ',':
110+
case ':':
111+
case ';':
112+
// These may be part of a URL but not at the end
113+
break;
114+
case '/':
115+
// This may be part of a URL and at the end, but not if the previous character can't be the end of a URL
116+
if (last == i - 1) {
117+
last = i;
118+
}
119+
break;
120+
case '(':
121+
round++;
122+
break;
123+
case ')':
124+
round--;
125+
if (round >= 0) {
126+
last = i;
127+
} else {
128+
// More closing than opening brackets, stop now
129+
break loop;
130+
}
131+
break;
132+
case '[':
133+
// Allowed in IPv6 address host
134+
square++;
135+
break;
136+
case ']':
137+
// Allowed in IPv6 address host
138+
square--;
139+
if (square >= 0) {
140+
last = i;
141+
} else {
142+
// More closing than opening brackets, stop now
143+
break loop;
144+
}
145+
break;
146+
case '{':
147+
curly++;
148+
break;
149+
case '}':
150+
curly--;
151+
if (curly >= 0) {
152+
last = i;
153+
} else {
154+
// More closing than opening brackets, stop now
155+
break loop;
156+
}
157+
break;
158+
case '"':
159+
doubleQuote = !doubleQuote;
160+
if (!doubleQuote) {
161+
last = i;
162+
}
163+
break;
164+
case '\'':
165+
singleQuote = !singleQuote;
166+
if (!singleQuote) {
167+
last = i;
168+
}
169+
break;
170+
default:
171+
last = i;
172+
}
173+
}
174+
return last;
175+
}
20176
}

0 commit comments

Comments
 (0)