-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSuffixArray.java
252 lines (220 loc) · 6.74 KB
/
SuffixArray.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
import java.util.Arrays;
/**
* The SuffixArray class represents a sorted list of suffix strings.
*
* It supports the following operations: computing the length() of the text,
* selectAsString() the ith smallest suffix, getting the indexOf() select(i),
* the length of longestCommonPrefix() of select(i), and determining the rank()
* of a key string (number of suffixes less than the specified key).
*
* This implementation uses a nested class Suffix to represent a suffix of the
* specified text (using constant time and space) and relies on Arrays.sort() to
* sort the array of suffixes.
*
* The length() and indexOf() methods take constant time in the worst case. The
* longestCommonPrefix() method takes time proportional to the length of the
* longest common prefix. The selectAsString() method takes time proportional to
* the length of the suffix.
*
* See SuffixArrayOptimzed.java for an optimized version that uses 3-way radix
* quicksort and does not use the nested class Suffix.
*/
public class SuffixArray {
/**
* The array of suffixes.
*/
private Suffix[] suffixes;
/**
* Build an array of Suffix objects for the given text String, and sort them.
*
* The purpose of sorting is that index(i) just returns the index associated
* with suffixes[i].
*
* @param text the input String
*/
public SuffixArray(String text) {
// The problem with this implementation is that it uses quadratic time and
// quadratic space.
int textLength = text.length();
suffixes = new Suffix[textLength];
for (int i = 0; i < textLength; i++) {
suffixes[i] = new Suffix(text, i);
}
// The efficiency of this suffix sort depends on the fact that each suffix
// is represented by a reference to the text string and the index of its
// first charecter.
Arrays.sort(suffixes);
}
/**
* This nested class represents a suffix of a text string.
*
* A Suffix has two instance variables:
* - a String reference to the text string
* - an int index of its first charecter
*
* It also provides four utility methods, namely length(), charAt(i),
* toString(), and compareTo().
*/
private static class Suffix implements Comparable<Suffix> {
/**
* Reference to text string.
*/
private final String text;
/**
* Index to suffix's first char.
*/
private final int index;
private Suffix(String text, int index) {
this.text = text;
this.index = index;
}
/**
* Returns the length of this Suffix.
*/
private int length() {
return text.length() - index;
}
/**
* Returns the ith character in the suffix.
*/
private char charAt(int i) {
return text.charAt(index + i);
}
/**
* Returns a String representation of the suffix.
*/
public String toString() {
return text.substring(index);
}
/**
* Compares two suffixes, for use in sorting.
*
* When two suffixes are exchanged, onley their references are being
* exchanged, not the whole suffixes. Hence, the cost of comparing two
* suffixes may be proportional to the length of the suffixes in the case
* when their common prefix is very long.
*/
public int compareTo(Suffix that) {
if (this == that) {
return 0;
}
int length = Math.min(this.length(), that.length());
for (int i = 0; i < length; i++) {
if (this.charAt(i) < that.charAt(i)) {
return -1;
}
if (this.charAt(i) > that.charAt(i)) {
return 1;
}
}
return this.length() - that.length();
}
}
/**
* Returns the length of the input text.
*
* @return the length of the input text
*/
public int length() {
return suffixes.length;
}
/**
* Returns the index into the original string of the ith smallest suffix.
*
* @param i an integer between 1 and suffixes.length - 1
* @return the index into the original string of the ith smallest suffix
* @throws java.lang.IndexOutOfBoundsException unless
* 0 < i <= suffixes.length
*/
public int indexOf(int i) {
if (i < 0 || i >= suffixes.length) {
throw new IndexOutOfBoundsException();
}
return suffixes[i].index;
}
/**
* Returns the ith smallest suffix as a String. Note: this method should be
* used primarily for debugging purposes.
*
* @param i an integer between 1 and suffixes.length - 1
* @return the ith smallest suffix as a String
* @throws java.lang.IndexOutOfBoundsException unless
* 0 < i <= suffixes.length
*/
public String selectAsString(int i) {
if (i < 0 || i >= suffixes.length) {
throw new IndexOutOfBoundsException();
}
return suffixes[i].toString();
}
/**
* Returns the length of the longest common prefix of the ith smallest suffix
* and the i-1st smallest suffix.
*
* @param i an integer between 1 and suffixes.length - 1
* @return the length of the longest common prefix of the ith smallest suffix
* and the i-1st smallest suffix.
* @throws java.lang.IndexOutOfBoundsException unless
* 1 < i <= suffixes.length
*/
public int longestCommonPreffix(int i) {
if (i < 1 || i >= suffixes.length) {
throw new IndexOutOfBoundsException();
}
return longestCommonPreffix(suffixes[i], suffixes[i-1]);
}
/**
* Find the longest common prefix of the two specified suffixes.
*
* This is a brute-force solution and takes time proportional to the length of
* the match.
*/
private int longestCommonPreffix(Suffix s, Suffix t) {
int length = Math.min(s.length(), t.length());
for (int i = 0; i < length; i++) {
if (s.charAt(i) != t.charAt(i)) {
return i;
}
}
return length;
}
/**
* Returns the number of suffixes strictly less that the specified key.
*
* @param key the query string
* @return the number of suffixes strictly less than key
*/
public int rank(String key) {
int lo = 0;
int hi = suffixes.length - 1;
// Perform a binary search
while (lo <= hi) {
// Key is in suffixes[lo..hi] or not present.
int mid = lo + (hi - lo) / 2;
int cmp = compare(key, suffixes[mid]);
if (cmp < 0) {
hi = mid - 1;
} else if (cmp > 0) {
lo = mid + 1;
} else {
return mid;
}
}
return lo;
}
/**
* Compares key string to this suffix.
*/
private int compare(String key, Suffix suffix) {
int length = Math.min(key.length(), suffix.length());
for (int i = 0; i < length; i++) {
if (key.charAt(i) < suffix.charAt(i)) {
return -1;
}
if (key.charAt(i) > suffix.charAt(i)) {
return 1;
}
}
return key.length() - suffix.length();
}
}