diff --git a/java/com/google/re2j/Machine.java b/java/com/google/re2j/Machine.java index ba785917..17e67100 100644 --- a/java/com/google/re2j/Machine.java +++ b/java/com/google/re2j/Machine.java @@ -156,11 +156,8 @@ private void initNewCap(int ncap) { this.matchcap = new int[ncap]; } - int[] submatches() { - if (ncap == 0) { - return Utils.EMPTY_INTS; - } - return Arrays.copyOf(matchcap, ncap); + void submatches(int[] cap) { + System.arraycopy(matchcap, 0, cap, 0, Math.min(cap.length, ncap)); } // alloc() allocates a new thread with the given instruction. @@ -217,7 +214,7 @@ boolean match(MachineInput in, int pos, int anchor) { if (startCond == Utils.EMPTY_ALL) { // impossible return false; } - if ((anchor == RE2.ANCHOR_START || anchor == RE2.ANCHOR_BOTH) && pos != 0) { + if ((anchor == RE2.ANCHOR_START || anchor == RE2.ANCHOR_BOTH) && pos != in.begPos()) { return false; } matched = false; @@ -234,7 +231,7 @@ boolean match(MachineInput in, int pos, int anchor) { width1 = r & 7; } int flag; // bitmask of EMPTY_* flags - if (pos == 0) { + if (pos == in.begPos()) { flag = Utils.emptyOpContext(-1, rune); } else { flag = in.context(pos); @@ -242,7 +239,7 @@ boolean match(MachineInput in, int pos, int anchor) { for (; ; ) { if (runq.isEmpty()) { - if ((startCond & Utils.EMPTY_BEGIN_TEXT) != 0 && pos != 0) { + if ((startCond & Utils.EMPTY_BEGIN_TEXT) != 0 && pos != in.begPos()) { // Anchored match, past beginning of text. break; } @@ -265,7 +262,7 @@ boolean match(MachineInput in, int pos, int anchor) { width1 = r & 7; } } - if (!matched && (pos == 0 || anchor == RE2.UNANCHORED)) { + if (!matched && (pos == in.begPos() || anchor == RE2.UNANCHORED)) { // If we are anchoring at begin then only add threads that begin // at |pos| = 0. 
if (ncap > 0) { diff --git a/java/com/google/re2j/MachineInput.java b/java/com/google/re2j/MachineInput.java index f69ba043..ec69fd5f 100644 --- a/java/com/google/re2j/MachineInput.java +++ b/java/com/google/re2j/MachineInput.java @@ -53,9 +53,17 @@ static MachineInput fromUTF16(CharSequence s, int start, int end) { // Returns a bitmask of EMPTY_* flags. abstract int context(int pos); + // Returns the start position in the same units as step(). + abstract int begPos(); + // Returns the end position in the same units as step(). abstract int endPos(); + /* Returns a slice of current input, using absolute positions + * (that is, not relative to current bounds). + * The resulting slice is a subset of the current input. */ + abstract MachineInput region(int beg, int end); + //// Implementations // An implementation of MachineInput for UTF-8 byte arrays. @@ -84,7 +92,6 @@ private static class UTF8Input extends MachineInput { @Override int step(int i) { - i += start; if (i >= end) { return EOF; } @@ -133,14 +140,12 @@ boolean canCheckPrefix() { @Override int index(RE2 re2, int pos) { - pos += start; int i = Utils.indexOf(b, re2.prefixUTF8, pos); return i < 0 ? i : i - pos; } @Override int context(int pos) { - pos += this.start; int r1 = -1; if (pos > this.start && pos <= this.end) { int start = pos - 1; @@ -164,10 +169,23 @@ int context(int pos) { return Utils.emptyOpContext(r1, r2); } + @Override + int begPos() { + return start; + } + @Override int endPos() { return end; } + + @Override + MachineInput region(int beg, int end) { + int newbeg = Math.max(beg, this.start); + int newend = Math.min(end, this.end); + return newbeg == this.start && newend == this.end + ? this : new UTF8Input(b, newbeg, newend); + } } // |pos| and |width| are in Java "char" units. 
@@ -184,7 +202,6 @@ public UTF16Input(CharSequence str, int start, int end) { @Override int step(int pos) { - pos += start; if (pos < end) { int rune = Character.codePointAt(str, pos); return rune << 3 | Character.charCount(rune); @@ -200,24 +217,35 @@ boolean canCheckPrefix() { @Override int index(RE2 re2, int pos) { - pos += start; int i = indexOf(str, re2.prefix, pos); return i < 0 ? i : i - pos; } @Override int context(int pos) { - pos += start; int r1 = pos > 0 && pos <= str.length() ? Character.codePointBefore(str, pos) : -1; int r2 = pos < str.length() ? Character.codePointAt(str, pos) : -1; return Utils.emptyOpContext(r1, r2); } + @Override + int begPos() { + return start; + } + @Override int endPos() { return end; } + @Override + MachineInput region(int beg, int end) { + int newbeg = Math.max(beg, this.start); + int newend = Math.min(end, this.end); + return newbeg == this.start && newend == this.end + ? this : new UTF16Input(str, newbeg, newend); + } + private int indexOf(CharSequence hayStack, String needle, int pos) { if (hayStack instanceof String) { return ((String) hayStack).indexOf(needle, pos); diff --git a/java/com/google/re2j/MatcherInput.java b/java/com/google/re2j/MatcherInput.java index 8af494ba..862be217 100644 --- a/java/com/google/re2j/MatcherInput.java +++ b/java/com/google/re2j/MatcherInput.java @@ -47,6 +47,8 @@ static MatcherInput utf8(String input) { abstract int length(); + abstract MachineInput region(int beg, int end); + static class Utf8MatcherInput extends MatcherInput { byte[] bytes; @@ -73,6 +75,11 @@ public byte[] asBytes() { public int length() { return bytes.length; } + + @Override + public MachineInput region(int beg, int end) { + return MachineInput.fromUTF8(bytes, beg, end); + } } static class Utf16MatcherInput extends MatcherInput { @@ -101,5 +108,10 @@ public byte[] asBytes() { public int length() { return charSequence.length(); } + + @Override + public MachineInput region(int beg, int end) { + return 
MachineInput.fromUTF16(charSequence, beg, end); + } } } diff --git a/java/com/google/re2j/RE2.java b/java/com/google/re2j/RE2.java index 46837354..366e9805 100644 --- a/java/com/google/re2j/RE2.java +++ b/java/com/google/re2j/RE2.java @@ -20,7 +20,6 @@ package com.google.re2j; -import com.google.re2j.MatcherInput.Encoding; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; @@ -292,6 +291,11 @@ public String toString() { // the position of its subexpressions. // Derived from exec.go. private int[] doExecute(MachineInput in, int pos, int anchor, int ncap) { + int[] cap = ncap == 0 ? Utils.EMPTY_INTS : new int[ncap]; + return doExecute(in, pos, anchor, ncap, cap) ? cap : null; + } + + private boolean doExecute(MachineInput in, int pos, int anchor, int ncap, int[] cap) { Machine m = get(); // The Treiber stack cannot reuse nodes, unless the node to be reused has only ever been at // the bottom of the stack (i.e., next == null). @@ -305,9 +309,12 @@ private int[] doExecute(MachineInput in, int pos, int anchor, int ncap) { } m.init(ncap); - int[] cap = m.match(in, pos, anchor) ? 
m.submatches() : null; + boolean ok = m.match(in, pos, anchor); + if (ok && cap != null) { + m.submatches(cap); + } put(m, isNew); - return cap; + return ok; } /** @@ -336,9 +343,6 @@ boolean match(CharSequence input, int start, int end, int anchor, int[] group, i * @return true if a match was found */ boolean match(MatcherInput input, int start, int end, int anchor, int[] group, int ngroup) { - if (start > end) { - return false; - } // TODO(afrozm): We suspect that the correct code should look something // like the following: // doExecute(MachineInput.fromUTF16(input), start, anchor, 2*ngroup); @@ -346,20 +350,25 @@ boolean match(MatcherInput input, int start, int end, int anchor, int[] group, i // In Russ' own words: // That is, I believe doExecute needs to know the bounds of the whole input // as well as the bounds of the subpiece that is being searched. - MachineInput machineInput = - input.getEncoding() == Encoding.UTF_16 - ? MachineInput.fromUTF16(input.asCharSequence(), 0, end) - : MachineInput.fromUTF8(input.asBytes(), 0, end); - int[] groupMatch = doExecute(machineInput, start, anchor, 2 * ngroup); - - if (groupMatch == null) { - return false; - } + return match(input.region(0, end), start, anchor, group, ngroup); + } - if (group != null) { - System.arraycopy(groupMatch, 0, group, 0, groupMatch.length); - } - return true; + /** + * Matches the regular expression against the input slice region, starting the search at position + * pos, with the given anchoring. Records the submatch boundaries in group, which is [start, end) + * pairs of byte offsets. At most 2 * ngroup boundaries are recorded, limited by the length of the + * group array. It is most efficient not to ask for submatch boundaries. 
+ * + * @param region the input slice + * @param pos the position in the input to start the search + * @param anchor the anchoring flag (UNANCHORED, ANCHOR_START, ANCHOR_BOTH) + * @param group the array to fill with submatch positions + * @param ngroup the number of array pairs to fill in + * @return true if a match was found + */ + boolean match(MachineInput region, int pos, int anchor, int[] group, int ngroup) { + return pos >= region.begPos() && pos <= region.endPos() && + doExecute(region, pos, anchor, 2 * ngroup, group); } /**