Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions java/com/google/re2j/Machine.java
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,8 @@ private void initNewCap(int ncap) {
this.matchcap = new int[ncap];
}

int[] submatches() {
if (ncap == 0) {
return Utils.EMPTY_INTS;
}
return Arrays.copyOf(matchcap, ncap);
// submatches() copies the capture positions recorded in matchcap into the
// caller-supplied array |cap|. At most min(cap.length, ncap) entries are
// copied, so a |cap| shorter than ncap is filled completely and never
// overrun; entries of |cap| beyond ncap are left untouched.
void submatches(int[] cap) {
System.arraycopy(matchcap, 0, cap, 0, Math.min(cap.length, ncap));
}

// alloc() allocates a new thread with the given instruction.
Expand Down Expand Up @@ -217,7 +214,7 @@ boolean match(MachineInput in, int pos, int anchor) {
if (startCond == Utils.EMPTY_ALL) { // impossible
return false;
}
if ((anchor == RE2.ANCHOR_START || anchor == RE2.ANCHOR_BOTH) && pos != 0) {
if ((anchor == RE2.ANCHOR_START || anchor == RE2.ANCHOR_BOTH) && pos != in.begPos()) {
return false;
}
matched = false;
Expand All @@ -234,15 +231,15 @@ boolean match(MachineInput in, int pos, int anchor) {
width1 = r & 7;
}
int flag; // bitmask of EMPTY_* flags
if (pos == 0) {
if (pos == in.begPos()) {
flag = Utils.emptyOpContext(-1, rune);
} else {
flag = in.context(pos);
}
for (; ; ) {

if (runq.isEmpty()) {
if ((startCond & Utils.EMPTY_BEGIN_TEXT) != 0 && pos != 0) {
if ((startCond & Utils.EMPTY_BEGIN_TEXT) != 0 && pos != in.begPos()) {
// Anchored match, past beginning of text.
break;
}
Expand All @@ -265,7 +262,7 @@ boolean match(MachineInput in, int pos, int anchor) {
width1 = r & 7;
}
}
if (!matched && (pos == 0 || anchor == RE2.UNANCHORED)) {
if (!matched && (pos == in.begPos() || anchor == RE2.UNANCHORED)) {
// If we are anchoring at begin then only add threads that begin
// at the start of the input, i.e. |pos| == in.begPos().
if (ncap > 0) {
Expand Down
40 changes: 34 additions & 6 deletions java/com/google/re2j/MachineInput.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,17 @@ static MachineInput fromUTF16(CharSequence s, int start, int end) {
// Returns a bitmask of EMPTY_* flags.
abstract int context(int pos);

// Returns the start position in the same units as step().
abstract int begPos();

// Returns the end position in the same units as step().
abstract int endPos();

/* Returns a slice of the current input. |beg| and |end| are absolute
 * positions (that is, they are not relative to the current bounds).
 * The resulting slice is always a subset of the current input. */
abstract MachineInput region(int beg, int end);

//// Implementations

// An implementation of MachineInput for UTF-8 byte arrays.
Expand Down Expand Up @@ -84,7 +92,6 @@ private static class UTF8Input extends MachineInput {

@Override
int step(int i) {
i += start;
if (i >= end) {
return EOF;
}
Expand Down Expand Up @@ -133,14 +140,12 @@ boolean canCheckPrefix() {

@Override
int index(RE2 re2, int pos) {
pos += start;
int i = Utils.indexOf(b, re2.prefixUTF8, pos);
return i < 0 ? i : i - pos;
}

@Override
int context(int pos) {
pos += this.start;
int r1 = -1;
if (pos > this.start && pos <= this.end) {
int start = pos - 1;
Expand All @@ -164,10 +169,23 @@ int context(int pos) {
return Utils.emptyOpContext(r1, r2);
}

// Returns the start bound of this input (the |start| field), in the same
// units as step().
@Override
int begPos() {
return start;
}

// Returns the end bound of this input (the |end| field), in the same
// units as step(). step() reports EOF for any position at or past it.
@Override
int endPos() {
return end;
}

// Returns the part of this input lying between the absolute positions
// |beg| and |end|. The requested bounds are clamped to this input's own
// bounds, so the result never extends past the current input.
@Override
MachineInput region(int beg, int end) {
  final int clampedBeg = beg < this.start ? this.start : beg;
  final int clampedEnd = end > this.end ? this.end : end;
  if (clampedBeg == this.start && clampedEnd == this.end) {
    // Nothing was narrowed: this input already is the requested slice.
    return this;
  }
  return new UTF8Input(b, clampedBeg, clampedEnd);
}
}

// |pos| and |width| are in Java "char" units.
Expand All @@ -184,7 +202,6 @@ public UTF16Input(CharSequence str, int start, int end) {

@Override
int step(int pos) {
pos += start;
if (pos < end) {
int rune = Character.codePointAt(str, pos);
return rune << 3 | Character.charCount(rune);
Expand All @@ -200,24 +217,35 @@ boolean canCheckPrefix() {

@Override
int index(RE2 re2, int pos) {
pos += start;
int i = indexOf(str, re2.prefix, pos);
return i < 0 ? i : i - pos;
}

@Override
int context(int pos) {
pos += start;
int r1 = pos > 0 && pos <= str.length() ? Character.codePointBefore(str, pos) : -1;
int r2 = pos < str.length() ? Character.codePointAt(str, pos) : -1;
return Utils.emptyOpContext(r1, r2);
}

// Returns the start bound of this input (the |start| field), in the same
// units as step().
@Override
int begPos() {
return start;
}

// Returns the end bound of this input (the |end| field), in the same
// units as step().
@Override
int endPos() {
return end;
}

// Returns the part of this input lying between the absolute positions
// |beg| and |end|. The requested bounds are clamped to this input's own
// bounds, so the result never extends past the current input.
@Override
MachineInput region(int beg, int end) {
  final int clampedBeg = beg < this.start ? this.start : beg;
  final int clampedEnd = end > this.end ? this.end : end;
  if (clampedBeg == this.start && clampedEnd == this.end) {
    // Nothing was narrowed: this input already is the requested slice.
    return this;
  }
  return new UTF16Input(str, clampedBeg, clampedEnd);
}

private int indexOf(CharSequence hayStack, String needle, int pos) {
if (hayStack instanceof String) {
return ((String) hayStack).indexOf(needle, pos);
Expand Down
12 changes: 12 additions & 0 deletions java/com/google/re2j/MatcherInput.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ static MatcherInput utf8(String input) {

abstract int length();

abstract MachineInput region(int beg, int end);

static class Utf8MatcherInput extends MatcherInput {
byte[] bytes;

Expand All @@ -73,6 +75,11 @@ public byte[] asBytes() {
public int length() {
return bytes.length;
}

// Wraps the backing byte array in a UTF-8 MachineInput bounded by the
// given |beg| and |end| positions. No bounds checking is done here.
@Override
public MachineInput region(int beg, int end) {
return MachineInput.fromUTF8(bytes, beg, end);
}
}

static class Utf16MatcherInput extends MatcherInput {
Expand Down Expand Up @@ -101,5 +108,10 @@ public byte[] asBytes() {
public int length() {
return charSequence.length();
}

// Wraps the backing CharSequence in a UTF-16 MachineInput bounded by the
// given |beg| and |end| positions. No bounds checking is done here.
@Override
public MachineInput region(int beg, int end) {
return MachineInput.fromUTF16(charSequence, beg, end);
}
}
}
47 changes: 28 additions & 19 deletions java/com/google/re2j/RE2.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

package com.google.re2j;

import com.google.re2j.MatcherInput.Encoding;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -292,6 +291,11 @@ public String toString() {
// the position of its subexpressions.
// Derived from exec.go.
private int[] doExecute(MachineInput in, int pos, int anchor, int ncap) {
  // Allocate the result array up front; when no captures are requested the
  // shared empty array is used instead of a fresh allocation.
  final int[] captures;
  if (ncap == 0) {
    captures = Utils.EMPTY_INTS;
  } else {
    captures = new int[ncap];
  }
  // Delegate to the array-filling overload; a failed match is reported as null.
  if (!doExecute(in, pos, anchor, ncap, captures)) {
    return null;
  }
  return captures;
}

private boolean doExecute(MachineInput in, int pos, int anchor, int ncap, int[] cap) {
Machine m = get();
// The Treiber stack cannot reuse nodes, unless the node to be reused has only ever been at
// the bottom of the stack (i.e., next == null).
Expand All @@ -305,9 +309,12 @@ private int[] doExecute(MachineInput in, int pos, int anchor, int ncap) {
}

m.init(ncap);
int[] cap = m.match(in, pos, anchor) ? m.submatches() : null;
boolean ok = m.match(in, pos, anchor);
if (ok && cap != null) {
m.submatches(cap);
}
put(m, isNew);
return cap;
return ok;
}

/**
Expand Down Expand Up @@ -336,30 +343,32 @@ boolean match(CharSequence input, int start, int end, int anchor, int[] group, i
* @return true if a match was found
*/
boolean match(MatcherInput input, int start, int end, int anchor, int[] group, int ngroup) {
if (start > end) {
return false;
}
// TODO(afrozm): We suspect that the correct code should look something
// like the following:
// doExecute(MachineInput.fromUTF16(input), start, anchor, 2*ngroup);
//
// In Russ' own words:
// That is, I believe doExecute needs to know the bounds of the whole input
// as well as the bounds of the subpiece that is being searched.
MachineInput machineInput =
input.getEncoding() == Encoding.UTF_16
? MachineInput.fromUTF16(input.asCharSequence(), 0, end)
: MachineInput.fromUTF8(input.asBytes(), 0, end);
int[] groupMatch = doExecute(machineInput, start, anchor, 2 * ngroup);

if (groupMatch == null) {
return false;
}
return match(input.region(0, end), start, anchor, group, ngroup);
}

if (group != null) {
System.arraycopy(groupMatch, 0, group, 0, groupMatch.length);
}
return true;
/**
* Matches the regular expression against the given input region, starting the search at position
* pos, with the given anchoring. Returns false without searching if pos lies outside the region's
* [begPos(), endPos()] bounds. Records the submatch boundaries in group, which is [start, end)
* pairs of offsets in the same units as the region's positions. The number of pairs requested is
* given by ngroup. It is most efficient not to ask for submatch boundaries.
*
* @param region the input slice
* @param pos the position in the input to start the search
* @param anchor the anchoring flag (UNANCHORED, ANCHOR_START, ANCHOR_BOTH)
* @param group the array to fill with submatch positions
* @param ngroup the number of array pairs to fill in
* @return true if a match was found
*/
boolean match(MachineInput region, int pos, int anchor, int[] group, int ngroup) {
  // A start position outside the region's bounds can never match.
  if (pos < region.begPos() || pos > region.endPos()) {
    return false;
  }
  // Each group needs two slots (start and end), hence 2 * ngroup captures.
  return doExecute(region, pos, anchor, 2 * ngroup, group);
}

/**
Expand Down