From 2f945fb56dd5805adc092701b68a678b2f7c7830 Mon Sep 17 00:00:00 2001 From: Paul Mucur Date: Thu, 30 Nov 2023 16:01:38 +0000 Subject: [PATCH] Expose more of RE2's matching interface GitHub: https://github.com/mudge/re2/issues/119 Add new options to `RE2::Regexp#match` that expose the underlying capabilities of RE2's Match function: * anchor: specifying whether a match should be unanchored (the default), anchored to the start of the text or anchored to both ends * startpos: the offset at which to start matching (defaults to the start of the text) * submatches: the number of submatches to extract (defaults to the number of capturing groups in the pattern) We keep compatibility with the previous API by still accepting a number of submatches as the second argument to match. With these new options in place, we can now offer a higher-level `RE2::Regexp#full_match` and `RE2::Regexp#partial_match` API to match RE2's own. Note we don't actually use the underlying `FullMatchN` or `PartialMatchN` functions as we need to use `Match`'s behaviour of returning the overall match first before any extracted submatches. The plan is to then heavily promote these two methods over the lower-level `match`. --- ext/re2/re2.cc | 121 ++++++++++++++++++------- lib/re2.rb | 1 + lib/re2/regexp.rb | 62 +++++++++++++ re2.gemspec | 1 + spec/re2/regexp_spec.rb | 192 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 338 insertions(+), 39 deletions(-) create mode 100644 lib/re2/regexp.rb diff --git a/ext/re2/re2.cc b/ext/re2/re2.cc index c59bb540..a358b036 100644 --- a/ext/re2/re2.cc +++ b/ext/re2/re2.cc @@ -47,8 +47,9 @@ VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet, /* Symbols used in RE2 options. 
*/ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors, id_max_mem, id_literal, id_never_nl, id_case_sensitive, - id_perl_classes, id_word_boundary, id_one_line, - id_unanchored, id_anchor_start, id_anchor_both, id_exception; + id_perl_classes, id_word_boundary, id_one_line, id_unanchored, + id_anchor, id_anchor_start, id_anchor_both, id_exception, + id_submatches, id_startpos; inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) { if (encoding == RE2::Options::EncodingUTF8) { @@ -1339,38 +1340,37 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) { * r = RE2::Regexp.new('woo') * r.match('woo') #=> true * - * @overload match(text, 0) - * Returns either true or false indicating whether a - * successful match was made. - * - * @param [String] text the text to search - * @return [Boolean] whether the match was successful - * @raise [NoMemoryError] if there was not enough memory to allocate the submatches - * @example - * r = RE2::Regexp.new('w(o)(o)') - * r.match('woo', 0) #=> true - * r.match('bob', 0) #=> false - * - * @overload match(text, number_of_submatches) + * @overload match(text, options) * See +match(text)+ but with a specific number of * submatches returned (padded with nils if necessary). 
* * @param [String] text the text to search - * @param [Integer] number_of_submatches the number of submatches to return - * @return [RE2::MatchData] the submatches - * @raise [ArgumentError] if given a negative number of submatches + * @param [Hash] options the options with which to perform the match + * @option options [Integer] :startpos (0) offset at which to start matching + * @option options [Symbol] :anchor (:unanchored) one of :unanchored, :anchor_start, :anchor_both to anchor the match + * @option options [Integer] :submatches how many submatches to extract (0 is + * fastest), defaults to the number of capturing groups + * @return [RE2::MatchData] if extracting any submatches + * @return [Boolean] if not extracting any submatches + * @raise [ArgumentError] if given a negative number of submatches or invalid anchor * @raise [NoMemoryError] if there was not enough memory to allocate the matches + * @raise [TypeError] if given non-String text, non-numeric number of + * submatches, non-symbol anchor or non-hash options * @example * r = RE2::Regexp.new('w(o)(o)') - * r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o"> - * r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil> + * r.match('woo', submatches: 1) # => #<RE2::MatchData "woo" 1:"o"> + * r.match('woo', submatches: 3) # => #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil> + * r.match('woot', anchor: :anchor_both, submatches: 0) + * # => false + * r.match('woot', anchor: :anchor_start, submatches: 0) + * # => true */ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) { re2_pattern *p; re2_matchdata *m; - VALUE text, number_of_submatches; + VALUE text, options; - rb_scan_args(argc, argv, "11", &text, &number_of_submatches); + rb_scan_args(argc, argv, "11", &text, &options); /* Ensure text is a string. 
*/ StringValue(text); @@ -1378,12 +1378,64 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) { TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p); int n; + int startpos = 0; + RE2::Anchor anchor = RE2::UNANCHORED; + + if (RTEST(options)) { + if (FIXNUM_P(options)) { + n = NUM2INT(options); - if (RTEST(number_of_submatches)) { - n = NUM2INT(number_of_submatches); + if (n < 0) { + rb_raise(rb_eArgError, "number of matches should be >= 0"); + } + } else { + if (TYPE(options) != T_HASH) { + options = rb_Hash(options); + } - if (n < 0) { - rb_raise(rb_eArgError, "number of matches should be >= 0"); + VALUE anchor_option = rb_hash_aref(options, ID2SYM(id_anchor)); + if (!NIL_P(anchor_option)) { + Check_Type(anchor_option, T_SYMBOL); + + ID id_anchor_option = SYM2ID(anchor_option); + if (id_anchor_option == id_unanchored) { + anchor = RE2::UNANCHORED; + } else if (id_anchor_option == id_anchor_start) { + anchor = RE2::ANCHOR_START; + } else if (id_anchor_option == id_anchor_both) { + anchor = RE2::ANCHOR_BOTH; + } else { + rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both"); + } + } + + VALUE submatches_option = rb_hash_aref(options, ID2SYM(id_submatches)); + if (!NIL_P(submatches_option)) { + Check_Type(submatches_option, T_FIXNUM); + + n = NUM2INT(submatches_option); + + if (n < 0) { + rb_raise(rb_eArgError, "number of matches should be >= 0"); + } + } else { + if (!p->pattern->ok()) { + return Qnil; + } + + n = p->pattern->NumberOfCapturingGroups(); + } + + VALUE startpos_option = rb_hash_aref(options, ID2SYM(id_startpos)); + if (!NIL_P(startpos_option)) { + Check_Type(startpos_option, T_FIXNUM); + + startpos = NUM2INT(startpos_option); + + if (startpos < 0) { + rb_raise(rb_eArgError, "startpos should be >= 0"); + } + } } } else { if (!p->pattern->ok()) { @@ -1395,10 +1447,10 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) { if (n == 0) { #ifdef HAVE_ENDPOS_ARGUMENT 
- bool matched = p->pattern->Match(RSTRING_PTR(text), 0, - RSTRING_LEN(text), RE2::UNANCHORED, 0, 0); + bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, + RSTRING_LEN(text), anchor, 0, 0); #else - bool matched = p->pattern->Match(RSTRING_PTR(text), 0, RE2::UNANCHORED, + bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, anchor, 0, 0); #endif return BOOL2RUBY(matched); @@ -1423,11 +1475,11 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) { m->number_of_matches = n; #ifdef HAVE_ENDPOS_ARGUMENT - bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0, - RSTRING_LEN(m->text), RE2::UNANCHORED, m->matches, n); + bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos, + RSTRING_LEN(m->text), anchor, m->matches, n); #else - bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0, - RE2::UNANCHORED, m->matches, n); + bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos, + anchor, m->matches, n); #endif if (matched) { return matchdata; @@ -2032,7 +2084,10 @@ extern "C" void Init_re2(void) { id_word_boundary = rb_intern("word_boundary"); id_one_line = rb_intern("one_line"); id_unanchored = rb_intern("unanchored"); + id_anchor = rb_intern("anchor"); id_anchor_start = rb_intern("anchor_start"); id_anchor_both = rb_intern("anchor_both"); id_exception = rb_intern("exception"); + id_submatches = rb_intern("submatches"); + id_startpos = rb_intern("startpos"); } diff --git a/lib/re2.rb b/lib/re2.rb index 76d8fcec..3f2751e6 100644 --- a/lib/re2.rb +++ b/lib/re2.rb @@ -10,5 +10,6 @@ require 're2.so' end +require "re2/regexp" require "re2/scanner" require "re2/version" diff --git a/lib/re2/regexp.rb b/lib/re2/regexp.rb new file mode 100644 index 00000000..94790c0d --- /dev/null +++ b/lib/re2/regexp.rb @@ -0,0 +1,62 @@ +module RE2 + class Regexp + # Match the pattern against any substring of the given +text+ and return + # either a boolean (if no submatches are required) or a {RE2::MatchData} + # instance with the 
specified number of submatches (defaults to the total + # number of capturing groups). + # + # The number of submatches has a significant impact on performance: requesting + # one submatch is much faster than requesting more than one and requesting + # zero submatches is faster still. + # + # @param [String] text the text to search + # @param [Hash] options the options with which to perform the match + # @option options [Integer] :submatches how many submatches to extract (0 + # is fastest), defaults to the total number of capturing groups + # @return [RE2::MatchData] if extracting any submatches + # @return [Boolean] if not extracting any submatches + # @raise [ArgumentError] if given a negative number of submatches + # @raise [NoMemoryError] if there was not enough memory to allocate the + # matches + # @raise [TypeError] if given non-numeric submatches or non-hash options + # @example + # r = RE2::Regexp.new('w(o)(o)') + # r.partial_match('woot') + # # => #<RE2::MatchData "woo" 1:"o" 2:"o"> + # r.partial_match('woot', submatches: 1) # => #<RE2::MatchData "woo" 1:"o"> + # r.partial_match('woot', submatches: 0) # => true + def partial_match(text, options = {}) + match(text, Hash(options).merge(anchor: :unanchored)) + end + + # Match the pattern against the given +text+ exactly and return either a + # boolean (if no submatches are required) or a {RE2::MatchData} instance + # with the specified number of submatches (defaults to the total number of + # capturing groups). + # + # The number of submatches has a significant impact on performance: requesting + # one submatch is much faster than requesting more than one and requesting + # zero submatches is faster still. 
+ # + # @param [String] text the text to search + # @param [Hash] options the options with which to perform the match + # @option options [Integer] :submatches how many submatches to extract (0 + # is fastest), defaults to the total number of capturing groups + # @return [RE2::MatchData] if extracting any submatches + # @return [Boolean] if not extracting any submatches + # @raise [ArgumentError] if given a negative number of submatches + # @raise [NoMemoryError] if there was not enough memory to allocate the + # matches + # @raise [TypeError] if given non-numeric submatches or non-hash options + # @example + # r = RE2::Regexp.new('w(o)(o)') + # r.full_match('woo') + # # => #<RE2::MatchData "woo" 1:"o" 2:"o"> + # r.full_match('woo', submatches: 1) # => #<RE2::MatchData "woo" 1:"o"> + # r.full_match('woo', submatches: 0) # => true + # r.full_match('woot') # => nil + def full_match(text, options = {}) + match(text, Hash(options).merge(anchor: :anchor_both)) + end + end +end diff --git a/re2.gemspec b/re2.gemspec index 8fa405da..08a7db05 100644 --- a/re2.gemspec +++ b/re2.gemspec @@ -17,6 +17,7 @@ Gem::Specification.new do |s| "ext/re2/recipes.rb", "Gemfile", "lib/re2.rb", + "lib/re2/regexp.rb", "lib/re2/scanner.rb", "lib/re2/string.rb", "lib/re2/version.rb", diff --git a/spec/re2/regexp_spec.rb b/spec/re2/regexp_spec.rb index 8f226f27..70c16e97 100644 --- a/spec/re2/regexp_spec.rb +++ b/spec/re2/regexp_spec.rb @@ -313,21 +313,100 @@ expect { re.match(nil) }.to raise_error(TypeError) end - it "raises an exception when given an inappropriate number of matches" do - expect { re.match("My name is Robert Paulson", {}) }.to raise_error(TypeError) + it "raises an exception when given invalid options" do + expect { re.match("My name is Robert Paulson", "foo") }.to raise_error(TypeError) end - it "raises an exception when given a negative number of matches" do - expect { re.match("My name is Robert Paulson", -1) }.to raise_error(ArgumentError, "number of matches should be >= 0") + it "accepts anything that can be coerced to a hash as 
options", :aggregate_failures do + m = re.match("My name is Robert Paulson", nil) + expect(m[1]).to eq("Robert") + + m = re.match("My name is Robert Paulson", []) + expect(m[1]).to eq("Robert") end it "returns nil with an invalid pattern" do re = RE2::Regexp.new('???', :log_errors => false) + expect(re.match('My name is Robert Paulson')).to be_nil end + it "returns nil with an invalid pattern and options" do + re = RE2::Regexp.new('???', :log_errors => false) + + expect(re.match('My name is Robert Paulson', submatches: 1)).to be_nil + end + + it "is unanchored by default", :aggregate_failures do + expect(re.match("My name is Robert Paulson", submatches: 0)).to eq(true) + expect(re.match("My name is Robert Paulson, he said", submatches: 0)).to eq(true) + expect(re.match("He said, My name is Robert Paulson", submatches: 0)).to eq(true) + end + + it "is unanchored if given a nil anchor", :aggregate_failures do + expect(re.match("My name is Robert Paulson", anchor: nil, submatches: 0)).to eq(true) + expect(re.match("My name is Robert Paulson, he said", anchor: nil, submatches: 0)).to eq(true) + expect(re.match("He said, My name is Robert Paulson", anchor: nil, submatches: 0)).to eq(true) + end + + it "can be explicitly unanchored", :aggregate_failures do + expect(re.match("My name is Robert Paulson", anchor: :unanchored, submatches: 0)).to eq(true) + expect(re.match("My name is Robert Paulson, he said", anchor: :unanchored, submatches: 0)).to eq(true) + expect(re.match("He said, My name is Robert Paulson", anchor: :unanchored, submatches: 0)).to eq(true) + end + + it "can anchor the match at both ends", :aggregate_failures do + expect(re.match("My name is Robert Paulson", anchor: :anchor_both, submatches: 0)).to eq(true) + expect(re.match("My name is Robert Paulson, he said", anchor: :anchor_both, submatches: 0)).to eq(false) + expect(re.match("He said, My name is Robert Paulson", anchor: :anchor_both, submatches: 0)).to eq(false) + end + + it "can anchor the match at 
the start", :aggregate_failures do + expect(re.match("My name is Robert Paulson", anchor: :anchor_start, submatches: 0)).to eq(true) + expect(re.match("My name is Robert Paulson, he said", anchor: :anchor_start, submatches: 0)).to eq(true) + expect(re.match("He said, My name is Robert Paulson", anchor: :anchor_start, submatches: 0)).to eq(false) + end + + it "raises an exception when given an invalid anchor" do + expect { re.match("My name is Robert Paulson", anchor: :invalid) }.to raise_error(ArgumentError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both") + end + + it "raises an exception when given a non-symbol anchor" do + expect { re.match("My name is Robert Paulson", anchor: 0) }.to raise_error(TypeError) + end + + it "can be given an offset at which to start matching", :aggregate_failures do + m = re.match("My name is Alice Bloggs My name is Robert Paulson", startpos: 24) + + expect(m[1]).to eq("Robert") + expect(m[2]).to eq("Paulson") + end + + it "does not match if given an offset past the end of the text", :aggregate_failures do + expect(re.match("My name is Alice Bloggs", startpos: 99)).to be_nil + end + + it "raises an exception when given a negative start position" do + expect { re.match("My name is Robert Paulson", startpos: -1) }.to raise_error(ArgumentError, "startpos should be >= 0") + end + + it "raises an exception when given a negative number of matches" do + expect { re.match("My name is Robert Paulson", submatches: -1) }.to raise_error(ArgumentError, "number of matches should be >= 0") + end + + it "raises an exception when given a non-numeric number of matches" do + expect { re.match("My name is Robert Paulson", submatches: "foo") }.to raise_error(TypeError) + end + + it "defaults to extracting all submatches when given nil", :aggregate_failures do + m = re.match("My name is Robert Paulson", submatches: nil) + + expect(m[1]).to eq("Robert") + expect(m[2]).to eq("Paulson") + end + describe "with a specific number of matches 
under the total in the pattern" do - subject { re.match("My name is Robert Paulson", 1) } + subject { re.match("My name is Robert Paulson", submatches: 1) } it "returns a match data object" do expect(subject).to be_a(RE2::MatchData) @@ -347,7 +426,7 @@ end describe "with a number of matches over the total in the pattern" do - subject { re.match("My name is Robert Paulson", 5) } + subject { re.match("My name is Robert Paulson", submatches: 5) } it "returns a match data object" do expect(subject).to be_a(RE2::MatchData) @@ -369,6 +448,20 @@ expect(subject[6]).to be_nil end end + + it "accepts the number of submatches as a second argument for compatibility", :aggregate_failures do + expect(re.match("My name is Robert Paulson", 0)).to eq(true) + + m = re.match("My name is Robert Paulson", 1) + expect(m[1]).to eq("Robert") + expect(m[2]).to be_nil + + m = re.match("My name is Robert Paulson", 2) + expect(m[1]).to eq("Robert") + expect(m[2]).to eq("Paulson") + + expect { re.match("My name is Robert Paulson", -1) }.to raise_error(ArgumentError, "number of matches should be >= 0") + end end describe "#match?" 
do @@ -475,4 +568,91 @@ expect(scanner).to be_a(RE2::Scanner) end end + + describe "#partial_match" do + it "matches the pattern anywhere within the given text" do + r = RE2::Regexp.new('f(o+)') + + expect(r.partial_match('foo bar', submatches: 0)).to eq(true) + end + + it "can set the number of submatches to extract", :aggregate_failures do + r = RE2::Regexp.new('f(o+)(a+)') + m = r.partial_match('fooaa bar', submatches: 1) + + expect(m[1]).to eq('oo') + expect(m[2]).to be_nil + + m = r.partial_match('fooaa bar', submatches: 2) + + expect(m[1]).to eq('oo') + expect(m[2]).to eq('aa') + end + + it "raises an error if given non-hash options" do + r = RE2::Regexp.new('f(o+)(a+)') + + expect { r.partial_match('fooaa bar', 'not a hash') }.to raise_error(TypeError) + end + + it "accepts options that can be coerced to a hash", :aggregate_failures do + r = RE2::Regexp.new('f(o+)(a+)') + + m = r.partial_match('fooaa bar', nil) + expect(m[1]).to eq('oo') + + m = r.partial_match('fooaa bar', []) + expect(m[1]).to eq('oo') + end + + it "accepts anything that can be coerced to a string" do + r = RE2::Regexp.new('f(o+)(a+)') + + expect(r.partial_match(StringLike.new('fooaa bar'), submatches: 0)).to eq(true) + end + end + + describe "#full_match" do + it "only matches the pattern if all of the given text matches", :aggregate_failures do + r = RE2::Regexp.new('f(o+)') + + expect(r.full_match('foo', submatches: 0)).to eq(true) + expect(r.full_match('foo bar', submatches: 0)).to eq(false) + end + + it "can set the number of submatches to extract", :aggregate_failures do + r = RE2::Regexp.new('f(o+)(a+)') + m = r.full_match('fooaa', submatches: 1) + + expect(m[1]).to eq('oo') + expect(m[2]).to be_nil + + m = r.full_match('fooaa', submatches: 2) + + expect(m[1]).to eq('oo') + expect(m[2]).to eq('aa') + end + + it "raises an error if given non-hash options" do + r = RE2::Regexp.new('f(o+)(a+)') + + expect { r.full_match('fooaa', 'not a hash') }.to raise_error(TypeError) + end + + it 
"accepts options that can be coerced to a hash", :aggregate_failures do + r = RE2::Regexp.new('f(o+)(a+)') + + m = r.full_match('fooaa', nil) + expect(m[1]).to eq('oo') + + m = r.full_match('fooaa', []) + expect(m[1]).to eq('oo') + end + + it "accepts anything that can be coerced to a string" do + r = RE2::Regexp.new('f(o+)(a+)') + + expect(r.full_match(StringLike.new('fooaa'), submatches: 0)).to eq(true) + end + end end