From 2f945fb56dd5805adc092701b68a678b2f7c7830 Mon Sep 17 00:00:00 2001
From: Paul Mucur <mudge@mudge.name>
Date: Thu, 30 Nov 2023 16:01:38 +0000
Subject: [PATCH] Expose more of RE2's matching interface

GitHub: https://github.com/mudge/re2/issues/119

Add new options to `RE2::Regexp#match` that expose the underlying
capabilities of RE2's Match function:

* anchor: specifying whether a match should be unanchored (the default),
  anchored to the start of the text or anchored to both ends
* startpos: the offset at which to start matching (defaults to the start
  of the text)
* submatches: the number of submatches to extract (defaults to the
  number of capturing groups in the pattern)

We keep compatibility with the previous API by still accepting a number
of submatches as the second argument to match.

With these new options in place, we can now offer a higher-level
`RE2::Regexp#full_match` and `RE2::Regexp#partial_match` API to match
RE2's own. Note we don't actually use the underlying `FullMatchN` or
`PartialMatchN` functions as we need to use `Match`'s behaviour of
returning the overall match first before any extracted submatches.

The plan is to then heavily promote these two methods over the
lower-level `match`.
---
 ext/re2/re2.cc          | 121 ++++++++++++++++++-------
 lib/re2.rb              |   1 +
 lib/re2/regexp.rb       |  62 +++++++++++++
 re2.gemspec             |   1 +
 spec/re2/regexp_spec.rb | 192 ++++++++++++++++++++++++++++++++++++++--
 5 files changed, 338 insertions(+), 39 deletions(-)
 create mode 100644 lib/re2/regexp.rb

diff --git a/ext/re2/re2.cc b/ext/re2/re2.cc
index c59bb540..a358b036 100644
--- a/ext/re2/re2.cc
+++ b/ext/re2/re2.cc
@@ -47,8 +47,9 @@ VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet,
 /* Symbols used in RE2 options. */
 static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
           id_max_mem, id_literal, id_never_nl, id_case_sensitive,
-          id_perl_classes, id_word_boundary, id_one_line,
-          id_unanchored, id_anchor_start, id_anchor_both, id_exception;
+          id_perl_classes, id_word_boundary, id_one_line, id_unanchored,
+          id_anchor, id_anchor_start, id_anchor_both, id_exception,
+          id_submatches, id_startpos;
 
 inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
   if (encoding == RE2::Options::EncodingUTF8) {
@@ -1339,38 +1340,37 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
  *     r = RE2::Regexp.new('woo')
  *     r.match('woo')    #=> true
  *
- * @overload match(text, 0)
- *   Returns either true or false indicating whether a
- *   successful match was made.
- *
- *   @param [String] text the text to search
- *   @return [Boolean] whether the match was successful
- *   @raise [NoMemoryError] if there was not enough memory to allocate the submatches
- *   @example
- *     r = RE2::Regexp.new('w(o)(o)')
- *     r.match('woo', 0) #=> true
- *     r.match('bob', 0) #=> false
- *
- * @overload match(text, number_of_submatches)
+ * @overload match(text, options)
  *   See +match(text)+ but with a specific number of
  *   submatches returned (padded with nils if necessary).
  *
  *   @param [String] text the text to search
- *   @param [Integer] number_of_submatches the number of submatches to return
- *   @return [RE2::MatchData] the submatches
- *   @raise [ArgumentError] if given a negative number of submatches
+ *   @param [Hash] options the options with which to perform the match
+ *   @option options [Integer] :startpos (0) offset at which to start matching
+ *   @option options [Symbol] :anchor (:unanchored) one of :unanchored, :anchor_start, :anchor_both to anchor the match
+ *   @option options [Integer] :submatches how many submatches to extract (0 is
+ *     fastest), defaults to the number of capturing groups
+ *   @return [RE2::MatchData] if extracting any submatches
+ *   @return [Boolean] if not extracting any submatches
+ *   @raise [ArgumentError] if given a negative number of submatches, negative startpos or invalid anchor
  *   @raise [NoMemoryError] if there was not enough memory to allocate the matches
+ *   @raise [TypeError] if given non-String text, non-numeric submatches or
+ *     startpos, non-symbol anchor or non-hash options
  *   @example
  *     r = RE2::Regexp.new('w(o)(o)')
- *     r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
- *     r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
+ *     r.match('woo', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
+ *     r.match('woo', submatches: 3) # => #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
+ *     r.match('woot', anchor: :anchor_both, submatches: 0)
+ *     # => false
+ *     r.match('woot', anchor: :anchor_start, submatches: 0)
+ *     # => true
  */
 static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
   re2_pattern *p;
   re2_matchdata *m;
-  VALUE text, number_of_submatches;
+  VALUE text, options;
 
-  rb_scan_args(argc, argv, "11", &text, &number_of_submatches);
+  rb_scan_args(argc, argv, "11", &text, &options);
 
   /* Ensure text is a string. */
   StringValue(text);
@@ -1378,12 +1378,64 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
   TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
 
   int n;
+  int startpos = 0;
+  RE2::Anchor anchor = RE2::UNANCHORED;
+
+  if (RTEST(options)) {
+    if (FIXNUM_P(options)) {
+      n = NUM2INT(options);
 
-  if (RTEST(number_of_submatches)) {
-    n = NUM2INT(number_of_submatches);
+      if (n < 0) {
+        rb_raise(rb_eArgError, "number of matches should be >= 0");
+      }
+    } else {
+      if (TYPE(options) != T_HASH) {
+        options = rb_Hash(options);
+      }
 
-    if (n < 0) {
-      rb_raise(rb_eArgError, "number of matches should be >= 0");
+      VALUE anchor_option = rb_hash_aref(options, ID2SYM(id_anchor));
+      if (!NIL_P(anchor_option)) {
+        Check_Type(anchor_option, T_SYMBOL);
+
+        ID id_anchor_option = SYM2ID(anchor_option);
+        if (id_anchor_option == id_unanchored) {
+          anchor = RE2::UNANCHORED;
+        } else if (id_anchor_option == id_anchor_start) {
+          anchor = RE2::ANCHOR_START;
+        } else if (id_anchor_option == id_anchor_both) {
+          anchor = RE2::ANCHOR_BOTH;
+        } else {
+          rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
+        }
+      }
+
+      VALUE submatches_option = rb_hash_aref(options, ID2SYM(id_submatches));
+      if (!NIL_P(submatches_option)) {
+        Check_Type(submatches_option, T_FIXNUM);
+
+        n = NUM2INT(submatches_option);
+
+        if (n < 0) {
+          rb_raise(rb_eArgError, "number of matches should be >= 0");
+        }
+      } else {
+        if (!p->pattern->ok()) {
+          return Qnil;
+        }
+
+        n = p->pattern->NumberOfCapturingGroups();
+      }
+
+      VALUE startpos_option = rb_hash_aref(options, ID2SYM(id_startpos));
+      if (!NIL_P(startpos_option)) {
+        Check_Type(startpos_option, T_FIXNUM);
+
+        startpos = NUM2INT(startpos_option);
+
+        if (startpos < 0) {
+          rb_raise(rb_eArgError, "startpos should be >= 0");
+        }
+      }
     }
   } else {
     if (!p->pattern->ok()) {
@@ -1395,10 +1447,10 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
 
   if (n == 0) {
 #ifdef HAVE_ENDPOS_ARGUMENT
-    bool matched = p->pattern->Match(RSTRING_PTR(text), 0,
-        RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
+    bool matched = p->pattern->Match(RSTRING_PTR(text), startpos,
+        RSTRING_LEN(text), anchor, 0, 0);
 #else
-    bool matched = p->pattern->Match(RSTRING_PTR(text), 0, RE2::UNANCHORED,
+    bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, anchor,
         0, 0);
 #endif
     return BOOL2RUBY(matched);
@@ -1423,11 +1475,11 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
     m->number_of_matches = n;
 
 #ifdef HAVE_ENDPOS_ARGUMENT
-    bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
-        RSTRING_LEN(m->text), RE2::UNANCHORED, m->matches, n);
+    bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
+        RSTRING_LEN(m->text), anchor, m->matches, n);
 #else
-    bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
-        RE2::UNANCHORED, m->matches, n);
+    bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
+        anchor, m->matches, n);
 #endif
     if (matched) {
       return matchdata;
@@ -2032,7 +2084,10 @@ extern "C" void Init_re2(void) {
   id_word_boundary = rb_intern("word_boundary");
   id_one_line = rb_intern("one_line");
   id_unanchored = rb_intern("unanchored");
+  id_anchor = rb_intern("anchor");
   id_anchor_start = rb_intern("anchor_start");
   id_anchor_both = rb_intern("anchor_both");
   id_exception = rb_intern("exception");
+  id_submatches = rb_intern("submatches");
+  id_startpos = rb_intern("startpos");
 }
diff --git a/lib/re2.rb b/lib/re2.rb
index 76d8fcec..3f2751e6 100644
--- a/lib/re2.rb
+++ b/lib/re2.rb
@@ -10,5 +10,6 @@
   require 're2.so'
 end
 
+require "re2/regexp"
 require "re2/scanner"
 require "re2/version"
diff --git a/lib/re2/regexp.rb b/lib/re2/regexp.rb
new file mode 100644
index 00000000..94790c0d
--- /dev/null
+++ b/lib/re2/regexp.rb
@@ -0,0 +1,62 @@
+module RE2
+  class Regexp
+    # Match the pattern against any substring of the given +text+ and return
+    # either a boolean (if no submatches are required) or a {RE2::MatchData}
+    # instance with the specified number of submatches (defaults to the total
+    # number of capturing groups). The match is always unanchored: any
+    # +:anchor+ given in +options+ is replaced with +:unanchored+.
+    #
+    # The number of submatches affects performance: one submatch is much faster
+    # than requesting more than one and requesting zero submatches is faster still.
+    #
+    # @param [String] text the text to search
+    # @param [Hash] options the options with which to perform the match
+    # @option options [Integer] :startpos (0) offset at which to start matching
+    # @option options [Integer] :submatches how many submatches to extract (0
+    #   is fastest), defaults to the total number of capturing groups
+    # @return [RE2::MatchData] if extracting any submatches
+    # @return [Boolean] if not extracting any submatches
+    # @raise [ArgumentError] if given a negative number of submatches or startpos
+    # @raise [NoMemoryError] if there was not enough memory to allocate matches
+    # @raise [TypeError] if given non-numeric submatches/startpos or non-hash options
+    # @example
+    #   r = RE2::Regexp.new('w(o)(o)')
+    #   r.partial_match('woot')
+    #   # => #<RE2::MatchData "woo" 1:"o" 2:"o">
+    #   r.partial_match('woot', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
+    #   r.partial_match('woot', submatches: 0) # => true
+    def partial_match(text, options = {})
+      match(text, Hash(options).merge(anchor: :unanchored))
+    end
+
+    # Match the pattern against the given +text+ exactly and return either a
+    # boolean (if no submatches are required) or a {RE2::MatchData} instance
+    # with the specified number of submatches (defaults to the total number of
+    # capturing groups). The match is always anchored at both ends: any
+    # +:anchor+ given in +options+ is replaced with +:anchor_both+.
+    #
+    # The number of submatches affects performance: one submatch is much faster
+    # than requesting more than one and requesting zero submatches is faster still.
+    #
+    # @param [String] text the text to search
+    # @param [Hash] options the options with which to perform the match
+    # @option options [Integer] :startpos (0) offset at which to start matching
+    # @option options [Integer] :submatches how many submatches to extract (0
+    #   is fastest), defaults to the total number of capturing groups
+    # @return [RE2::MatchData] if extracting any submatches
+    # @return [Boolean] if not extracting any submatches
+    # @raise [ArgumentError] if given a negative number of submatches or startpos
+    # @raise [NoMemoryError] if there was not enough memory to allocate matches
+    # @raise [TypeError] if given non-numeric submatches/startpos or non-hash options
+    # @example
+    #   r = RE2::Regexp.new('w(o)(o)')
+    #   r.full_match('woo')
+    #   # => #<RE2::MatchData "woo" 1:"o" 2:"o">
+    #   r.full_match('woo', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
+    #   r.full_match('woo', submatches: 0) # => true
+    #   r.full_match('woot') # => nil
+    def full_match(text, options = {})
+      match(text, Hash(options).merge(anchor: :anchor_both))
+    end
+  end
+end
diff --git a/re2.gemspec b/re2.gemspec
index 8fa405da..08a7db05 100644
--- a/re2.gemspec
+++ b/re2.gemspec
@@ -17,6 +17,7 @@ Gem::Specification.new do |s|
     "ext/re2/recipes.rb",
     "Gemfile",
     "lib/re2.rb",
+    "lib/re2/regexp.rb",
     "lib/re2/scanner.rb",
     "lib/re2/string.rb",
     "lib/re2/version.rb",
diff --git a/spec/re2/regexp_spec.rb b/spec/re2/regexp_spec.rb
index 8f226f27..70c16e97 100644
--- a/spec/re2/regexp_spec.rb
+++ b/spec/re2/regexp_spec.rb
@@ -313,21 +313,100 @@
       expect { re.match(nil) }.to raise_error(TypeError)
     end
 
-    it "raises an exception when given an inappropriate number of matches" do
-      expect { re.match("My name is Robert Paulson", {}) }.to raise_error(TypeError)
+    it "raises an exception when given invalid options" do
+      expect { re.match("My name is Robert Paulson", "foo") }.to raise_error(TypeError)
     end
 
-    it "raises an exception when given a negative number of matches" do
-      expect { re.match("My name is Robert Paulson", -1) }.to raise_error(ArgumentError, "number of matches should be >= 0")
+    it "accepts anything that can be coerced to a hash as options", :aggregate_failures do
+      m = re.match("My name is Robert Paulson", nil)
+      expect(m[1]).to eq("Robert")
+
+      m = re.match("My name is Robert Paulson", [])
+      expect(m[1]).to eq("Robert")
     end
 
     it "returns nil with an invalid pattern" do
       re = RE2::Regexp.new('???', :log_errors => false)
+
       expect(re.match('My name is Robert Paulson')).to be_nil
     end
 
+    it "returns nil with an invalid pattern and options" do
+      re = RE2::Regexp.new('???', :log_errors => false)
+
+      expect(re.match('My name is Robert Paulson', submatches: 1)).to be_nil
+    end
+
+    it "is unanchored by default", :aggregate_failures do
+      expect(re.match("My name is Robert Paulson", submatches: 0)).to eq(true)
+      expect(re.match("My name is Robert Paulson, he said", submatches: 0)).to eq(true)
+      expect(re.match("He said, My name is Robert Paulson", submatches: 0)).to eq(true)
+    end
+
+    it "is unanchored if given a nil anchor", :aggregate_failures do
+      expect(re.match("My name is Robert Paulson", anchor: nil, submatches: 0)).to eq(true)
+      expect(re.match("My name is Robert Paulson, he said", anchor: nil, submatches: 0)).to eq(true)
+      expect(re.match("He said, My name is Robert Paulson", anchor: nil, submatches: 0)).to eq(true)
+    end
+
+    it "can be explicitly unanchored", :aggregate_failures do
+      expect(re.match("My name is Robert Paulson", anchor: :unanchored, submatches: 0)).to eq(true)
+      expect(re.match("My name is Robert Paulson, he said", anchor: :unanchored, submatches: 0)).to eq(true)
+      expect(re.match("He said, My name is Robert Paulson", anchor: :unanchored, submatches: 0)).to eq(true)
+    end
+
+    it "can anchor the match at both ends", :aggregate_failures do
+      expect(re.match("My name is Robert Paulson", anchor: :anchor_both, submatches: 0)).to eq(true)
+      expect(re.match("My name is Robert Paulson, he said", anchor: :anchor_both, submatches: 0)).to eq(false)
+      expect(re.match("He said, My name is Robert Paulson", anchor: :anchor_both, submatches: 0)).to eq(false)
+    end
+
+    it "can anchor the match at the start", :aggregate_failures do
+      expect(re.match("My name is Robert Paulson", anchor: :anchor_start, submatches: 0)).to eq(true)
+      expect(re.match("My name is Robert Paulson, he said", anchor: :anchor_start, submatches: 0)).to eq(true)
+      expect(re.match("He said, My name is Robert Paulson", anchor: :anchor_start, submatches: 0)).to eq(false)
+    end
+
+    it "raises an exception when given an invalid anchor" do
+      expect { re.match("My name is Robert Paulson", anchor: :invalid) }.to raise_error(ArgumentError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both")
+    end
+
+    it "raises an exception when given a non-symbol anchor" do
+      expect { re.match("My name is Robert Paulson", anchor: 0) }.to raise_error(TypeError)
+    end
+
+    it "can be given an offset at which to start matching", :aggregate_failures do
+      m = re.match("My name is Alice Bloggs My name is Robert Paulson", startpos: 24)
+
+      expect(m[1]).to eq("Robert")
+      expect(m[2]).to eq("Paulson")
+    end
+
+    it "does not match if given an offset past the end of the text", :aggregate_failures do
+      expect(re.match("My name is Alice Bloggs", startpos: 99)).to be_nil
+    end
+
+    it "raises an exception when given a negative start position" do
+      expect { re.match("My name is Robert Paulson", startpos: -1) }.to raise_error(ArgumentError, "startpos should be >= 0")
+    end
+
+    it "raises an exception when given a negative number of matches" do
+      expect { re.match("My name is Robert Paulson", submatches: -1) }.to raise_error(ArgumentError, "number of matches should be >= 0")
+    end
+
+    it "raises an exception when given a non-numeric number of matches" do
+      expect { re.match("My name is Robert Paulson", submatches: "foo") }.to raise_error(TypeError)
+    end
+
+    it "defaults to extracting all submatches when given nil", :aggregate_failures do
+      m = re.match("My name is Robert Paulson", submatches: nil)
+
+      expect(m[1]).to eq("Robert")
+      expect(m[2]).to eq("Paulson")
+    end
+
     describe "with a specific number of matches under the total in the pattern" do
-      subject { re.match("My name is Robert Paulson", 1) }
+      subject { re.match("My name is Robert Paulson", submatches: 1) }
 
       it "returns a match data object" do
         expect(subject).to be_a(RE2::MatchData)
@@ -347,7 +426,7 @@
     end
 
     describe "with a number of matches over the total in the pattern" do
-      subject { re.match("My name is Robert Paulson", 5) }
+      subject { re.match("My name is Robert Paulson", submatches: 5) }
 
       it "returns a match data object" do
         expect(subject).to be_a(RE2::MatchData)
@@ -369,6 +448,20 @@
         expect(subject[6]).to be_nil
       end
     end
+
+    it "accepts the number of submatches as a second argument for compatibility", :aggregate_failures do
+      expect(re.match("My name is Robert Paulson", 0)).to eq(true)
+
+      m = re.match("My name is Robert Paulson", 1)
+      expect(m[1]).to eq("Robert")
+      expect(m[2]).to be_nil
+
+      m = re.match("My name is Robert Paulson", 2)
+      expect(m[1]).to eq("Robert")
+      expect(m[2]).to eq("Paulson")
+
+      expect { re.match("My name is Robert Paulson", -1) }.to raise_error(ArgumentError, "number of matches should be >= 0")
+    end
   end
 
   describe "#match?" do
@@ -475,4 +568,91 @@
       expect(scanner).to be_a(RE2::Scanner)
     end
   end
+
+  describe "#partial_match" do
+    it "matches the pattern anywhere within the given text" do
+      r = RE2::Regexp.new('f(o+)')
+
+      expect(r.partial_match('foo bar', submatches: 0)).to eq(true)
+    end
+
+    it "can set the number of submatches to extract", :aggregate_failures do
+      r = RE2::Regexp.new('f(o+)(a+)')
+      m = r.partial_match('fooaa bar', submatches: 1)
+
+      expect(m[1]).to eq('oo')
+      expect(m[2]).to be_nil
+
+      m = r.partial_match('fooaa bar', submatches: 2)
+
+      expect(m[1]).to eq('oo')
+      expect(m[2]).to eq('aa')
+    end
+
+    it "raises an error if given non-hash options" do
+      r = RE2::Regexp.new('f(o+)(a+)')
+
+      expect { r.partial_match('fooaa bar', 'not a hash') }.to raise_error(TypeError)
+    end
+
+    it "accepts options that can be coerced to a hash", :aggregate_failures do
+      r = RE2::Regexp.new('f(o+)(a+)')
+
+      m = r.partial_match('fooaa bar', nil)
+      expect(m[1]).to eq('oo')
+
+      m = r.partial_match('fooaa bar', [])
+      expect(m[1]).to eq('oo')
+    end
+
+    it "accepts anything that can be coerced to a string" do
+      r = RE2::Regexp.new('f(o+)(a+)')
+
+      expect(r.partial_match(StringLike.new('fooaa bar'), submatches: 0)).to eq(true)
+    end
+  end
+
+  describe "#full_match" do
+    it "only matches the pattern if all of the given text matches", :aggregate_failures do
+      r = RE2::Regexp.new('f(o+)')
+
+      expect(r.full_match('foo', submatches: 0)).to eq(true)
+      expect(r.full_match('foo bar', submatches: 0)).to eq(false)
+    end
+
+    it "can set the number of submatches to extract", :aggregate_failures do
+      r = RE2::Regexp.new('f(o+)(a+)')
+      m = r.full_match('fooaa', submatches: 1)
+
+      expect(m[1]).to eq('oo')
+      expect(m[2]).to be_nil
+
+      m = r.full_match('fooaa', submatches: 2)
+
+      expect(m[1]).to eq('oo')
+      expect(m[2]).to eq('aa')
+    end
+
+    it "raises an error if given non-hash options" do
+      r = RE2::Regexp.new('f(o+)(a+)')
+
+      expect { r.full_match('fooaa', 'not a hash') }.to raise_error(TypeError)
+    end
+
+    it "accepts options that can be coerced to a hash", :aggregate_failures do
+      r = RE2::Regexp.new('f(o+)(a+)')
+
+      m = r.full_match('fooaa', nil)
+      expect(m[1]).to eq('oo')
+
+      m = r.full_match('fooaa', [])
+      expect(m[1]).to eq('oo')
+    end
+
+    it "accepts anything that can be coerced to a string" do
+      r = RE2::Regexp.new('f(o+)(a+)')
+
+      expect(r.full_match(StringLike.new('fooaa'), submatches: 0)).to eq(true)
+    end
+  end
 end