Expose more of RE2's matching interface

GitHub: #119

Add new options to `RE2::Regexp#match` that expose the underlying
capabilities of RE2's Match function:

* anchor: specifying whether a match should be unanchored (the default),
  anchored to the start of the text or anchored to both ends
* startpos: the offset at which to start matching (defaults to the start
  of the text)
* submatches: the number of submatches to extract (defaults to the
  number of capturing groups in the pattern)

We keep compatibility with the previous API by still accepting a number
of submatches as the second argument to match.

With these new options in place, we can now offer a higher-level
`RE2::Regexp#full_match` and `RE2::Regexp#partial_match` API to match
RE2's own. Note we don't actually use the underlying `FullMatchN` or
`PartialMatchN` functions as we need to use `Match`'s behaviour of
returning the overall match first before any extracted submatches.

The plan is to then heavily promote these two methods over the
lower-level `match`.
mudge · Nov 30, 2023 · f4536bf · f4536bf
1 parent 8d1b21d
commit f4536bf
Show file tree

Hide file tree

Showing 4 changed files with 337 additions and 39 deletions.
diff --git a/ext/re2/re2.cc b/ext/re2/re2.cc
@@ -47,8 +47,9 @@ VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet,
 /* Symbols used in RE2 options. */
 static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
           id_max_mem, id_literal, id_never_nl, id_case_sensitive,
-          id_perl_classes, id_word_boundary, id_one_line,
-          id_unanchored, id_anchor_start, id_anchor_both, id_exception;
+          id_perl_classes, id_word_boundary, id_one_line, id_unanchored,
+          id_anchor, id_anchor_start, id_anchor_both, id_exception,
+          id_submatches, id_startpos;
 
 inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
   if (encoding == RE2::Options::EncodingUTF8) {
@@ -1339,51 +1340,102 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
  *     r = RE2::Regexp.new('woo')
  *     r.match('woo')    #=> true
  *
- * @overload match(text, 0)
- *   Returns either true or false indicating whether a
- *   successful match was made.
- *
- *   @param [String] text the text to search
- *   @return [Boolean] whether the match was successful
- *   @raise [NoMemoryError] if there was not enough memory to allocate the submatches
- *   @example
- *     r = RE2::Regexp.new('w(o)(o)')
- *     r.match('woo', 0) #=> true
- *     r.match('bob', 0) #=> false
- *
- * @overload match(text, number_of_submatches)
+ * @overload match(text, options)
  *   See +match(text)+ but with a specific number of
  *   submatches returned (padded with nils if necessary).
  *
  *   @param [String] text the text to search
- *   @param [Integer] number_of_submatches the number of submatches to return
- *   @return [RE2::MatchData] the submatches
- *   @raise [ArgumentError] if given a negative number of submatches
+ *   @param [Hash] options the options with which to perform the match
+ *   @option options [Integer] :startpos (0) offset at which to start matching
+ *   @option options [Symbol] :anchor (:unanchored) one of :unanchored, :anchor_start, :anchor_both to anchor the match
+ *   @option options [Integer] :submatches how many submatches to extract (0 is
+ *     fastest), defaults to the number of capturing groups
+ *   @return [RE2::MatchData] if extracting any submatches
+ *   @return [Boolean] if not extracting any submatches
+ *   @raise [ArgumentError] if given a negative number of submatches or invalid anchor
  *   @raise [NoMemoryError] if there was not enough memory to allocate the matches
+ *   @raise [TypeError] if given non-String text, non-numeric number of
+ *     submatches, non-symbol anchor or non-hash options
  *   @example
  *     r = RE2::Regexp.new('w(o)(o)')
- *     r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
- *     r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
+ *     r.match('woo', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
+ *     r.match('woo', submatches: 3) # => #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
+ *     r.match('woot', anchor: :anchor_both, submatches: 0)
+ *     # => false
+ *     r.match('woot', anchor: :anchor_start, submatches: 0)
+ *     # => true
  */
 static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
   re2_pattern *p;
   re2_matchdata *m;
-  VALUE text, number_of_submatches;
+  VALUE text, options;
 
-  rb_scan_args(argc, argv, "11", &text, &number_of_submatches);
+  rb_scan_args(argc, argv, "11", &text, &options);
 
   /* Ensure text is a string. */
   StringValue(text);
 
   TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
 
   int n;
+  int startpos = 0;
+  RE2::Anchor anchor = RE2::UNANCHORED;
+
+  if (RTEST(options)) {
+    if (FIXNUM_P(options)) {
+      n = NUM2INT(options);
 
-  if (RTEST(number_of_submatches)) {
-    n = NUM2INT(number_of_submatches);
+      if (n < 0) {
+        rb_raise(rb_eArgError, "number of matches should be >= 0");
+      }
+    } else {
+      if (TYPE(options) != T_HASH) {
+        options = rb_Hash(options);
+      }
 
-    if (n < 0) {
-      rb_raise(rb_eArgError, "number of matches should be >= 0");
+      VALUE anchor_option = rb_hash_aref(options, ID2SYM(id_anchor));
+      if (!NIL_P(anchor_option)) {
+        Check_Type(anchor_option, T_SYMBOL);
+
+        ID id_anchor_option = SYM2ID(anchor_option);
+        if (id_anchor_option == id_unanchored) {
+          anchor = RE2::UNANCHORED;
+        } else if (id_anchor_option == id_anchor_start) {
+          anchor = RE2::ANCHOR_START;
+        } else if (id_anchor_option == id_anchor_both) {
+          anchor = RE2::ANCHOR_BOTH;
+        } else {
+          rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
+        }
+      }
+
+      VALUE submatches_option = rb_hash_aref(options, ID2SYM(id_submatches));
+      if (!NIL_P(submatches_option)) {
+        Check_Type(submatches_option, T_FIXNUM);
+
+        n = NUM2INT(submatches_option);
+
+        if (n < 0) {
+          rb_raise(rb_eArgError, "number of matches should be >= 0");
+        }
+      } else {
+        if (!p->pattern->ok()) {
+          return Qnil;
+        }
+
+        n = p->pattern->NumberOfCapturingGroups();
+      }
+
+      VALUE startpos_option = rb_hash_aref(options, ID2SYM(id_startpos));
+      if (!NIL_P(startpos_option)) {
+        Check_Type(startpos_option, T_FIXNUM);
+
+        startpos = NUM2INT(startpos_option);
+
+        if (startpos < 0) {
+          rb_raise(rb_eArgError, "startpos should be >= 0");
+        }
+      }
     }
   } else {
     if (!p->pattern->ok()) {
@@ -1395,10 +1447,10 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
 
   if (n == 0) {
 #ifdef HAVE_ENDPOS_ARGUMENT
-    bool matched = p->pattern->Match(RSTRING_PTR(text), 0,
-        RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
+    bool matched = p->pattern->Match(RSTRING_PTR(text), startpos,
+        RSTRING_LEN(text), anchor, 0, 0);
 #else
-    bool matched = p->pattern->Match(RSTRING_PTR(text), 0, RE2::UNANCHORED,
+    bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, anchor,
         0, 0);
 #endif
     return BOOL2RUBY(matched);
@@ -1423,11 +1475,11 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
     m->number_of_matches = n;
 
 #ifdef HAVE_ENDPOS_ARGUMENT
-    bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
-        RSTRING_LEN(m->text), RE2::UNANCHORED, m->matches, n);
+    bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
+        RSTRING_LEN(m->text), anchor, m->matches, n);
 #else
-    bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
-        RE2::UNANCHORED, m->matches, n);
+    bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
+        anchor, m->matches, n);
 #endif
     if (matched) {
       return matchdata;
@@ -2032,7 +2084,10 @@ extern "C" void Init_re2(void) {
   id_word_boundary = rb_intern("word_boundary");
   id_one_line = rb_intern("one_line");
   id_unanchored = rb_intern("unanchored");
+  id_anchor = rb_intern("anchor");
   id_anchor_start = rb_intern("anchor_start");
   id_anchor_both = rb_intern("anchor_both");
   id_exception = rb_intern("exception");
+  id_submatches = rb_intern("submatches");
+  id_startpos = rb_intern("startpos");
 }
diff --git a/lib/re2.rb b/lib/re2.rb
@@ -10,5 +10,6 @@
   require 're2.so'
 end
 
+require "re2/regexp"
 require "re2/scanner"
 require "re2/version"
diff --git a/lib/re2/regexp.rb b/lib/re2/regexp.rb
@@ -0,0 +1,62 @@
+module RE2
+  class Regexp
+    # Match the pattern against any substring of the given +text+ and return
+    # either a boolean (if no submatches are required) or a {RE2::MatchData}
+    # instance with the specified number of submatches (defaults to the total
+    # number of capturing groups).
+    #
+    # The number of submatches has a significant impact on performance: requesting
+    # one submatch is much faster than requesting more than one and requesting
+    # zero submatches is faster still.
+    #
+    # @param [String] text the text to search
+    # @param [Hash] options the options with which to perform the match
+    # @option options [Integer] :submatches how many submatches to extract (0
+    #   is fastest), defaults to the total number of capturing groups
+    # @return [RE2::MatchData] if extracting any submatches
+    # @return [Boolean] if not extracting any submatches
+    # @raise [ArgumentError] if given a negative number of submatches
+    # @raise [NoMemoryError] if there was not enough memory to allocate the
+    #   matches
+    # @raise [TypeError] if given non-numeric submatches or non-hash options
+    # @example
+    #   r = RE2::Regexp.new('w(o)(o)')
+    #   r.partial_match('woot')
+    #   # => #<RE2::MatchData "woo" 1:"o" 2:"o">
+    #   r.partial_match('woot', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
+    #   r.partial_match('woot', submatches: 0) # => true
+    def partial_match(text, options = {})
+      match(text, Hash(options).merge(anchor: :unanchored))
+    end
+
+    # Match the pattern against the given +text+ exactly and return either a
+    # boolean (if no submatches are required) or a {RE2::MatchData} instance
+    # with the specified number of submatches (defaults to the total number of
+    # capturing groups).
+    #
+    # The number of submatches has a significant impact on performance: requesting
+    # one submatch is much faster than requesting more than one and requesting
+    # zero submatches is faster still.
+    #
+    # @param [String] text the text to search
+    # @param [Hash] options the options with which to perform the match
+    # @option options [Integer] :submatches how many submatches to extract (0
+    #   is fastest), defaults to the total number of capturing groups
+    # @return [RE2::MatchData] if extracting any submatches
+    # @return [Boolean] if not extracting any submatches
+    # @raise [ArgumentError] if given a negative number of submatches
+    # @raise [NoMemoryError] if there was not enough memory to allocate the
+    #   matches
+    # @raise [TypeError] if given non-numeric submatches or non-hash options
+    # @example
+    #   r = RE2::Regexp.new('w(o)(o)')
+    #   r.full_match('woo')
+    #   # => #<RE2::MatchData "woo" 1:"o" 2:"o">
+    #   r.full_match('woo', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
+    #   r.full_match('woo', submatches: 0) # => true
+    #   r.full_match('woot') # => nil
+    def full_match(text, options = {})
+      match(text, Hash(options).merge(anchor: :anchor_both))
+    end
+  end
+end