Improve docs to highlight the effect of submatches

As a precursor to exposing more of RE2's matching API, rework the docs to better emphasise the impact of the number of submatches when matching.
mudge · Nov 28, 2023 · 8d1b21d · 8d1b21d
1 parent c7de1ba
commit 8d1b21d
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 60 deletions.
diff --git a/README.md b/README.md
@@ -6,8 +6,8 @@ backtracking regular expression engines like those used in PCRE, Perl, and
 Python".
 
 **Current version:** 2.4.3  
-**Supported Ruby versions:** 2.6, 2.7, 3.0, 3.1, 3.2  
 **Bundled RE2 version:** libre2.11 (2023-11-01)  
+**Supported Ruby versions:** 2.6, 2.7, 3.0, 3.1, 3.2  
 **Supported RE2 versions:** libre2.0 (< 2020-03-02), libre2.1 (2020-03-02), libre2.6 (2020-03-03), libre2.7 (2020-05-01), libre2.8 (2020-07-06), libre2.9 (2020-11-01), libre2.10 (2022-12-01), libre2.11 (2023-07-01)
 
 Installation
@@ -68,9 +68,10 @@ Documentation
 Full documentation automatically generated from the latest version is
 available at <http://mudge.name/re2/>.
 
-Note that RE2's regular expression syntax differs from PCRE and Ruby's
-built-in [`Regexp`][Regexp] library, see the [official syntax page][] for more
-details.
+> [!IMPORTANT]
+> Note that RE2's regular expression syntax differs from PCRE and Ruby's
+> built-in [`Regexp`][Regexp] library; see the [official syntax page][] for
+> more details.
 
 Usage
 -----
@@ -80,27 +81,19 @@ library (with [`Regexp`](http://mudge.name/re2/RE2/Regexp.html) and
 [`MatchData`](http://mudge.name/re2/RE2/MatchData.html)), its API is slightly
 different:
 
-```console
-$ irb -rubygems
-> require 're2'
-> r = RE2::Regexp.new('w(\d)(\d+)')
-=> #<RE2::Regexp /w(\d)(\d+)/>
-> m = r.match("w1234")
-=> #<RE2::MatchData "w1234" 1:"1" 2:"234">
-> m[1]
-=> "1"
-> m.string
-=> "w1234"
-> m.begin(1)
-=> 1
-> m.end(1)
-=> 2
-> r =~ "w1234"
-=> true
-> r !~ "bob"
-=> true
-> r.match("bob")
-=> nil
+```ruby
+require "re2"
+
+r = RE2::Regexp.new('w(\d)(\d+)') # => #<RE2::Regexp /w(\d)(\d+)/>
+m = r.match("w1234")              # => #<RE2::MatchData "w1234" 1:"1" 2:"234">
+m[1]                              # => "1"
+
+# Improve performance by requesting fewer submatches
+m = r.match("w1234", 1)           # => #<RE2::MatchData "w1234" 1:"1">
+
+# Or no submatches at all
+r.match("w1234", 0)               # => true
+r =~ "w1234"                      # => true
 ```
 
 As
@@ -109,30 +102,25 @@ As
 defined against `Kernel` so you can use a shorter version to create regular
 expressions:
 
-```console
-> RE2('(\d+)')
-=> #<RE2::Regexp /(\d+)/>
+```ruby
+RE2('(\d+)') # => #<RE2::Regexp /(\d+)/>
 ```
 
 Note the use of *single quotes*: double quotes will interpret `\d` as `d`, as
 in the following example:
 
-```console
-> RE2("(\d+)")
-=> #<RE2::Regexp /(d+)/>
+```ruby
+RE2("(\d+)") # => #<RE2::Regexp /(d+)/>
 ```
 
 As of 0.3.0, you can use named groups:
 
-```console
-> r = RE2::Regexp.new('(?P<name>\w+) (?P<age>\d+)')
-=> #<RE2::Regexp /(?P<name>\w+) (?P<age>\d+)/>
-> m = r.match("Bob 40")
-=> #<RE2::MatchData "Bob 40" 1:"Bob" 2:"40">
-> m[:name]
-=> "Bob"
-> m["age"]
-=> "40"
+```ruby
+r = RE2::Regexp.new('(?P<name>\w+) (?P<age>\d+)')
+# => #<RE2::Regexp /(?P<name>\w+) (?P<age>\d+)/>
+m = r.match("Bob 40") # => #<RE2::MatchData "Bob 40" 1:"Bob" 2:"40">
+m[:name]              # => "Bob"
+m["age"]              # => "40"
 ```
 
 As of 0.6.0, you can use `RE2::Regexp#scan` to incrementally scan text for
@@ -197,9 +185,10 @@ end
 Encoding
 --------
 
-Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
-returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
-`RE2::Regexp` is set to false (any other encoding's behaviour is undefined).
+> [!IMPORTANT]
+> Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
+> returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
+> `RE2::Regexp` is set to false (any other encoding's behaviour is undefined).
 
 For backward compatibility: re2 won't automatically convert string inputs to
 the right encoding so this is the responsibility of the caller, e.g.
@@ -220,8 +209,8 @@ Features
   `RE2::Regexp.compile(re)` or `RE2(re)` (including specifying options, e.g.
   `RE2::Regexp.new("pattern", :case_sensitive => false)`)
 
-* Extracting matches with `re2.match(text)` (and an exact number of matches
-  with `re2.match(text, number_of_matches)` such as `re2.match("123-234", 2)`)
+* Extracting matches with `re2.match(text)` (and an exact number of submatches
+  with `re2.match(text, number_of_submatches)` such as `re2.match("123-234", 2)`)
 
 * Extracting matches by name (both with strings and symbols)
 

diff --git a/ext/re2/re2.cc b/ext/re2/re2.cc
@@ -1309,15 +1309,20 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
 }
 
 /*
- * Match the pattern against the given +text+ and return either
- * a boolean (if no submatches are required) or a {RE2::MatchData}
- * instance.
+ * Match the pattern against the given +text+ and return either a boolean (if
+ * no submatches are required) or a {RE2::MatchData} instance with the
+ * specified number of submatches (defaults to the total number of capturing
+ * groups).
+ *
+ * The number of submatches has a significant impact on performance: requesting
+ * one submatch is much faster than requesting more than one, and requesting
+ * zero submatches is faster still.
  *
  * @return [Boolean, RE2::MatchData]
  *
  * @overload match(text)
  *   Returns an {RE2::MatchData} containing the matching pattern and all
- *   subpatterns resulting from looking for the regexp in +text+ if the pattern
+ *   submatches resulting from looking for the regexp in +text+ if the pattern
  *   contains capturing groups.
  *
  *   Returns either true or false indicating whether a successful match was
@@ -1326,7 +1331,7 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
  *   @param [String] text the text to search
  *   @return [RE2::MatchData] if the pattern contains capturing groups
  *   @return [Boolean] if the pattern does not contain capturing groups
- *   @raise [NoMemoryError] if there was not enough memory to allocate the matches
+ *   @raise [NoMemoryError] if there was not enough memory to allocate the submatches
  *   @example Matching with capturing groups
  *     r = RE2::Regexp.new('w(o)(o)')
  *     r.match('woo')    #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
@@ -1340,20 +1345,20 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
  *
  *   @param [String] text the text to search
  *   @return [Boolean] whether the match was successful
- *   @raise [NoMemoryError] if there was not enough memory to allocate the matches
+ *   @raise [NoMemoryError] if there was not enough memory to allocate the submatches
  *   @example
  *     r = RE2::Regexp.new('w(o)(o)')
  *     r.match('woo', 0) #=> true
  *     r.match('bob', 0) #=> false
  *
- * @overload match(text, number_of_matches)
+ * @overload match(text, number_of_submatches)
  *   See +match(text)+ but with a specific number of
- *   matches returned (padded with nils if necessary).
+ *   submatches returned (padded with nils if necessary).
  *
  *   @param [String] text the text to search
- *   @param [Integer] number_of_matches the number of matches to return
- *   @return [RE2::MatchData] the matches
- *   @raise [ArgumentError] if given a negative number of matches
+ *   @param [Integer] number_of_submatches the number of submatches to return
+ *   @return [RE2::MatchData] the submatches
+ *   @raise [ArgumentError] if given a negative number of submatches
  *   @raise [NoMemoryError] if there was not enough memory to allocate the submatches
  *   @example
  *     r = RE2::Regexp.new('w(o)(o)')
@@ -1363,9 +1368,9 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
 static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
   re2_pattern *p;
   re2_matchdata *m;
-  VALUE text, number_of_matches;
+  VALUE text, number_of_submatches;
 
-  rb_scan_args(argc, argv, "11", &text, &number_of_matches);
+  rb_scan_args(argc, argv, "11", &text, &number_of_submatches);
 
   /* Ensure text is a string. */
   StringValue(text);
@@ -1374,8 +1379,8 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
 
   int n;
 
-  if (RTEST(number_of_matches)) {
-    n = NUM2INT(number_of_matches);
+  if (RTEST(number_of_submatches)) {
+    n = NUM2INT(number_of_submatches);
 
     if (n < 0) {
       rb_raise(rb_eArgError, "number of matches should be >= 0");