Skip to content

Commit

Permalink
Expose more of RE2's matching interface
Browse files Browse the repository at this point in the history
GitHub: #119

Add new options to `RE2::Regexp#match` that expose the underlying
capabilities of RE2's Match function:

* anchor: specifying whether a match should be unanchored (the default),
  anchored to the start of the text or anchored to both ends
* startpos: the offset at which to start matching (defaults to the start
  of the text)
* submatches: the number of submatches to extract (defaults to the
  number of capturing groups in the pattern)

We keep compatibility with the previous API by still accepting a number
of submatches as the second argument to match.

With these new options in place, we can now offer a higher-level
`RE2::Regexp#full_match` and `RE2::Regexp#partial_match` API to match
RE2's own. Note that we don't actually call the underlying `FullMatchN` or
`PartialMatchN` functions, because we rely on `Match`'s behaviour of
returning the overall match first, followed by any extracted submatches.

The plan is to then heavily promote these two methods over the
lower-level `match`.
  • Loading branch information
mudge committed Dec 1, 2023
1 parent 7202533 commit b909f8c
Show file tree
Hide file tree
Showing 5 changed files with 338 additions and 39 deletions.
121 changes: 88 additions & 33 deletions ext/re2/re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet,
/* Symbols used in RE2 options. */
static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
id_max_mem, id_literal, id_never_nl, id_case_sensitive,
id_perl_classes, id_word_boundary, id_one_line,
id_unanchored, id_anchor_start, id_anchor_both, id_exception;
id_perl_classes, id_word_boundary, id_one_line, id_unanchored,
id_anchor, id_anchor_start, id_anchor_both, id_exception,
id_submatches, id_startpos;

inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
if (encoding == RE2::Options::EncodingUTF8) {
Expand Down Expand Up @@ -1339,51 +1340,102 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
* r = RE2::Regexp.new('woo')
* r.match('woo') #=> true
*
* @overload match(text, 0)
* Returns either true or false indicating whether a
* successful match was made.
*
* @param [String] text the text to search
* @return [Boolean] whether the match was successful
* @raise [NoMemoryError] if there was not enough memory to allocate the submatches
* @example
* r = RE2::Regexp.new('w(o)(o)')
* r.match('woo', 0) #=> true
* r.match('bob', 0) #=> false
*
* @overload match(text, number_of_submatches)
* @overload match(text, options)
* See +match(text)+ but with a specific number of
* submatches returned (padded with nils if necessary).
*
* @param [String] text the text to search
* @param [Integer] number_of_submatches the number of submatches to return
* @return [RE2::MatchData] the submatches
* @raise [ArgumentError] if given a negative number of submatches
* @param [Hash] options the options with which to perform the match
* @option options [Integer] :startpos (0) offset at which to start matching
* @option options [Symbol] :anchor (:unanchored) one of :unanchored, :anchor_start, :anchor_both to anchor the match
* @option options [Integer] :submatches how many submatches to extract (0 is
* fastest), defaults to the number of capturing groups
* @return [RE2::MatchData] if extracting any submatches
* @return [Boolean] if not extracting any submatches
* @raise [ArgumentError] if given a negative number of submatches or invalid anchor
* @raise [NoMemoryError] if there was not enough memory to allocate the matches
* @raise [TypeError] if given non-String text, non-numeric number of
* submatches, non-symbol anchor or non-hash options
* @example
* r = RE2::Regexp.new('w(o)(o)')
* r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
* r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
* r.match('woo', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
* r.match('woo', submatches: 3) # => #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
* r.match('woot', anchor: :anchor_both, submatches: 0)
* # => false
* r.match('woot', anchor: :anchor_start, submatches: 0)
* # => true
*/
static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
re2_pattern *p;
re2_matchdata *m;
VALUE text, number_of_submatches;
VALUE text, options;

rb_scan_args(argc, argv, "11", &text, &number_of_submatches);
rb_scan_args(argc, argv, "11", &text, &options);

/* Ensure text is a string. */
StringValue(text);

TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);

int n;
int startpos = 0;
RE2::Anchor anchor = RE2::UNANCHORED;

if (RTEST(options)) {
if (FIXNUM_P(options)) {
n = NUM2INT(options);

if (RTEST(number_of_submatches)) {
n = NUM2INT(number_of_submatches);
if (n < 0) {
rb_raise(rb_eArgError, "number of matches should be >= 0");
}
} else {
if (TYPE(options) != T_HASH) {
options = rb_Hash(options);
}

if (n < 0) {
rb_raise(rb_eArgError, "number of matches should be >= 0");
VALUE anchor_option = rb_hash_aref(options, ID2SYM(id_anchor));
if (!NIL_P(anchor_option)) {
Check_Type(anchor_option, T_SYMBOL);

ID id_anchor_option = SYM2ID(anchor_option);
if (id_anchor_option == id_unanchored) {
anchor = RE2::UNANCHORED;
} else if (id_anchor_option == id_anchor_start) {
anchor = RE2::ANCHOR_START;
} else if (id_anchor_option == id_anchor_both) {
anchor = RE2::ANCHOR_BOTH;
} else {
rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
}
}

VALUE submatches_option = rb_hash_aref(options, ID2SYM(id_submatches));
if (!NIL_P(submatches_option)) {
Check_Type(submatches_option, T_FIXNUM);

n = NUM2INT(submatches_option);

if (n < 0) {
rb_raise(rb_eArgError, "number of matches should be >= 0");
}
} else {
if (!p->pattern->ok()) {
return Qnil;
}

n = p->pattern->NumberOfCapturingGroups();
}

VALUE startpos_option = rb_hash_aref(options, ID2SYM(id_startpos));
if (!NIL_P(startpos_option)) {
Check_Type(startpos_option, T_FIXNUM);

startpos = NUM2INT(startpos_option);

if (startpos < 0) {
rb_raise(rb_eArgError, "startpos should be >= 0");
}
}
}
} else {
if (!p->pattern->ok()) {
Expand All @@ -1395,10 +1447,10 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {

if (n == 0) {
#ifdef HAVE_ENDPOS_ARGUMENT
bool matched = p->pattern->Match(RSTRING_PTR(text), 0,
RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
bool matched = p->pattern->Match(RSTRING_PTR(text), startpos,
RSTRING_LEN(text), anchor, 0, 0);
#else
bool matched = p->pattern->Match(RSTRING_PTR(text), 0, RE2::UNANCHORED,
bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, anchor,
0, 0);
#endif
return BOOL2RUBY(matched);
Expand All @@ -1423,11 +1475,11 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
m->number_of_matches = n;

#ifdef HAVE_ENDPOS_ARGUMENT
bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
RSTRING_LEN(m->text), RE2::UNANCHORED, m->matches, n);
bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
RSTRING_LEN(m->text), anchor, m->matches, n);
#else
bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
RE2::UNANCHORED, m->matches, n);
bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
anchor, m->matches, n);
#endif
if (matched) {
return matchdata;
Expand Down Expand Up @@ -2032,7 +2084,10 @@ extern "C" void Init_re2(void) {
id_word_boundary = rb_intern("word_boundary");
id_one_line = rb_intern("one_line");
id_unanchored = rb_intern("unanchored");
id_anchor = rb_intern("anchor");
id_anchor_start = rb_intern("anchor_start");
id_anchor_both = rb_intern("anchor_both");
id_exception = rb_intern("exception");
id_submatches = rb_intern("submatches");
id_startpos = rb_intern("startpos");
}
1 change: 1 addition & 0 deletions lib/re2.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@
require 're2.so'
end

require "re2/regexp"
require "re2/scanner"
require "re2/version"
62 changes: 62 additions & 0 deletions lib/re2/regexp.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
module RE2
  class Regexp
    # Search for the pattern anywhere inside +text+.
    #
    # This is a thin wrapper around +match+ that always sets the +:anchor+
    # option to +:unanchored+ (replacing any +:anchor+ the caller supplied)
    # and forwards every other option unchanged. The result is a boolean when
    # no submatches are requested, otherwise a {RE2::MatchData} holding the
    # requested number of submatches (by default, one per capturing group).
    #
    # Performance depends heavily on the number of submatches: asking for a
    # single submatch is much faster than asking for several, and asking for
    # none is faster still.
    #
    # @param [String] text the text to search
    # @param [Hash] options the options with which to perform the match
    # @option options [Integer] :submatches how many submatches to extract (0
    #   is fastest), defaults to the total number of capturing groups
    # @return [RE2::MatchData] if extracting any submatches
    # @return [Boolean] if not extracting any submatches
    # @raise [ArgumentError] if given a negative number of submatches
    # @raise [NoMemoryError] if there was not enough memory to allocate the
    #   matches
    # @raise [TypeError] if given non-numeric submatches or non-hash options
    # @example
    #   r = RE2::Regexp.new('w(o)(o)')
    #   r.partial_match('woot')
    #   # => #<RE2::MatchData "woo" 1:"o" 2:"o">
    #   r.partial_match('woot', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
    #   r.partial_match('woot', submatches: 0) # => true
    def partial_match(text, options = {})
      # Hash() coerces nil to {} and raises TypeError for non-hash values;
      # merge returns a new hash, so the caller's options are not modified.
      unanchored_options = Hash(options).merge(anchor: :unanchored)

      match(text, unanchored_options)
    end

    # Require the pattern to match the whole of +text+.
    #
    # This is a thin wrapper around +match+ that always sets the +:anchor+
    # option to +:anchor_both+ (replacing any +:anchor+ the caller supplied)
    # and forwards every other option unchanged. The result is a boolean when
    # no submatches are requested, otherwise a {RE2::MatchData} holding the
    # requested number of submatches (by default, one per capturing group).
    #
    # Performance depends heavily on the number of submatches: asking for a
    # single submatch is much faster than asking for several, and asking for
    # none is faster still.
    #
    # @param [String] text the text to search
    # @param [Hash] options the options with which to perform the match
    # @option options [Integer] :submatches how many submatches to extract (0
    #   is fastest), defaults to the total number of capturing groups
    # @return [RE2::MatchData] if extracting any submatches
    # @return [Boolean] if not extracting any submatches
    # @raise [ArgumentError] if given a negative number of submatches
    # @raise [NoMemoryError] if there was not enough memory to allocate the
    #   matches
    # @raise [TypeError] if given non-numeric submatches or non-hash options
    # @example
    #   r = RE2::Regexp.new('w(o)(o)')
    #   r.full_match('woo')
    #   # => #<RE2::MatchData "woo" 1:"o" 2:"o">
    #   r.full_match('woo', submatches: 1) # => #<RE2::MatchData "woo" 1:"o">
    #   r.full_match('woo', submatches: 0) # => true
    #   r.full_match('woot') # => nil
    def full_match(text, options = {})
      # Hash() coerces nil to {} and raises TypeError for non-hash values;
      # merge returns a new hash, so the caller's options are not modified.
      fully_anchored_options = Hash(options).merge(anchor: :anchor_both)

      match(text, fully_anchored_options)
    end
  end
end
1 change: 1 addition & 0 deletions re2.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Gem::Specification.new do |s|
"ext/re2/recipes.rb",
"Gemfile",
"lib/re2.rb",
"lib/re2/regexp.rb",
"lib/re2/scanner.rb",
"lib/re2/string.rb",
"lib/re2/version.rb",
Expand Down
Loading

0 comments on commit b909f8c

Please sign in to comment.