From da95224179689d809534b50001d9d93765073d55 Mon Sep 17 00:00:00 2001 From: Paul Mucur Date: Sat, 22 Oct 2022 07:49:27 +0100 Subject: [PATCH] Add RE2::MatchData#deconstruct_keys GitHub: https://github.com/mudge/re2/issues/57 In order to support pattern matching in Ruby, add a deconstruct_keys method to RE2::MatchData that will return all named matches if given nil or a subset of matches if given an array of capturing group names as symbols. To match [Ruby's implementation of this](https://github.com/ruby/ruby/pull/6216/) there are a few edge cases: 1. If the number of keys given exceeds the number of named capturing groups, immediately return an empty hash. 2. Each key is added to the hash in turn and the first invalid key will cause the hash to be returned so it is possible to get a partial set of named captures. --- ext/re2/re2.cc | 68 +++++++++++++++++++++++++++++++++++++ spec/re2/match_data_spec.rb | 44 ++++++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/ext/re2/re2.cc b/ext/re2/re2.cc index 96f7218..18f9922 100644 --- a/ext/re2/re2.cc +++ b/ext/re2/re2.cc @@ -695,6 +695,7 @@ static VALUE re2_matchdata_inspect(VALUE self) { * m = RE2::Regexp.new('(\d+)').match("bob 123") * m.deconstruct #=> ["123"] * + * @example pattern matching * case RE2::Regexp.new('(\d+) (\d+)').match("bob 123 456") * in x, y * puts "Matched #{x} #{y}" @@ -727,6 +728,71 @@ static VALUE re2_matchdata_deconstruct(VALUE self) { return array; } +/* + * Returns a hash of capturing group names to submatches for pattern matching. + * + * As this is used by Ruby's pattern matching, it will return an empty hash if given + * more keys than there are capturing groups. Given keys will populate the hash in + * order but an invalid name will cause the hash to be immediately returned. 
+ * + * @return [Hash] a hash of capturing group names to submatches + * @param [Array, nil] keys an array of Symbol capturing group names or nil to return all names + * @example + * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc') + * m.deconstruct_keys(nil) #=> {:numbers => "123", :letters => "abc"} + * m.deconstruct_keys([:numbers]) #=> {:numbers => "123"} + * m.deconstruct_keys([:fruit]) #=> {} + * m.deconstruct_keys([:letters, :fruit]) #=> {:letters => "abc"} + * + * @example pattern matching + * case RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc') + * in numbers:, letters: + * puts "Numbers: #{numbers}, letters: #{letters}" + * else + * puts "Unrecognised match" + * end + */ +static VALUE re2_matchdata_deconstruct_keys(VALUE self, VALUE keys) { + int i; + VALUE capturing_groups, key; + re2_matchdata *m; + re2_pattern *p; + map<string, int> groups; + map<string, int>::iterator iterator; + + Data_Get_Struct(self, re2_matchdata, m); + Data_Get_Struct(m->regexp, re2_pattern, p); + + groups = p->pattern->NamedCapturingGroups(); + capturing_groups = rb_hash_new(); + + if (NIL_P(keys)) { + for (iterator = groups.begin(); iterator != groups.end(); iterator++) { + rb_hash_aset(capturing_groups, + ID2SYM(rb_intern(iterator->first.data())), + re2_matchdata_nth_match(iterator->second, self)); + } + } else { + Check_Type(keys, T_ARRAY); + + if (p->pattern->NumberOfCapturingGroups() >= RARRAY_LEN(keys)) { + for (i = 0; i < RARRAY_LEN(keys); i++) { + key = RARRAY_AREF(keys, i); + Check_Type(key, T_SYMBOL); + string name(rb_id2name(SYM2ID(key))); + + if (groups.count(name) == 0) { + break; + } + + rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(groups[name], self)); + } + } + } + + return capturing_groups; +} + /* * Returns a new RE2 object with a compiled version of * +pattern+ stored inside. Equivalent to +RE2.new+. 
@@ -1708,6 +1774,8 @@ void Init_re2(void) { RUBY_METHOD_FUNC(re2_matchdata_inspect), 0); rb_define_method(re2_cMatchData, "deconstruct", RUBY_METHOD_FUNC(re2_matchdata_deconstruct), 0); + rb_define_method(re2_cMatchData, "deconstruct_keys", + RUBY_METHOD_FUNC(re2_matchdata_deconstruct_keys), 1); rb_define_method(re2_cScanner, "string", RUBY_METHOD_FUNC(re2_scanner_string), 0); diff --git a/spec/re2/match_data_spec.rb b/spec/re2/match_data_spec.rb index 2c77a25..c260c65 100644 --- a/spec/re2/match_data_spec.rb +++ b/spec/re2/match_data_spec.rb @@ -255,4 +255,48 @@ expect(md.deconstruct).to eq(['o', 'o', nil]) end end + + describe "#deconstruct_keys" do + it "returns all named captures if given nil" do + md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc') + + expect(md.deconstruct_keys(nil)).to eq(:numbers => '123', :letters => 'abc') + end + + it "returns only named captures if given names" do + md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc') + + expect(md.deconstruct_keys([:numbers])).to eq(:numbers => '123') + end + + it "returns named captures up until an invalid name is given" do + md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc') + + expect(md.deconstruct_keys([:numbers, :punctuation])).to eq(:numbers => '123') + end + + it "returns an empty hash if given more capture names than exist" do + md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc') + + expect(md.deconstruct_keys([:numbers, :letters, :punctuation])).to eq({}) + end + + it "returns an empty hash if there are no named capturing groups" do + md = RE2::Regexp.new('(\d+) ([a-zA-Z]+)').match('123 abc') + + expect(md.deconstruct_keys(nil)).to eq({}) + end + + it "raises an error if given a non-array of keys" do + md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc') + + expect { md.deconstruct_keys(0) }.to raise_error(TypeError) + end + + it "raises an error if given keys as non-symbols" do + md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 
abc') + + expect { md.deconstruct_keys([0]) }.to raise_error(TypeError) + end + end end