diff --git a/ext/re2/re2.cc b/ext/re2/re2.cc index 7923e9b..475d750 100644 --- a/ext/re2/re2.cc +++ b/ext/re2/re2.cc @@ -336,7 +336,8 @@ static VALUE re2_scanner_rewind(VALUE self) { TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c); delete c->input; - c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(c->text)); + c->input = new(std::nothrow) re2::StringPiece( + RSTRING_PTR(c->text), RSTRING_LEN(c->text)); c->eof = false; return self; @@ -425,10 +426,20 @@ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) { if (FIXNUM_P(idx)) { id = FIX2INT(idx); + } else if (SYMBOL_P(idx)) { + const std::map& groups = p->pattern->NamedCapturingGroups(); + std::map::const_iterator search = groups.find(rb_id2name(SYM2ID(idx))); + + if (search != groups.end()) { + id = search->second; + } else { + return NULL; + } } else { - const char *name = SYMBOL_P(idx) ? rb_id2name(SYM2ID(idx)) : StringValuePtr(idx); + StringValue(idx); + const std::map& groups = p->pattern->NamedCapturingGroups(); - std::map::const_iterator search = groups.find(name); + std::map::const_iterator search = groups.find(std::string(RSTRING_PTR(idx), RSTRING_LEN(idx))); if (search != groups.end()) { id = search->second; @@ -611,7 +622,7 @@ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) { } } -static VALUE re2_matchdata_named_match(const char* name, const VALUE self) { +static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self) { re2_matchdata *m; re2_pattern *p; @@ -678,7 +689,8 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) { rb_scan_args(argc, argv, "11", &idx, &rest); if (TYPE(idx) == T_STRING) { - return re2_matchdata_named_match(RSTRING_PTR(idx), self); + return re2_matchdata_named_match( + std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)), self); } else if (SYMBOL_P(idx)) { return re2_matchdata_named_match(rb_id2name(SYM2ID(idx)), self); } else if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) { @@ -731,7 +743,8 @@ static VALUE re2_matchdata_inspect(const VALUE self) { if (match == Qnil) { output << "nil"; } else { - output << "\"" << RSTRING_PTR(match) << "\""; + output << "\"" << re2::StringPiece(RSTRING_PTR(match), + RSTRING_LEN(match)) << "\""; } } @@ -910,9 +923,11 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) { RE2::Options re2_options; parse_re2_options(&re2_options, options); - p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern), re2_options); + p->pattern = new(std::nothrow) RE2( + re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), re2_options); } else { - p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern)); + p->pattern = new(std::nothrow) RE2( + re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern))); } if (p->pattern == 0) { @@ -1501,11 +1516,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) { if (n == 0) { #ifdef HAVE_ENDPOS_ARGUMENT - bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, - endpos, anchor, 0, 0); + bool matched = p->pattern->Match( + re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), + startpos, endpos, anchor, 0, 0); #else - bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, anchor, - 0, 0); + bool matched = p->pattern->Match( + re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), + startpos, anchor, 0, 0); #endif return BOOL2RUBY(matched); } else { @@ -1529,11 +1546,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) { m->number_of_matches = n; #ifdef HAVE_ENDPOS_ARGUMENT - bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos, - endpos, anchor, m->matches, n); + bool matched = p->pattern->Match( + re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)), + startpos, endpos, anchor, m->matches, n); #else - bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos, - anchor, m->matches, n); + bool matched = p->pattern->Match( + re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)), + startpos, anchor, m->matches, n); #endif if (matched) { return matchdata; @@ -1559,7 +1578,8 @@ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) { TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p); - return BOOL2RUBY(RE2::PartialMatch(RSTRING_PTR(text), *p->pattern)); + return BOOL2RUBY(RE2::PartialMatch( + re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern)); } /* @@ -1578,7 +1598,8 @@ static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) { TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p); - return BOOL2RUBY(RE2::FullMatch(RSTRING_PTR(text), *p->pattern)); + return BOOL2RUBY(RE2::FullMatch( + re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern)); } /* @@ -1604,7 +1625,8 @@ static VALUE re2_regexp_scan(const VALUE self, VALUE text) { VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner); TypedData_Get_Struct(scanner, re2_scanner, &re2_scanner_data_type, c); - c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(text)); + c->input = new(std::nothrow) re2::StringPiece( + RSTRING_PTR(text), RSTRING_LEN(text)); RB_OBJ_WRITE(scanner, &c->regexp, self); RB_OBJ_WRITE(scanner, &c->text, text); @@ -1669,12 +1691,14 @@ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern, /* Take a copy of str so it can be modified in-place by * RE2::Replace. */ - std::string str_as_string(StringValuePtr(str)); + StringValue(str); + std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str)); /* Do the replacement. */ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) { TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p); - RE2::Replace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite)); + RE2::Replace(&str_as_string, *p->pattern, + re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite))); return encoded_str_new(str_as_string.data(), str_as_string.size(), p->pattern->options().encoding()); @@ -1682,7 +1706,9 @@ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern, /* Ensure pattern is a string. */ StringValue(pattern); - RE2::Replace(&str_as_string, RSTRING_PTR(pattern), RSTRING_PTR(rewrite)); + RE2::Replace(&str_as_string, + re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), + re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite))); return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8); } @@ -1717,12 +1743,14 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern, * RE2::GlobalReplace. */ re2_pattern *p; - std::string str_as_string(StringValuePtr(str)); + StringValue(str); + std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str)); /* Do the replacement. */ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) { TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p); - RE2::GlobalReplace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite)); + RE2::GlobalReplace(&str_as_string, *p->pattern, + re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite))); return encoded_str_new(str_as_string.data(), str_as_string.size(), p->pattern->options().encoding()); @@ -1730,8 +1758,9 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern, /* Ensure pattern is a string. */ StringValue(pattern); - RE2::GlobalReplace(&str_as_string, RSTRING_PTR(pattern), - RSTRING_PTR(rewrite)); + RE2::GlobalReplace(&str_as_string, + re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), + re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite))); return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8); } @@ -1753,7 +1782,8 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern, static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) { StringValue(unquoted); - std::string quoted_string = RE2::QuoteMeta(RSTRING_PTR(unquoted)); + std::string quoted_string = RE2::QuoteMeta( + re2::StringPiece(RSTRING_PTR(unquoted), RSTRING_LEN(unquoted))); return rb_str_new(quoted_string.data(), quoted_string.size()); } @@ -1902,7 +1932,8 @@ static VALUE re2_set_add(VALUE self, VALUE pattern) { { std::string err; - index = s->set->Add(RSTRING_PTR(pattern), &err); + index = s->set->Add( + re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), &err); strlcpy(msg, err.c_str(), sizeof(msg)); } @@ -2009,7 +2040,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) { if (raise_exception) { #ifdef HAVE_ERROR_INFO_ARGUMENT RE2::Set::ErrorInfo e; - bool match_failed = !s->set->Match(RSTRING_PTR(str), &v, &e); + bool match_failed = !s->set->Match( + re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v, &e); VALUE result = rb_ary_new2(v.size()); if (match_failed) { @@ -2036,7 +2068,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) { rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false"); #endif } else { - bool matched = s->set->Match(RSTRING_PTR(str), &v); + bool matched = s->set->Match( + re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v); VALUE result = rb_ary_new2(v.size()); if (matched) { diff --git a/spec/kernel_spec.rb b/spec/kernel_spec.rb index 3da28ec..93a9277 100644 --- a/spec/kernel_spec.rb +++ b/spec/kernel_spec.rb @@ -10,6 +10,12 @@ expect(re).not_to be_case_sensitive end + it "accepts patterns containing null bytes" do + re = RE2("a\0b") + + expect(re.pattern).to eq("a\0b") + end + it "raises an error if given an inappropriate type" do expect { RE2(nil) }.to raise_error(TypeError) end diff --git a/spec/re2/match_data_spec.rb b/spec/re2/match_data_spec.rb index a8a9fdf..a4e66a7 100644 --- a/spec/re2/match_data_spec.rb +++ b/spec/re2/match_data_spec.rb @@ -190,7 +190,7 @@ describe "#inspect" do it "returns a text representation of the object and indices" do md = RE2::Regexp.new('(\d+) (\d+)').match("1234 56") - + expect(md.inspect).to eq('#') end @@ -199,6 +199,12 @@ expect(md.inspect).to eq('#') end + + it "supports matches with null bytes" do + md = RE2::Regexp.new("(\\w\0\\w) (\\w\0\\w)").match("a\0b c\0d") + + expect(md.inspect).to eq("#") + end end describe "#to_s" do @@ -239,6 +245,12 @@ expect(md.string[md.begin(:foo)..-1]).to eq('foobar') end + it "returns the offset of the start of a match by something that can be coerced to a String" do + md = RE2::Regexp.new('(?Pfo{2})').match('a foobar') + + expect(md.string[md.begin(StringLike.new("foo"))..-1]).to eq('foobar') + end + it "returns the offset despite multibyte characters" do md = RE2::Regexp.new('(Ruby)').match('I ♥ Ruby') @@ -268,6 +280,12 @@ expect(md.begin(:foo)).to be_nil end + + it "raises a type error if given an invalid name or number" do + md = RE2::Regexp.new('(\d)').match('123') + + expect { md.begin(nil) }.to raise_error(TypeError) + end end describe "#end" do @@ -289,6 +307,12 @@ expect(md.string[0...md.end(:foo)]).to eq('a foo') end + it "returns the offset of a match by something that can be coerced to a String" do + md = RE2::Regexp.new('(?Pfo{2})').match('a foobar') + + expect(md.string[0...md.end(StringLike.new("foo"))]).to eq('a foo') + end + it "returns the offset despite multibyte characters" do md = RE2::Regexp.new('(Ruby)').match('I ♥ Ruby') @@ -318,6 +342,12 @@ expect(md.end(:foo)).to be_nil end + + it "raises a type error if given an invalid name or number" do + md = RE2::Regexp.new('(\d)').match('123') + + expect { md.end(nil) }.to raise_error(TypeError) + end end describe "#deconstruct" do diff --git a/spec/re2/regexp_spec.rb b/spec/re2/regexp_spec.rb index 837601f..2ff059a 100644 --- a/spec/re2/regexp_spec.rb +++ b/spec/re2/regexp_spec.rb @@ -12,6 +12,12 @@ expect(re).to be_a(RE2::Regexp) end + it "accepts patterns containing null bytes" do + re = RE2::Regexp.new("a\0b") + + expect(re.pattern).to eq("a\0b") + end + it "raises an error if given an inappropriate type" do expect { RE2::Regexp.new(nil) }.to raise_error(TypeError) end @@ -41,6 +47,12 @@ expect(re).to be_a(RE2::Regexp) end + it "accepts patterns containing null bytes" do + re = RE2::Regexp.compile("a\0b") + + expect(re.pattern).to eq("a\0b") + end + it "raises an error if given an inappropriate type" do expect { RE2::Regexp.compile(nil) }.to raise_error(TypeError) end @@ -339,6 +351,12 @@ expect(re.match("My name is Alice Bloggs")).to eq(true) end + it "supports matching against text containing null bytes" do + re = RE2::Regexp.new("a\0b") + + expect(re.match("a\0b")).to eq(true) + end + it "returns nil if the text does not match the pattern" do re = RE2::Regexp.new('My name is (\w+) (\w+)') @@ -511,6 +529,13 @@ expect(md[3]).to eq("three") end + it "supports extracting submatches containing null bytes" do + re = RE2::Regexp.new("(a\0b)") + md = re.match("a\0bc") + + expect(md[1]).to eq("a\0b") + end + it "extracts a specific number of submatches", :aggregate_failures do re = RE2::Regexp.new('(\w+) (\w+) (\w+)') md = re.match("one two three", submatches: 2) @@ -599,6 +624,13 @@ expect(re.partial_match?("My age is 99")).to eq(false) end + it "supports matching against text containing null bytes", :aggregate_failures do + re = RE2::Regexp.new("a\0b") + + expect(re.partial_match?("a\0b")).to eq(true) + expect(re.partial_match?("ab")).to eq(false) + end + it "returns false if the pattern is invalid" do re = RE2::Regexp.new('???', log_errors: false) @@ -620,6 +652,13 @@ expect(re =~ "My age is 99").to eq(false) end + it "supports matching against text containing null bytes", :aggregate_failures do + re = RE2::Regexp.new("a\0b") + + expect(re =~ "a\0b").to eq(true) + expect(re =~ "ab").to eq(false) + end + it "returns false if the pattern is invalid" do re = RE2::Regexp.new('???', log_errors: false) @@ -662,6 +701,13 @@ expect(re.full_match?("My name is Alice Bloggs and I am 99")).to eq(false) end + it "supports matching against text containing null bytes", :aggregate_failures do + re = RE2::Regexp.new("a\0b") + + expect(re.full_match?("a\0b")).to eq(true) + expect(re.full_match?("a\0bc")).to eq(false) + end + it "returns false if the pattern is invalid" do re = RE2::Regexp.new('???', log_errors: false) @@ -742,6 +788,12 @@ expect(scanner).to be_a(RE2::Scanner) end + + it "raises a type error if given invalid input" do + r = RE2::Regexp.new('(\w+)') + + expect { r.scan(nil) }.to raise_error(TypeError) + end end describe "#partial_match" do diff --git a/spec/re2/scanner_spec.rb b/spec/re2/scanner_spec.rb index 4d75fc6..53b11f8 100644 --- a/spec/re2/scanner_spec.rb +++ b/spec/re2/scanner_spec.rb @@ -34,6 +34,16 @@ expect(scanner.scan).to be_nil end + it "supports scanning inputs with null bytes", :aggregate_failures do + r = RE2::Regexp.new("(\\w\0\\w)") + scanner = r.scan("a\0b c\0d e\0f") + + expect(scanner.scan).to eq(["a\0b"]) + expect(scanner.scan).to eq(["c\0d"]) + expect(scanner.scan).to eq(["e\0f"]) + expect(scanner.scan).to be_nil + end + it "returns UTF-8 matches if the pattern is UTF-8" do r = RE2::Regexp.new('(\w+)') scanner = r.scan("It") @@ -190,6 +200,18 @@ expect(scanner.to_enum.first).to eq(["1"]) end + it "supports inputs with null bytes", :aggregate_failures do + r = RE2::Regexp.new("(\\w\0\\w)") + scanner = r.scan("a\0b c\0d") + + expect(scanner.to_enum.first).to eq(["a\0b"]) + expect(scanner.to_enum.first).to eq(["c\0d"]) + + scanner.rewind + + expect(scanner.to_enum.first).to eq(["a\0b"]) + end + it "resets the eof? check", :aggregate_failures do r = RE2::Regexp.new('(\d)') scanner = r.scan("1") diff --git a/spec/re2/set_spec.rb b/spec/re2/set_spec.rb index 54371ef..e8be3e9 100644 --- a/spec/re2/set_spec.rb +++ b/spec/re2/set_spec.rb @@ -123,6 +123,14 @@ expect(set.match("def", exception: false)).to be_empty end + it "supports matching null bytes", :aggregate_failures do + set = RE2::Set.new + set.add("a\0b") + set.compile + + expect(set.match("a\0b", exception: false)).to eq([0]) + end + it "returns an empty array if there is no match when :exception is true" do skip "Underlying RE2::Set::Match does not output error information" unless RE2::Set.match_raises_errors? diff --git a/spec/re2_spec.rb b/spec/re2_spec.rb index 4535df2..2a41bd9 100644 --- a/spec/re2_spec.rb +++ b/spec/re2_spec.rb @@ -4,6 +4,18 @@ expect(RE2.Replace("woo", "o", "a")).to eq("wao") end + it "supports inputs with null bytes" do + expect(RE2.Replace("w\0oo", "o", "a")).to eq("w\0ao") + end + + it "supports patterns with null bytes" do + expect(RE2.Replace("w\0oo", "\0", "o")).to eq("wooo") + end + + it "supports replacements with null bytes" do + expect(RE2.Replace("woo", "o", "\0")).to eq("w\0o") + end + it "performs replacement based on regular expressions" do expect(RE2.Replace("woo", "o+", "e")).to eq("we") end @@ -82,6 +94,18 @@ expect(RE2.GlobalReplace("woo", "o", "a")).to eq("waa") end + it "supports inputs with null bytes" do + expect(RE2.GlobalReplace("w\0oo", "o", "a")).to eq("w\0aa") + end + + it "supports patterns with null bytes" do + expect(RE2.GlobalReplace("w\0\0oo", "\0", "a")).to eq("waaoo") + end + + it "supports replacements with null bytes" do + expect(RE2.GlobalReplace("woo", "o", "\0")).to eq("w\0\0") + end + it "performs replacement based on regular expressions" do expect(RE2.GlobalReplace("woohoo", "o+", "e")).to eq("wehe") end @@ -167,5 +191,9 @@ it "supports passing something that can be coerced to a String as input" do expect(RE2.QuoteMeta(StringLike.new("1.5"))).to eq('1\.5') end + + it "supports strings containing null bytes" do + expect(RE2.QuoteMeta("abc\0def")).to eq('abc\x00def') + end end end