Skip to content

Commit

Permalink
Support null bytes in string inputs
Browse files Browse the repository at this point in the history
GitHub: #130

Ensure that whenever we pass Ruby string data into RE2 we use the data's
explicit length as returned by RSTRING_LEN rather than relying on
null-termination. RSTRING_PTR doesn't guarantee that the data is null-terminated (see
https://docs.ruby-lang.org/en/3.3/extension_rdoc.html#label-VALUE+type+conversion),
so we can end up either truncating input at an embedded null byte or, worse, over-reading past the end of the string.
  • Loading branch information
mudge committed Jan 20, 2024
1 parent abc1a98 commit c9f728e
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 30 deletions.
93 changes: 63 additions & 30 deletions ext/re2/re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,8 @@ static VALUE re2_scanner_rewind(VALUE self) {
TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);

delete c->input;
c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(c->text));
c->input = new(std::nothrow) re2::StringPiece(
RSTRING_PTR(c->text), RSTRING_LEN(c->text));
c->eof = false;

return self;
Expand Down Expand Up @@ -425,10 +426,20 @@ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {

if (FIXNUM_P(idx)) {
id = FIX2INT(idx);
} else if (SYMBOL_P(idx)) {
const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
std::map<std::string, int>::const_iterator search = groups.find(rb_id2name(SYM2ID(idx)));

if (search != groups.end()) {
id = search->second;
} else {
return NULL;
}
} else {
const char *name = SYMBOL_P(idx) ? rb_id2name(SYM2ID(idx)) : StringValuePtr(idx);
StringValue(idx);

const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
std::map<std::string, int>::const_iterator search = groups.find(name);
std::map<std::string, int>::const_iterator search = groups.find(std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)));

if (search != groups.end()) {
id = search->second;
Expand Down Expand Up @@ -611,7 +622,7 @@ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
}
}

static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self) {
re2_matchdata *m;
re2_pattern *p;

Expand Down Expand Up @@ -678,7 +689,8 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
rb_scan_args(argc, argv, "11", &idx, &rest);

if (TYPE(idx) == T_STRING) {
return re2_matchdata_named_match(RSTRING_PTR(idx), self);
return re2_matchdata_named_match(
std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)), self);
} else if (SYMBOL_P(idx)) {
return re2_matchdata_named_match(rb_id2name(SYM2ID(idx)), self);
} else if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
Expand Down Expand Up @@ -731,7 +743,8 @@ static VALUE re2_matchdata_inspect(const VALUE self) {
if (match == Qnil) {
output << "nil";
} else {
output << "\"" << RSTRING_PTR(match) << "\"";
output << "\"" << re2::StringPiece(RSTRING_PTR(match),
RSTRING_LEN(match)) << "\"";
}
}

Expand Down Expand Up @@ -910,9 +923,11 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
RE2::Options re2_options;
parse_re2_options(&re2_options, options);

p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern), re2_options);
p->pattern = new(std::nothrow) RE2(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), re2_options);
} else {
p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern));
p->pattern = new(std::nothrow) RE2(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)));
}

if (p->pattern == 0) {
Expand Down Expand Up @@ -1501,11 +1516,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {

if (n == 0) {
#ifdef HAVE_ENDPOS_ARGUMENT
bool matched = p->pattern->Match(RSTRING_PTR(text), startpos,
endpos, anchor, 0, 0);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
startpos, endpos, anchor, 0, 0);
#else
bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, anchor,
0, 0);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
startpos, anchor, 0, 0);
#endif
return BOOL2RUBY(matched);
} else {
Expand All @@ -1529,11 +1546,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
m->number_of_matches = n;

#ifdef HAVE_ENDPOS_ARGUMENT
bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
endpos, anchor, m->matches, n);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
startpos, endpos, anchor, m->matches, n);
#else
bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
anchor, m->matches, n);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
startpos, anchor, m->matches, n);
#endif
if (matched) {
return matchdata;
Expand All @@ -1559,7 +1578,8 @@ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {

TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);

return BOOL2RUBY(RE2::PartialMatch(RSTRING_PTR(text), *p->pattern));
return BOOL2RUBY(RE2::PartialMatch(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
}

/*
Expand All @@ -1578,7 +1598,8 @@ static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) {

TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);

return BOOL2RUBY(RE2::FullMatch(RSTRING_PTR(text), *p->pattern));
return BOOL2RUBY(RE2::FullMatch(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
}

/*
Expand All @@ -1604,7 +1625,8 @@ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
TypedData_Get_Struct(scanner, re2_scanner, &re2_scanner_data_type, c);

c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(text));
c->input = new(std::nothrow) re2::StringPiece(
RSTRING_PTR(text), RSTRING_LEN(text));
RB_OBJ_WRITE(scanner, &c->regexp, self);
RB_OBJ_WRITE(scanner, &c->text, text);

Expand Down Expand Up @@ -1669,20 +1691,24 @@ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
/* Take a copy of str so it can be modified in-place by
* RE2::Replace.
*/
std::string str_as_string(StringValuePtr(str));
StringValue(str);
std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));

/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
RE2::Replace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
RE2::Replace(&str_as_string, *p->pattern,
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(),
p->pattern->options().encoding());
} else {
/* Ensure pattern is a string. */
StringValue(pattern);

RE2::Replace(&str_as_string, RSTRING_PTR(pattern), RSTRING_PTR(rewrite));
RE2::Replace(&str_as_string,
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
}
Expand Down Expand Up @@ -1717,21 +1743,24 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
* RE2::GlobalReplace.
*/
re2_pattern *p;
std::string str_as_string(StringValuePtr(str));
StringValue(str);
std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));

/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
RE2::GlobalReplace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
RE2::GlobalReplace(&str_as_string, *p->pattern,
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(),
p->pattern->options().encoding());
} else {
/* Ensure pattern is a string. */
StringValue(pattern);

RE2::GlobalReplace(&str_as_string, RSTRING_PTR(pattern),
RSTRING_PTR(rewrite));
RE2::GlobalReplace(&str_as_string,
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
}
Expand All @@ -1753,7 +1782,8 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
StringValue(unquoted);

std::string quoted_string = RE2::QuoteMeta(RSTRING_PTR(unquoted));
std::string quoted_string = RE2::QuoteMeta(
re2::StringPiece(RSTRING_PTR(unquoted), RSTRING_LEN(unquoted)));

return rb_str_new(quoted_string.data(), quoted_string.size());
}
Expand Down Expand Up @@ -1902,7 +1932,8 @@ static VALUE re2_set_add(VALUE self, VALUE pattern) {

{
std::string err;
index = s->set->Add(RSTRING_PTR(pattern), &err);
index = s->set->Add(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), &err);
strlcpy(msg, err.c_str(), sizeof(msg));
}

Expand Down Expand Up @@ -2009,7 +2040,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
if (raise_exception) {
#ifdef HAVE_ERROR_INFO_ARGUMENT
RE2::Set::ErrorInfo e;
bool match_failed = !s->set->Match(RSTRING_PTR(str), &v, &e);
bool match_failed = !s->set->Match(
re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v, &e);
VALUE result = rb_ary_new2(v.size());

if (match_failed) {
Expand All @@ -2036,7 +2068,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
#endif
} else {
bool matched = s->set->Match(RSTRING_PTR(str), &v);
bool matched = s->set->Match(
re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v);
VALUE result = rb_ary_new2(v.size());

if (matched) {
Expand Down
6 changes: 6 additions & 0 deletions spec/kernel_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
expect(re).not_to be_case_sensitive
end

# Builds a pattern through the RE2() helper from a string with an embedded
# NUL byte ("\0") and checks that the pattern reads back unchanged, i.e. it
# is not cut short at the NUL.
it "accepts patterns containing null bytes" do
re = RE2("a\0b")

expect(re.pattern).to eq("a\0b")
end

# Passing nil instead of a pattern to RE2() is expected to raise TypeError.
it "raises an error if given an inappropriate type" do
expect { RE2(nil) }.to raise_error(TypeError)
end
Expand Down
24 changes: 24 additions & 0 deletions spec/re2/match_data_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,12 @@
expect(md.string[md.begin(:foo)..-1]).to eq('foobar')
end

# Looks up the named group "foo" with a StringLike object rather than a
# String or Symbol.
# NOTE(review): StringLike is defined outside this excerpt; presumably a spec
# helper that is implicitly convertible to a String -- confirm.
it "returns the offset of the start of a match by something that can be coerced to a String" do
md = RE2::Regexp.new('(?P<foo>fo{2})').match('a foobar')

# Slicing the original string from the reported start offset should yield
# the text beginning at the match.
expect(md.string[md.begin(StringLike.new("foo"))..-1]).to eq('foobar')
end

it "returns the offset despite multibyte characters" do
md = RE2::Regexp.new('(Ruby)').match('I ♥ Ruby')

Expand Down Expand Up @@ -268,6 +274,12 @@

expect(md.begin(:foo)).to be_nil
end

# nil is neither a group number nor a group name, so #begin is expected to
# raise TypeError.
it "raises a type error if given an invalid name or number" do
md = RE2::Regexp.new('(\d)').match('123')

expect { md.begin(nil) }.to raise_error(TypeError)
end
end

describe "#end" do
Expand All @@ -289,6 +301,12 @@
expect(md.string[0...md.end(:foo)]).to eq('a foo')
end

# Looks up the named group "foo" with a StringLike object rather than a
# String or Symbol.
# NOTE(review): StringLike is defined outside this excerpt; presumably a spec
# helper that is implicitly convertible to a String -- confirm.
it "returns the offset of a match by something that can be coerced to a String" do
md = RE2::Regexp.new('(?P<foo>fo{2})').match('a foobar')

# Slicing the original string up to (but excluding) the reported end offset
# should yield everything through the end of the match.
expect(md.string[0...md.end(StringLike.new("foo"))]).to eq('a foo')
end

it "returns the offset despite multibyte characters" do
md = RE2::Regexp.new('(Ruby)').match('I ♥ Ruby')

Expand Down Expand Up @@ -318,6 +336,12 @@

expect(md.end(:foo)).to be_nil
end

# nil is neither a group number nor a group name, so #end is expected to
# raise TypeError.
it "raises a type error if given an invalid name or number" do
md = RE2::Regexp.new('(\d)').match('123')

expect { md.end(nil) }.to raise_error(TypeError)
end
end

describe "#deconstruct" do
Expand Down
Loading

0 comments on commit c9f728e

Please sign in to comment.