Skip to content

Commit

Permalink
Support null bytes in string inputs
Browse files Browse the repository at this point in the history
GitHub: #130

Ensure that whenever we pass Ruby string data into RE2 we use the data's
explicit length as returned by RSTRING_LEN rather than relying on
null-termination. RSTRING_PTR doesn't guarantee a null-terminated buffer (see
https://docs.ruby-lang.org/en/3.3/extension_rdoc.html#label-VALUE+type+conversion)
and a Ruby string may itself contain null bytes, so we can end up either
truncating input at an embedded null byte or, worse, reading past the end of
the buffer.
  • Loading branch information
mudge committed Jan 20, 2024
1 parent abc1a98 commit deeea04
Show file tree
Hide file tree
Showing 7 changed files with 182 additions and 29 deletions.
89 changes: 60 additions & 29 deletions ext/re2/re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,8 @@ static VALUE re2_scanner_rewind(VALUE self) {
TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);

delete c->input;
c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(c->text));
c->input = new(std::nothrow) re2::StringPiece(
RSTRING_PTR(c->text), RSTRING_LEN(c->text));
c->eof = false;

return self;
Expand Down Expand Up @@ -426,7 +427,15 @@ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
if (FIXNUM_P(idx)) {
id = FIX2INT(idx);
} else {
const char *name = SYMBOL_P(idx) ? rb_id2name(SYM2ID(idx)) : StringValuePtr(idx);
std::string name;

if (SYMBOL_P(idx)) {
name = rb_id2name(SYM2ID(idx));
} else {
StringValue(idx);
name = std::string(RSTRING_PTR(idx), RSTRING_LEN(idx));
}

const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
std::map<std::string, int>::const_iterator search = groups.find(name);

Expand Down Expand Up @@ -611,7 +620,7 @@ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
}
}

static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self) {
re2_matchdata *m;
re2_pattern *p;

Expand Down Expand Up @@ -678,7 +687,8 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
rb_scan_args(argc, argv, "11", &idx, &rest);

if (TYPE(idx) == T_STRING) {
return re2_matchdata_named_match(RSTRING_PTR(idx), self);
return re2_matchdata_named_match(
std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)), self);
} else if (SYMBOL_P(idx)) {
return re2_matchdata_named_match(rb_id2name(SYM2ID(idx)), self);
} else if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
Expand Down Expand Up @@ -731,7 +741,8 @@ static VALUE re2_matchdata_inspect(const VALUE self) {
if (match == Qnil) {
output << "nil";
} else {
output << "\"" << RSTRING_PTR(match) << "\"";
output << "\"" << re2::StringPiece(RSTRING_PTR(match),
RSTRING_LEN(match)) << "\"";
}
}

Expand Down Expand Up @@ -910,9 +921,11 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
RE2::Options re2_options;
parse_re2_options(&re2_options, options);

p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern), re2_options);
p->pattern = new(std::nothrow) RE2(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), re2_options);
} else {
p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern));
p->pattern = new(std::nothrow) RE2(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)));
}

if (p->pattern == 0) {
Expand Down Expand Up @@ -1501,11 +1514,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {

if (n == 0) {
#ifdef HAVE_ENDPOS_ARGUMENT
bool matched = p->pattern->Match(RSTRING_PTR(text), startpos,
endpos, anchor, 0, 0);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
startpos, endpos, anchor, 0, 0);
#else
bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, anchor,
0, 0);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
startpos, anchor, 0, 0);
#endif
return BOOL2RUBY(matched);
} else {
Expand All @@ -1529,11 +1544,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
m->number_of_matches = n;

#ifdef HAVE_ENDPOS_ARGUMENT
bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
endpos, anchor, m->matches, n);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
startpos, endpos, anchor, m->matches, n);
#else
bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
anchor, m->matches, n);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
startpos, anchor, m->matches, n);
#endif
if (matched) {
return matchdata;
Expand All @@ -1559,7 +1576,8 @@ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {

TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);

return BOOL2RUBY(RE2::PartialMatch(RSTRING_PTR(text), *p->pattern));
return BOOL2RUBY(RE2::PartialMatch(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
}

/*
Expand All @@ -1578,7 +1596,8 @@ static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) {

TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);

return BOOL2RUBY(RE2::FullMatch(RSTRING_PTR(text), *p->pattern));
return BOOL2RUBY(RE2::FullMatch(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
}

/*
Expand All @@ -1604,7 +1623,8 @@ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
TypedData_Get_Struct(scanner, re2_scanner, &re2_scanner_data_type, c);

c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(text));
c->input = new(std::nothrow) re2::StringPiece(
RSTRING_PTR(text), RSTRING_LEN(text));
RB_OBJ_WRITE(scanner, &c->regexp, self);
RB_OBJ_WRITE(scanner, &c->text, text);

Expand Down Expand Up @@ -1669,20 +1689,24 @@ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
/* Take a copy of str so it can be modified in-place by
* RE2::Replace.
*/
std::string str_as_string(StringValuePtr(str));
StringValue(str);
std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));

/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
RE2::Replace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
RE2::Replace(&str_as_string, *p->pattern,
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(),
p->pattern->options().encoding());
} else {
/* Ensure pattern is a string. */
StringValue(pattern);

RE2::Replace(&str_as_string, RSTRING_PTR(pattern), RSTRING_PTR(rewrite));
RE2::Replace(&str_as_string,
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
}
Expand Down Expand Up @@ -1717,21 +1741,24 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
* RE2::GlobalReplace.
*/
re2_pattern *p;
std::string str_as_string(StringValuePtr(str));
StringValue(str);
std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));

/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
RE2::GlobalReplace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
RE2::GlobalReplace(&str_as_string, *p->pattern,
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(),
p->pattern->options().encoding());
} else {
/* Ensure pattern is a string. */
StringValue(pattern);

RE2::GlobalReplace(&str_as_string, RSTRING_PTR(pattern),
RSTRING_PTR(rewrite));
RE2::GlobalReplace(&str_as_string,
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
}
Expand All @@ -1753,7 +1780,8 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
StringValue(unquoted);

std::string quoted_string = RE2::QuoteMeta(RSTRING_PTR(unquoted));
std::string quoted_string = RE2::QuoteMeta(
re2::StringPiece(RSTRING_PTR(unquoted), RSTRING_LEN(unquoted)));

return rb_str_new(quoted_string.data(), quoted_string.size());
}
Expand Down Expand Up @@ -1902,7 +1930,8 @@ static VALUE re2_set_add(VALUE self, VALUE pattern) {

{
std::string err;
index = s->set->Add(RSTRING_PTR(pattern), &err);
index = s->set->Add(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), &err);
strlcpy(msg, err.c_str(), sizeof(msg));
}

Expand Down Expand Up @@ -2009,7 +2038,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
if (raise_exception) {
#ifdef HAVE_ERROR_INFO_ARGUMENT
RE2::Set::ErrorInfo e;
bool match_failed = !s->set->Match(RSTRING_PTR(str), &v, &e);
bool match_failed = !s->set->Match(
re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v, &e);
VALUE result = rb_ary_new2(v.size());

if (match_failed) {
Expand All @@ -2036,7 +2066,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
#endif
} else {
bool matched = s->set->Match(RSTRING_PTR(str), &v);
bool matched = s->set->Match(
re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v);
VALUE result = rb_ary_new2(v.size());

if (matched) {
Expand Down
6 changes: 6 additions & 0 deletions spec/kernel_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
expect(re).not_to be_case_sensitive
end

it "accepts patterns containing null bytes" do
re = RE2("a\0b")

expect(re.pattern).to eq("a\0b")
end

it "raises an error if given an inappropriate type" do
expect { RE2(nil) }.to raise_error(TypeError)
end
Expand Down
12 changes: 12 additions & 0 deletions spec/re2/match_data_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,12 @@
expect(md.string[md.begin(:foo)..-1]).to eq('foobar')
end

it "returns the offset of the start of a match by something that can be coerced to a String" do
md = RE2::Regexp.new('(?P<foo>fo{2})').match('a foobar')

expect(md.string[md.begin(StringLike.new("foo"))..-1]).to eq('foobar')
end

it "returns the offset despite multibyte characters" do
md = RE2::Regexp.new('(Ruby)').match('I ♥ Ruby')

Expand Down Expand Up @@ -289,6 +295,12 @@
expect(md.string[0...md.end(:foo)]).to eq('a foo')
end

it "returns the offset of a match by something that can be coerced to a String" do
md = RE2::Regexp.new('(?P<foo>fo{2})').match('a foobar')

expect(md.string[0...md.end(StringLike.new("foo"))]).to eq('a foo')
end

it "returns the offset despite multibyte characters" do
md = RE2::Regexp.new('(Ruby)').match('I ♥ Ruby')

Expand Down
46 changes: 46 additions & 0 deletions spec/re2/regexp_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
expect(re).to be_a(RE2::Regexp)
end

it "accepts patterns containing null bytes" do
re = RE2::Regexp.new("a\0b")

expect(re.pattern).to eq("a\0b")
end

it "raises an error if given an inappropriate type" do
expect { RE2::Regexp.new(nil) }.to raise_error(TypeError)
end
Expand Down Expand Up @@ -41,6 +47,12 @@
expect(re).to be_a(RE2::Regexp)
end

it "accepts patterns containing null bytes" do
re = RE2::Regexp.compile("a\0b")

expect(re.pattern).to eq("a\0b")
end

it "raises an error if given an inappropriate type" do
expect { RE2::Regexp.compile(nil) }.to raise_error(TypeError)
end
Expand Down Expand Up @@ -339,6 +351,12 @@
expect(re.match("My name is Alice Bloggs")).to eq(true)
end

it "supports matching against text containing null bytes" do
re = RE2::Regexp.new("a\0b")

expect(re.match("a\0b")).to eq(true)
end

it "returns nil if the text does not match the pattern" do
re = RE2::Regexp.new('My name is (\w+) (\w+)')

Expand Down Expand Up @@ -511,6 +529,13 @@
expect(md[3]).to eq("three")
end

it "supports extracting submatches containing null bytes" do
re = RE2::Regexp.new("(a\0b)")
md = re.match("a\0bc")

expect(md[1]).to eq("a\0b")
end

it "extracts a specific number of submatches", :aggregate_failures do
re = RE2::Regexp.new('(\w+) (\w+) (\w+)')
md = re.match("one two three", submatches: 2)
Expand Down Expand Up @@ -599,6 +624,13 @@
expect(re.partial_match?("My age is 99")).to eq(false)
end

it "supports matching against text containing null bytes", :aggregate_failures do
re = RE2::Regexp.new("a\0b")

expect(re.partial_match?("a\0b")).to eq(true)
expect(re.partial_match?("ab")).to eq(false)
end

it "returns false if the pattern is invalid" do
re = RE2::Regexp.new('???', log_errors: false)

Expand All @@ -620,6 +652,13 @@
expect(re =~ "My age is 99").to eq(false)
end

it "supports matching against text containing null bytes", :aggregate_failures do
re = RE2::Regexp.new("a\0b")

expect(re =~ "a\0b").to eq(true)
expect(re =~ "ab").to eq(false)
end

it "returns false if the pattern is invalid" do
re = RE2::Regexp.new('???', log_errors: false)

Expand Down Expand Up @@ -662,6 +701,13 @@
expect(re.full_match?("My name is Alice Bloggs and I am 99")).to eq(false)
end

it "supports matching against text containing null bytes", :aggregate_failures do
re = RE2::Regexp.new("a\0b")

expect(re.full_match?("a\0b")).to eq(true)
expect(re.full_match?("a\0bc")).to eq(false)
end

it "returns false if the pattern is invalid" do
re = RE2::Regexp.new('???', log_errors: false)

Expand Down
Loading

0 comments on commit deeea04

Please sign in to comment.