Skip to content

Commit

Permalink
Support null bytes in string inputs
Browse files Browse the repository at this point in the history
GitHub: #130

Ensure that whenever we pass Ruby string data into RE2 we use the data's
explicit length as returned by RSTRING_LEN rather than relying on
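null-termination. RSTRING_PTR doesn't guarantee a null-terminated buffer (see
https://docs.ruby-lang.org/en/3.3/extension_rdoc.html#label-VALUE+type+conversion),
and a Ruby string may itself contain null bytes, so relying on null-termination
can either truncate the input or, worse, over-read past the end of the buffer.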
  • Loading branch information
mudge committed Jan 20, 2024
1 parent abc1a98 commit a8cc240
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 30 deletions.
83 changes: 53 additions & 30 deletions ext/re2/re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,8 @@ static VALUE re2_scanner_rewind(VALUE self) {
TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);

delete c->input;
c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(c->text));
c->input = new(std::nothrow) re2::StringPiece(
RSTRING_PTR(c->text), RSTRING_LEN(c->text));
c->eof = false;

return self;
Expand Down Expand Up @@ -426,9 +427,11 @@ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
if (FIXNUM_P(idx)) {
id = FIX2INT(idx);
} else {
const char *name = SYMBOL_P(idx) ? rb_id2name(SYM2ID(idx)) : StringValuePtr(idx);
const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
std::map<std::string, int>::const_iterator search = groups.find(name);
std::map<std::string, int>::const_iterator search = groups.find(
SYMBOL_P(idx) ?
rb_id2name(SYM2ID(idx)) :
std::string(StringValuePtr(idx), RSTRING_LEN(idx)));

if (search != groups.end()) {
id = search->second;
Expand Down Expand Up @@ -611,7 +614,7 @@ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
}
}

static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self) {
re2_matchdata *m;
re2_pattern *p;

Expand Down Expand Up @@ -678,7 +681,8 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
rb_scan_args(argc, argv, "11", &idx, &rest);

if (TYPE(idx) == T_STRING) {
return re2_matchdata_named_match(RSTRING_PTR(idx), self);
return re2_matchdata_named_match(
std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)), self);
} else if (SYMBOL_P(idx)) {
return re2_matchdata_named_match(rb_id2name(SYM2ID(idx)), self);
} else if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
Expand Down Expand Up @@ -731,7 +735,8 @@ static VALUE re2_matchdata_inspect(const VALUE self) {
if (match == Qnil) {
output << "nil";
} else {
output << "\"" << RSTRING_PTR(match) << "\"";
output << "\"" << re2::StringPiece(RSTRING_PTR(match),
RSTRING_LEN(match)) << "\"";
}
}

Expand Down Expand Up @@ -910,9 +915,11 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
RE2::Options re2_options;
parse_re2_options(&re2_options, options);

p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern), re2_options);
p->pattern = new(std::nothrow) RE2(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), re2_options);
} else {
p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern));
p->pattern = new(std::nothrow) RE2(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)));
}

if (p->pattern == 0) {
Expand Down Expand Up @@ -1501,11 +1508,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {

if (n == 0) {
#ifdef HAVE_ENDPOS_ARGUMENT
bool matched = p->pattern->Match(RSTRING_PTR(text), startpos,
endpos, anchor, 0, 0);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
startpos, endpos, anchor, 0, 0);
#else
bool matched = p->pattern->Match(RSTRING_PTR(text), startpos, anchor,
0, 0);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
startpos, anchor, 0, 0);
#endif
return BOOL2RUBY(matched);
} else {
Expand All @@ -1529,11 +1538,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
m->number_of_matches = n;

#ifdef HAVE_ENDPOS_ARGUMENT
bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
endpos, anchor, m->matches, n);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
startpos, endpos, anchor, m->matches, n);
#else
bool matched = p->pattern->Match(RSTRING_PTR(m->text), startpos,
anchor, m->matches, n);
bool matched = p->pattern->Match(
re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
startpos, anchor, m->matches, n);
#endif
if (matched) {
return matchdata;
Expand All @@ -1559,7 +1570,8 @@ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {

TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);

return BOOL2RUBY(RE2::PartialMatch(RSTRING_PTR(text), *p->pattern));
return BOOL2RUBY(RE2::PartialMatch(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
}

/*
Expand All @@ -1578,7 +1590,8 @@ static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) {

TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);

return BOOL2RUBY(RE2::FullMatch(RSTRING_PTR(text), *p->pattern));
return BOOL2RUBY(RE2::FullMatch(
re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
}

/*
Expand All @@ -1604,7 +1617,8 @@ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
TypedData_Get_Struct(scanner, re2_scanner, &re2_scanner_data_type, c);

c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(text));
c->input = new(std::nothrow) re2::StringPiece(
RSTRING_PTR(text), RSTRING_LEN(text));
RB_OBJ_WRITE(scanner, &c->regexp, self);
RB_OBJ_WRITE(scanner, &c->text, text);

Expand Down Expand Up @@ -1669,20 +1683,23 @@ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
/* Take a copy of str so it can be modified in-place by
* RE2::Replace.
*/
std::string str_as_string(StringValuePtr(str));
std::string str_as_string(StringValuePtr(str), RSTRING_LEN(str));

/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
RE2::Replace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
RE2::Replace(&str_as_string, *p->pattern,
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(),
p->pattern->options().encoding());
} else {
/* Ensure pattern is a string. */
StringValue(pattern);

RE2::Replace(&str_as_string, RSTRING_PTR(pattern), RSTRING_PTR(rewrite));
RE2::Replace(&str_as_string,
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
}
Expand Down Expand Up @@ -1717,21 +1734,23 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
* RE2::GlobalReplace.
*/
re2_pattern *p;
std::string str_as_string(StringValuePtr(str));
std::string str_as_string(StringValuePtr(str), RSTRING_LEN(str));

/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
RE2::GlobalReplace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
RE2::GlobalReplace(&str_as_string, *p->pattern,
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(),
p->pattern->options().encoding());
} else {
/* Ensure pattern is a string. */
StringValue(pattern);

RE2::GlobalReplace(&str_as_string, RSTRING_PTR(pattern),
RSTRING_PTR(rewrite));
RE2::GlobalReplace(&str_as_string,
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));

return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
}
Expand All @@ -1753,7 +1772,8 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
StringValue(unquoted);

std::string quoted_string = RE2::QuoteMeta(RSTRING_PTR(unquoted));
std::string quoted_string = RE2::QuoteMeta(
re2::StringPiece(RSTRING_PTR(unquoted), RSTRING_LEN(unquoted)));

return rb_str_new(quoted_string.data(), quoted_string.size());
}
Expand Down Expand Up @@ -1902,7 +1922,8 @@ static VALUE re2_set_add(VALUE self, VALUE pattern) {

{
std::string err;
index = s->set->Add(RSTRING_PTR(pattern), &err);
index = s->set->Add(
re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), &err);
strlcpy(msg, err.c_str(), sizeof(msg));
}

Expand Down Expand Up @@ -2009,7 +2030,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
if (raise_exception) {
#ifdef HAVE_ERROR_INFO_ARGUMENT
RE2::Set::ErrorInfo e;
bool match_failed = !s->set->Match(RSTRING_PTR(str), &v, &e);
bool match_failed = !s->set->Match(
re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v, &e);
VALUE result = rb_ary_new2(v.size());

if (match_failed) {
Expand All @@ -2036,7 +2058,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
#endif
} else {
bool matched = s->set->Match(RSTRING_PTR(str), &v);
bool matched = s->set->Match(
re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v);
VALUE result = rb_ary_new2(v.size());

if (matched) {
Expand Down
6 changes: 6 additions & 0 deletions spec/kernel_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
expect(re).not_to be_case_sensitive
end

# The pattern "a\0b" has a NUL byte in the middle. Reading it back through
# #pattern must give the same three-byte string, i.e. the pattern must not be
# cut short at the NUL when it is handed to the extension.
it "accepts patterns containing null bytes" do
re = RE2("a\0b")

expect(re.pattern).to eq("a\0b")
end

# A non-string pattern (nil here) is expected to be rejected with TypeError.
it "raises an error if given an inappropriate type" do
expect { RE2(nil) }.to raise_error(TypeError)
end
Expand Down
39 changes: 39 additions & 0 deletions spec/re2/regexp_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
expect(re).to be_a(RE2::Regexp)
end

# Same round-trip check for RE2::Regexp.new: a pattern with an embedded NUL
# byte must come back from #pattern unchanged rather than truncated at the NUL.
it "accepts patterns containing null bytes" do
re = RE2::Regexp.new("a\0b")

expect(re.pattern).to eq("a\0b")
end

# Passing nil instead of a string to RE2::Regexp.new is expected to raise
# TypeError.
it "raises an error if given an inappropriate type" do
expect { RE2::Regexp.new(nil) }.to raise_error(TypeError)
end
Expand Down Expand Up @@ -41,6 +47,12 @@
expect(re).to be_a(RE2::Regexp)
end

# Same round-trip check for RE2::Regexp.compile: the embedded NUL byte in the
# pattern must still be present when the pattern is read back.
it "accepts patterns containing null bytes" do
re = RE2::Regexp.compile("a\0b")

expect(re.pattern).to eq("a\0b")
end

# Passing nil instead of a string to RE2::Regexp.compile is expected to raise
# TypeError.
it "raises an error if given an inappropriate type" do
expect { RE2::Regexp.compile(nil) }.to raise_error(TypeError)
end
Expand Down Expand Up @@ -339,6 +351,12 @@
expect(re.match("My name is Alice Bloggs")).to eq(true)
end

# Both the pattern and the text contain an embedded NUL byte. The pattern has
# no capturing groups, and the expected result of #match here is plain `true`.
it "supports matching against text containing null bytes" do
re = RE2::Regexp.new("a\0b")

expect(re.match("a\0b")).to eq(true)
end

it "returns nil if the text does not match the pattern" do
re = RE2::Regexp.new('My name is (\w+) (\w+)')

Expand Down Expand Up @@ -599,6 +617,13 @@
expect(re.partial_match?("My age is 99")).to eq(false)
end

# Positive case: text identical to the pattern, NUL byte included, matches.
# Negative case: "ab" lacks the NUL. If the pattern had been cut at the NUL
# (leaving just "a") this would presumably match, so expecting false guards
# against truncation of the pattern.
it "supports matching against text containing null bytes", :aggregate_failures do
re = RE2::Regexp.new("a\0b")

expect(re.partial_match?("a\0b")).to eq(true)
expect(re.partial_match?("ab")).to eq(false)
end

it "returns false if the pattern is invalid" do
re = RE2::Regexp.new('???', log_errors: false)

Expand All @@ -620,6 +645,13 @@
expect(re =~ "My age is 99").to eq(false)
end

# Same pair of checks through the =~ operator: text containing the NUL byte
# matches, while "ab" (no NUL) must not, which it presumably would if the
# pattern were truncated at the NUL.
it "supports matching against text containing null bytes", :aggregate_failures do
re = RE2::Regexp.new("a\0b")

expect(re =~ "a\0b").to eq(true)
expect(re =~ "ab").to eq(false)
end

it "returns false if the pattern is invalid" do
re = RE2::Regexp.new('???', log_errors: false)

Expand Down Expand Up @@ -662,6 +694,13 @@
expect(re.full_match?("My name is Alice Bloggs and I am 99")).to eq(false)
end

# Positive case: the whole text equals the pattern, NUL byte included.
# Negative case: the text has an extra trailing "c" after the NUL-containing
# prefix, so a full match is expected to fail. This only holds if the bytes
# after the NUL in the text are actually examined.
it "supports matching against text containing null bytes", :aggregate_failures do
re = RE2::Regexp.new("a\0b")

expect(re.full_match?("a\0b")).to eq(true)
expect(re.full_match?("a\0bc")).to eq(false)
end

it "returns false if the pattern is invalid" do
re = RE2::Regexp.new('???', log_errors: false)

Expand Down
22 changes: 22 additions & 0 deletions spec/re2/scanner_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@
expect(scanner.scan).to be_nil
end

# The pattern is one capturing group: word character, NUL byte, word character.
# The input holds three such runs separated by spaces. Successive #scan calls
# are expected to return each run in order as a one-element array, then nil
# once the input is exhausted, so scanning must continue past every NUL.
it "supports scanning inputs with null bytes", :aggregate_failures do
r = RE2::Regexp.new("(\\w\0\\w)")
scanner = r.scan("a\0b c\0d e\0f")

expect(scanner.scan).to eq(["a\0b"])
expect(scanner.scan).to eq(["c\0d"])
expect(scanner.scan).to eq(["e\0f"])
expect(scanner.scan).to be_nil
end

it "returns UTF-8 matches if the pattern is UTF-8" do
r = RE2::Regexp.new('(\w+)')
scanner = r.scan("It")
Expand Down Expand Up @@ -190,6 +200,18 @@
expect(scanner.to_enum.first).to eq(["1"])
end

# Uses the same word/NUL/word capturing pattern on an input with two matches.
it "supports inputs with null bytes", :aggregate_failures do
r = RE2::Regexp.new("(\\w\0\\w)")
scanner = r.scan("a\0b c\0d")

# Each to_enum.first call takes the next match from the scanner, so the
# second call is expected to yield the second match rather than the first.
expect(scanner.to_enum.first).to eq(["a\0b"])
expect(scanner.to_enum.first).to eq(["c\0d"])

scanner.rewind

# After rewinding, scanning starts again from the first match. Getting
# "a\0b" back requires rewind to restore the whole input, not just the part
# before the first NUL byte.
expect(scanner.to_enum.first).to eq(["a\0b"])
end

it "resets the eof? check", :aggregate_failures do
r = RE2::Regexp.new('(\d)')
scanner = r.scan("1")
Expand Down
8 changes: 8 additions & 0 deletions spec/re2/set_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,14 @@
expect(set.match("def", exception: false)).to be_empty
end

# A set with a single NUL-containing pattern is compiled and then matched
# against identical text. The expected result [0] is presumably the index of
# the only pattern added (verify against RE2::Set#add's return value).
# exception: false is passed so the example does not depend on the underlying
# RE2::Set::Match being able to report error information.
it "supports matching null bytes", :aggregate_failures do
set = RE2::Set.new
set.add("a\0b")
set.compile

expect(set.match("a\0b", exception: false)).to eq([0])
end

it "returns an empty array if there is no match when :exception is true" do
skip "Underlying RE2::Set::Match does not output error information" unless RE2::Set.match_raises_errors?

Expand Down
Loading

0 comments on commit a8cc240

Please sign in to comment.