Skip to content

Commit

Permalink
Return UTF-8 strings when replacing with a string
Browse files Browse the repository at this point in the history
When replacing with a string pattern, we will implicitly create an RE2
pattern from the string with the default options meaning it will assume
and produce UTF-8 results.

This could potentially be a breaking change for users who rely on the
string pattern's encoding but the behaviour has been misleading (and
passing anything except ISO-8859-1 or UTF-8 to RE2 is undefined).
  • Loading branch information
mudge committed Sep 15, 2023
1 parent 355d7fb commit 45f3ddb
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 23 deletions.
45 changes: 32 additions & 13 deletions ext/re2/re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,9 @@ using std::vector;
rb_enc_associate_index(_string, _enc); \
_string; \
})
#define ENCODED_STR_NEW2(str, length, str2) \
({ \
VALUE _string = rb_str_new(str, length); \
int _enc = rb_enc_get_index(str2); \
rb_enc_associate_index(_string, _enc); \
_string; \
})
#else
#define ENCODED_STR_NEW(str, length, encoding) \
rb_str_new((const char *)str, (long)length)
#define ENCODED_STR_NEW2(str, length, str2) \
rb_str_new((const char *)str, (long)length)
#endif

#ifdef HAVE_RB_STR_SUBLEN
Expand Down Expand Up @@ -284,6 +275,10 @@ static VALUE re2_scanner_rewind(VALUE self) {
* Scan the given text incrementally for matches, returning an array of
* matches on each subsequent call. Returns nil if no matches are found.
*
* Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
* returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
* RE2::Regexp is set to false (any other encoding's behaviour is undefined).
*
* @return [Array<String>] the matches.
* @example
* s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
Expand Down Expand Up @@ -503,6 +498,10 @@ static VALUE re2_regexp_allocate(VALUE klass) {
/*
* Returns the array of matches.
*
* Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
* returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
* RE2::Regexp is set to false (any other encoding's behaviour is undefined).
*
* @return [Array<String, nil>] the array of matches
* @example
* m = RE2::Regexp.new('(\d+)').match("bob 123")
Expand Down Expand Up @@ -578,6 +577,10 @@ static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
/*
* Retrieve zero, one or more matches by index or name.
*
* Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
* returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
* RE2::Regexp is set to false (any other encoding's behaviour is undefined).
*
* @return [Array<String, nil>, String, Boolean]
*
* @overload [](index)
Expand Down Expand Up @@ -689,6 +692,10 @@ static VALUE re2_matchdata_inspect(VALUE self) {
/*
* Returns the array of submatches for pattern matching.
*
* Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
* returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
* RE2::Regexp is set to false (any other encoding's behaviour is undefined).
*
* @return [Array<String, nil>] the array of submatches
* @example
* m = RE2::Regexp.new('(\d+)').match("bob 123")
Expand Down Expand Up @@ -734,6 +741,10 @@ static VALUE re2_matchdata_deconstruct(VALUE self) {
* more keys than there are capturing groups. Given keys will populate the hash in
* order but an invalid name will cause the hash to be immediately returned.
*
* Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
* returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
* RE2::Regexp is set to false (any other encoding's behaviour is undefined).
*
* @return [Hash] a hash of capturing group names to submatches
* @param [Array<Symbol>, nil] keys an array of Symbol capturing group names or nil to return all names
* @example
Expand Down Expand Up @@ -1389,6 +1400,10 @@ static VALUE re2_regexp_scan(VALUE self, VALUE text) {
* Returns a copy of +str+ with the first occurrence +pattern+
* replaced with +rewrite+.
*
* Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
* returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
* RE2::Regexp is set to false (any other encoding's behaviour is undefined).
*
* @param [String] str the string to modify
* @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
* @param [String] rewrite the string to replace with
Expand Down Expand Up @@ -1419,15 +1434,19 @@ static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern,
RE2::Replace(&str_as_string, StringValuePtr(pattern),
StringValuePtr(rewrite));

return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
pattern);
return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
"UTF-8");
}

}

/*
* Return a copy of +str+ with +pattern+ replaced by +rewrite+.
*
* Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
* returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
* RE2::Regexp is set to false (any other encoding's behaviour is undefined).
*
* @param [String] str the string to modify
* @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
* @param [String] rewrite the string to replace with
Expand Down Expand Up @@ -1458,8 +1477,8 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern),
StringValuePtr(rewrite));

return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
pattern);
return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
"UTF-8");
}
}

Expand Down
20 changes: 10 additions & 10 deletions spec/re2_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,23 @@
end

it "returns UTF-8 strings if the pattern is UTF-8" do
original = "Foo".encode("Shift_JIS")
original = "Foo".encode("ISO-8859-1")
replacement = RE2.Replace(original, "oo", "ah")

expect(replacement.encoding).to eq(Encoding::UTF_8)
end

it "returns ISO-8859-1 strings if the pattern is not UTF-8" do
original = "Foo".encode("Shift_JIS")
original = "Foo"
replacement = RE2.Replace(original, RE2("oo", :utf8 => false), "ah")

expect(replacement.encoding).to eq(Encoding::ISO_8859_1)
end

it "returns strings in the encoding of the given String pattern" do
replacement = RE2.Replace("Foo", "oo".encode("Shift_JIS"), "ah")
it "returns UTF-8 strings when given a String pattern" do
replacement = RE2.Replace("Foo", "oo".encode("ISO-8859-1"), "ah")

expect(replacement.encoding).to eq(Encoding::Shift_JIS)
expect(replacement.encoding).to eq(Encoding::UTF_8)
end

it "raises a Type Error for input that can't be converted to String" do
Expand Down Expand Up @@ -121,23 +121,23 @@
end

it "returns UTF-8 strings if the pattern is UTF-8" do
original = "Foo".encode("Shift_JIS")
original = "Foo".encode("ISO-8859-1")
replacement = RE2.GlobalReplace(original, "oo", "ah")

expect(replacement.encoding).to eq(Encoding::UTF_8)
end

it "returns ISO-8859-1 strings if the pattern is not UTF-8" do
original = "Foo".encode("Shift_JIS")
original = "Foo"
replacement = RE2.GlobalReplace(original, RE2("oo", :utf8 => false), "ah")

expect(replacement.encoding).to eq(Encoding::ISO_8859_1)
end

it "returns strings in the encoding of the given String pattern" do
replacement = RE2.GlobalReplace("Foo", "oo".encode("Shift_JIS"), "ah")
it "returns UTF-8 strings when given a String pattern" do
replacement = RE2.GlobalReplace("Foo", "oo".encode("ISO-8859-1"), "ah")

expect(replacement.encoding).to eq(Encoding::Shift_JIS)
expect(replacement.encoding).to eq(Encoding::UTF_8)
end

it "raises a Type Error for input that can't be converted to String" do
Expand Down

0 comments on commit 45f3ddb

Please sign in to comment.