Skip to content

Commit

Permalink
Return UTF-8 strings when replacing with a string
Browse files Browse the repository at this point in the history
When replacing with a string pattern, we implicitly create an RE2
pattern from the string with the default options, meaning it will assume
UTF-8 input and produce UTF-8 results.

This could potentially be a breaking change for users who rely on the
string pattern's encoding, but the previous behaviour was misleading (and
passing anything except ISO-8859-1 or UTF-8 to RE2 is undefined behaviour).
  • Loading branch information
mudge committed Sep 15, 2023
1 parent 029e019 commit 291bca1
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 17 deletions.
17 changes: 4 additions & 13 deletions ext/re2/re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,9 @@ using std::vector;
rb_enc_associate_index(_string, _enc); \
_string; \
})
#define ENCODED_STR_NEW2(str, length, str2) \
({ \
VALUE _string = rb_str_new(str, length); \
int _enc = rb_enc_get_index(str2); \
rb_enc_associate_index(_string, _enc); \
_string; \
})
#else
#define ENCODED_STR_NEW(str, length, encoding) \
rb_str_new((const char *)str, (long)length)
#define ENCODED_STR_NEW2(str, length, str2) \
rb_str_new((const char *)str, (long)length)
#endif

#ifdef HAVE_RB_STR_SUBLEN
Expand Down Expand Up @@ -1419,8 +1410,8 @@ static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern,
RE2::Replace(&str_as_string, StringValuePtr(pattern),
StringValuePtr(rewrite));

return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
pattern);
return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
"UTF-8");
}

}
Expand Down Expand Up @@ -1458,8 +1449,8 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern),
StringValuePtr(rewrite));

return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
pattern);
return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
"UTF-8");
}
}

Expand Down
8 changes: 4 additions & 4 deletions spec/re2_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@
expect(replacement.encoding).to eq(Encoding::ISO_8859_1)
end

it "returns strings in the encoding of the given String pattern" do
it "returns UTF-8 strings when given a String pattern" do
replacement = RE2.Replace("Foo", "oo".encode("Shift_JIS"), "ah")

expect(replacement.encoding).to eq(Encoding::Shift_JIS)
expect(replacement.encoding).to eq(Encoding::UTF_8)
end

it "raises a Type Error for input that can't be converted to String" do
Expand Down Expand Up @@ -134,10 +134,10 @@
expect(replacement.encoding).to eq(Encoding::ISO_8859_1)
end

it "returns strings in the encoding of the given String pattern" do
it "returns UTF-8 strings when given a String pattern" do
replacement = RE2.GlobalReplace("Foo", "oo".encode("Shift_JIS"), "ah")

expect(replacement.encoding).to eq(Encoding::Shift_JIS)
expect(replacement.encoding).to eq(Encoding::UTF_8)
end

it "raises a Type Error for input that can't be converted to String" do
Expand Down

0 comments on commit 291bca1

Please sign in to comment.