Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions src/internal/decl/encoding-decl.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@ static
r_obj* attrib_encode_utf8(r_obj* x);

static inline
r_obj* str_encode_utf8(r_obj* x);

static inline
bool str_needs_encoding(r_obj* x);
bool str_is_ascii_or_utf8(r_obj* x);

static inline
bool str_is_ascii_or_utf8(r_obj* x);
r_obj* str_as_utf8(r_obj* x);
40 changes: 14 additions & 26 deletions src/internal/encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ r_obj* chr_encode_utf8(r_obj* x) {
for (r_ssize i = start; i < size; ++i) {
r_obj* const elt = p_x[i];

if (str_needs_encoding(elt)) {
r_chr_poke(x, i, str_encode_utf8(elt));
if (!str_is_ascii_or_utf8(elt)) {
r_chr_poke(x, i, str_as_utf8(elt));
}
}

Expand All @@ -78,7 +78,7 @@ r_ssize chr_find_encoding_start(r_obj* x, r_ssize size) {
for (r_ssize i = 0; i < size; ++i) {
r_obj* const elt = p_x[i];

if (str_needs_encoding(elt)) {
if (!str_is_ascii_or_utf8(elt)) {
return i;
}
}
Expand Down Expand Up @@ -179,33 +179,21 @@ r_obj* attrib_encode_utf8(r_obj* x) {

// -----------------------------------------------------------------------------

static inline
r_obj* str_encode_utf8(r_obj* x) {
return r_str(Rf_translateCharUTF8(x));
}

static inline
bool str_needs_encoding(r_obj* x) {
return (!str_is_ascii_or_utf8(x)) && (x != NA_STRING);
}

#if (R_VERSION < R_Version(4, 5, 0))

#define MASK_ASCII 8
#define MASK_UTF8 64
// The first 128 values are ASCII, and are the same regardless of the encoding.
// Otherwise we enforce UTF-8.
// String encoding normalization
// From https://github.com/r-lib/vctrs/pull/2085
static inline
bool str_is_ascii_or_utf8(r_obj* x) {
#if (R_VERSION >= R_Version(4, 5, 0))
return Rf_charIsASCII(x) || (Rf_getCharCE(x) == CE_UTF8) || (x == r_globals.na_str);
#else
const int mask_ascii = 8;
const int mask_utf8 = 64;
const int levels = LEVELS(x);
return (levels & MASK_ASCII) || (levels & MASK_UTF8);
return (levels & mask_ascii) || (levels & mask_utf8) || (x == r_globals.na_str);
#endif
}

#else

static inline
bool str_is_ascii_or_utf8(r_obj* x) {
return Rf_charIsUTF8(x);
r_obj* str_as_utf8(r_obj* x) {
return Rf_mkCharCE(Rf_translateCharUTF8(x), CE_UTF8);
}

#endif
7 changes: 7 additions & 0 deletions tests/testthat/_snaps/c-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,10 @@
Output
<simpleError in r_obj_encode_utf8(c(enc, bytes)): translating strings with "bytes" encoding is not allowed>

---

Code
(expect_error(r_obj_encode_utf8(c(enc, bytes))))
Output
<simpleError in r_obj_encode_utf8(c(enc, bytes)): translating strings with "bytes" encoding is not allowed>

12 changes: 2 additions & 10 deletions tests/testthat/helper-c-api.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,10 @@ test_encodings <- function() {
string <- "\u00B0C"

utf8 <- iconv(string, from = Encoding(string), to = "UTF-8")
unknown <- iconv(string, from = Encoding(string), to = "", mark = FALSE)
latin1 <- iconv(string, from = Encoding(string), to = "latin1")

# We used to be able to detect unknown encodings via `LEVELS()`. In recent
# versions of R we need to use `Rf_charIsUTF8` instead to be conformant to the
# public API but unfortunately it treats unknown encodings as UTF-8. So we
# no longer support this case:
#
# ```
# unknown <- iconv(string, from = Encoding(string), to = "", mark = FALSE)
# ```

list(utf8 = utf8, latin1 = latin1)
list(utf8 = utf8, unknown = unknown, latin1 = latin1)
}
expect_utf8_encoded <- function(object) {
expect_identical(Encoding(object), rep("UTF-8", length(object)))
Expand Down