From 7a63e347da47eceee1a3e5af17499c91f58a64c2 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Fri, 24 Oct 2025 13:54:43 -0400 Subject: [PATCH] Use string encoding utilities from vctrs --- src/internal/decl/encoding-decl.h | 7 ++---- src/internal/encoding.c | 40 +++++++++++-------------------- tests/testthat/_snaps/c-api.md | 7 ++++++ tests/testthat/helper-c-api.R | 12 ++-------- 4 files changed, 25 insertions(+), 41 deletions(-) diff --git a/src/internal/decl/encoding-decl.h b/src/internal/decl/encoding-decl.h index c90577b284..180287e0c4 100644 --- a/src/internal/decl/encoding-decl.h +++ b/src/internal/decl/encoding-decl.h @@ -14,10 +14,7 @@ static r_obj* attrib_encode_utf8(r_obj* x); static inline -r_obj* str_encode_utf8(r_obj* x); - -static inline -bool str_needs_encoding(r_obj* x); +bool str_is_ascii_or_utf8(r_obj* x); static inline -bool str_is_ascii_or_utf8(r_obj* x); +r_obj* str_as_utf8(r_obj* x); diff --git a/src/internal/encoding.c b/src/internal/encoding.c index 022a7034a7..308a85ad3b 100644 --- a/src/internal/encoding.c +++ b/src/internal/encoding.c @@ -61,8 +61,8 @@ r_obj* chr_encode_utf8(r_obj* x) { for (r_ssize i = start; i < size; ++i) { r_obj* const elt = p_x[i]; - if (str_needs_encoding(elt)) { - r_chr_poke(x, i, str_encode_utf8(elt)); + if (!str_is_ascii_or_utf8(elt)) { + r_chr_poke(x, i, str_as_utf8(elt)); } } @@ -78,7 +78,7 @@ r_ssize chr_find_encoding_start(r_obj* x, r_ssize size) { for (r_ssize i = 0; i < size; ++i) { r_obj* const elt = p_x[i]; - if (str_needs_encoding(elt)) { + if (!str_is_ascii_or_utf8(elt)) { return i; } } @@ -179,33 +179,21 @@ r_obj* attrib_encode_utf8(r_obj* x) { // ----------------------------------------------------------------------------- -static inline -r_obj* str_encode_utf8(r_obj* x) { - return r_str(Rf_translateCharUTF8(x)); -} - -static inline -bool str_needs_encoding(r_obj* x) { - return (!str_is_ascii_or_utf8(x)) && (x != NA_STRING); -} - -#if (R_VERSION < R_Version(4, 5, 0)) - -#define MASK_ASCII 8 -#define MASK_UTF8 64 -// The first 128 values are ASCII, and are the same regardless of the encoding. -// Otherwise we enforce UTF-8. +// String encoding normalization +// From https://github.com/r-lib/vctrs/pull/2085 static inline bool str_is_ascii_or_utf8(r_obj* x) { +#if (R_VERSION >= R_Version(4, 5, 0)) + return Rf_charIsASCII(x) || (Rf_getCharCE(x) == CE_UTF8) || (x == r_globals.na_str); +#else + const int mask_ascii = 8; + const int mask_utf8 = 64; const int levels = LEVELS(x); - return (levels & MASK_ASCII) || (levels & MASK_UTF8); + return (levels & mask_ascii) || (levels & mask_utf8) || (x == r_globals.na_str); +#endif } -#else - static inline -bool str_is_ascii_or_utf8(r_obj* x) { - return Rf_charIsUTF8(x); +r_obj* str_as_utf8(r_obj* x) { + return Rf_mkCharCE(Rf_translateCharUTF8(x), CE_UTF8); } - -#endif diff --git a/tests/testthat/_snaps/c-api.md b/tests/testthat/_snaps/c-api.md index 1f6e19e271..3f2a6e7e00 100644 --- a/tests/testthat/_snaps/c-api.md +++ b/tests/testthat/_snaps/c-api.md @@ -41,3 +41,10 @@ Output +--- + + Code + (expect_error(r_obj_encode_utf8(c(enc, bytes)))) + Output + + diff --git a/tests/testthat/helper-c-api.R b/tests/testthat/helper-c-api.R index 01cc26a0fa..11945d70e1 100644 --- a/tests/testthat/helper-c-api.R +++ b/tests/testthat/helper-c-api.R @@ -23,18 +23,10 @@ test_encodings <- function() { string <- "\u00B0C" utf8 <- iconv(string, from = Encoding(string), to = "UTF-8") + unknown <- iconv(string, from = Encoding(string), to = "", mark = FALSE) latin1 <- iconv(string, from = Encoding(string), to = "latin1") - # We used to be able to detect unknown encodings via `LEVELS()`. In recent - # versions of R we need to use `Rf_charIsUTF8` instead to be conformant to the - # public API but unfortunately it treats unknown encodings as UTF-8. So we - # no longer support this case: - # - # ``` - # unknown <- iconv(string, from = Encoding(string), to = "", mark = FALSE) - # ``` - - list(utf8 = utf8, latin1 = latin1) + list(utf8 = utf8, unknown = unknown, latin1 = latin1) } expect_utf8_encoded <- function(object) { expect_identical(Encoding(object), rep("UTF-8", length(object)))