r-lib · DavisVaughan · Oct 28, 2025 · Oct 24, 2025
diff --git a/src/internal/decl/encoding-decl.h b/src/internal/decl/encoding-decl.h
@@ -14,10 +14,7 @@ static
 r_obj* attrib_encode_utf8(r_obj* x);
 
 static inline
-r_obj* str_encode_utf8(r_obj* x);
-
-static inline
-bool str_needs_encoding(r_obj* x);
+bool str_is_ascii_or_utf8(r_obj* x);
 
 static inline
-bool str_is_ascii_or_utf8(r_obj* x);
+r_obj* str_as_utf8(r_obj* x);
diff --git a/src/internal/encoding.c b/src/internal/encoding.c
@@ -61,8 +61,8 @@ r_obj* chr_encode_utf8(r_obj* x) {
   for (r_ssize i = start; i < size; ++i) {
     r_obj* const elt = p_x[i];
 
-    if (str_needs_encoding(elt)) {
-      r_chr_poke(x, i, str_encode_utf8(elt));
+    if (!str_is_ascii_or_utf8(elt)) {
+      r_chr_poke(x, i, str_as_utf8(elt));
     }
   }
 
@@ -78,7 +78,7 @@ r_ssize chr_find_encoding_start(r_obj* x, r_ssize size) {
   for (r_ssize i = 0; i < size; ++i) {
     r_obj* const elt = p_x[i];
 
-    if (str_needs_encoding(elt)) {
+    if (!str_is_ascii_or_utf8(elt)) {
       return i;
     }
   }
@@ -179,33 +179,21 @@ r_obj* attrib_encode_utf8(r_obj* x) {
 
 // -----------------------------------------------------------------------------
 
-static inline
-r_obj* str_encode_utf8(r_obj* x) {
-  return r_str(Rf_translateCharUTF8(x));
-}
-
-static inline
-bool str_needs_encoding(r_obj* x) {
-  return (!str_is_ascii_or_utf8(x)) && (x != NA_STRING);
-}
-
-#if (R_VERSION < R_Version(4, 5, 0))
-
-#define MASK_ASCII 8
-#define MASK_UTF8 64
-// The first 128 values are ASCII, and are the same regardless of the encoding.
-// Otherwise we enforce UTF-8.
+// String encoding normalization (from https://github.com/r-lib/vctrs/pull/2085).
+// True when `x` is ASCII, UTF-8 marked, or NA, i.e. it needs no translation.
 static inline
 bool str_is_ascii_or_utf8(r_obj* x) {
+#if (R_VERSION >= R_Version(4, 5, 0))
+  return Rf_charIsASCII(x) || (Rf_getCharCE(x) == CE_UTF8) || (x == r_globals.na_str);
+#else // Older R: inspect the CHARSXP encoding bits through LEVELS()
+  const int mask_ascii = 64; // R's ASCII_MASK, (1 << 6)
+  const int mask_utf8 = 8; // R's UTF8_MASK, (1 << 3)
   const int levels = LEVELS(x);
-  return (levels & MASK_ASCII) || (levels & MASK_UTF8);
+  return (levels & mask_ascii) || (levels & mask_utf8) || (x == r_globals.na_str);
+#endif
 }
 
-#else
-
 static inline
-bool str_is_ascii_or_utf8(r_obj* x) {
-  return Rf_charIsUTF8(x);
+r_obj* str_as_utf8(r_obj* x) { // Translates `x` to UTF-8 and returns it as a CE_UTF8 marked string
+  return Rf_mkCharCE(Rf_translateCharUTF8(x), CE_UTF8);
 }
-
-#endif
diff --git a/tests/testthat/_snaps/c-api.md b/tests/testthat/_snaps/c-api.md
@@ -41,3 +41,10 @@
     Output
       <simpleError in r_obj_encode_utf8(c(enc, bytes)): translating strings with "bytes" encoding is not allowed>
 
+---
+
+    Code
+      (expect_error(r_obj_encode_utf8(c(enc, bytes))))
+    Output
+      <simpleError in r_obj_encode_utf8(c(enc, bytes)): translating strings with "bytes" encoding is not allowed>
+
diff --git a/tests/testthat/helper-c-api.R b/tests/testthat/helper-c-api.R
@@ -23,18 +23,10 @@ test_encodings <- function() {
   string <- "\u00B0C"
 
   utf8 <- iconv(string, from = Encoding(string), to = "UTF-8")
+  unknown <- iconv(string, from = Encoding(string), to = "", mark = FALSE)
   latin1 <- iconv(string, from = Encoding(string), to = "latin1")
 
-  # We used to be able to detect unknown encodings via `LEVELS()`. In recent
-  # versions of R we need to use `Rf_charIsUTF8` instead to be conformant to the
-  # public API but unfortunately it treats unknown encodings as UTF-8. So we
-  # no longer support this case:
-  #
-  # ```
-  # unknown <- iconv(string, from = Encoding(string), to = "", mark = FALSE)
-  # ```
-
-  list(utf8 = utf8, latin1 = latin1)
+  list(utf8 = utf8, unknown = unknown, latin1 = latin1)
 }
 expect_utf8_encoded <- function(object) {
   expect_identical(Encoding(object), rep("UTF-8", length(object)))