Skip to content

Commit 0c6da00

Browse files
authored
Use string encoding utilities from vctrs (#1849)
1 parent 2fbb760 commit 0c6da00

File tree

4 files changed

+25
-41
lines changed

4 files changed

+25
-41
lines changed

src/internal/decl/encoding-decl.h

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,7 @@ static
1414
r_obj* attrib_encode_utf8(r_obj* x);
1515

1616
static inline
17-
r_obj* str_encode_utf8(r_obj* x);
18-
19-
static inline
20-
bool str_needs_encoding(r_obj* x);
17+
bool str_is_ascii_or_utf8(r_obj* x);
2118

2219
static inline
23-
bool str_is_ascii_or_utf8(r_obj* x);
20+
r_obj* str_as_utf8(r_obj* x);

src/internal/encoding.c

Lines changed: 14 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -61,8 +61,8 @@ r_obj* chr_encode_utf8(r_obj* x) {
6161
for (r_ssize i = start; i < size; ++i) {
6262
r_obj* const elt = p_x[i];
6363

64-
if (str_needs_encoding(elt)) {
65-
r_chr_poke(x, i, str_encode_utf8(elt));
64+
if (!str_is_ascii_or_utf8(elt)) {
65+
r_chr_poke(x, i, str_as_utf8(elt));
6666
}
6767
}
6868

@@ -78,7 +78,7 @@ r_ssize chr_find_encoding_start(r_obj* x, r_ssize size) {
7878
for (r_ssize i = 0; i < size; ++i) {
7979
r_obj* const elt = p_x[i];
8080

81-
if (str_needs_encoding(elt)) {
81+
if (!str_is_ascii_or_utf8(elt)) {
8282
return i;
8383
}
8484
}
@@ -179,33 +179,21 @@ r_obj* attrib_encode_utf8(r_obj* x) {
179179

180180
// -----------------------------------------------------------------------------
181181

182-
static inline
183-
r_obj* str_encode_utf8(r_obj* x) {
184-
return r_str(Rf_translateCharUTF8(x));
185-
}
186-
187-
static inline
188-
bool str_needs_encoding(r_obj* x) {
189-
return (!str_is_ascii_or_utf8(x)) && (x != NA_STRING);
190-
}
191-
192-
#if (R_VERSION < R_Version(4, 5, 0))
193-
194-
#define MASK_ASCII 8
195-
#define MASK_UTF8 64
196-
// The first 128 values are ASCII, and are the same regardless of the encoding.
197-
// Otherwise we enforce UTF-8.
182+
// String encoding normalization
183+
// From https://github.com/r-lib/vctrs/pull/2085
198184
static inline
199185
bool str_is_ascii_or_utf8(r_obj* x) {
186+
#if (R_VERSION >= R_Version(4, 5, 0))
187+
return Rf_charIsASCII(x) || (Rf_getCharCE(x) == CE_UTF8) || (x == r_globals.na_str);
188+
#else
189+
const int mask_ascii = 8;
190+
const int mask_utf8 = 64;
200191
const int levels = LEVELS(x);
201-
return (levels & MASK_ASCII) || (levels & MASK_UTF8);
192+
return (levels & mask_ascii) || (levels & mask_utf8) || (x == r_globals.na_str);
193+
#endif
202194
}
203195

204-
#else
205-
206196
static inline
207-
bool str_is_ascii_or_utf8(r_obj* x) {
208-
return Rf_charIsUTF8(x);
197+
r_obj* str_as_utf8(r_obj* x) {
198+
return Rf_mkCharCE(Rf_translateCharUTF8(x), CE_UTF8);
209199
}
210-
211-
#endif

tests/testthat/_snaps/c-api.md

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,3 +41,10 @@
4141
Output
4242
<simpleError in r_obj_encode_utf8(c(enc, bytes)): translating strings with "bytes" encoding is not allowed>
4343

44+
---
45+
46+
Code
47+
(expect_error(r_obj_encode_utf8(c(enc, bytes))))
48+
Output
49+
<simpleError in r_obj_encode_utf8(c(enc, bytes)): translating strings with "bytes" encoding is not allowed>
50+

tests/testthat/helper-c-api.R

Lines changed: 2 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -23,18 +23,10 @@ test_encodings <- function() {
2323
string <- "\u00B0C"
2424

2525
utf8 <- iconv(string, from = Encoding(string), to = "UTF-8")
26+
unknown <- iconv(string, from = Encoding(string), to = "", mark = FALSE)
2627
latin1 <- iconv(string, from = Encoding(string), to = "latin1")
2728

28-
# We used to be able to detect unknown encodings via `LEVELS()`. In recent
29-
# versions of R we need to use `Rf_charIsUTF8` instead to be conformant to the
30-
# public API but unfortunately it treats unknown encodings as UTF-8. So we
31-
# no longer support this case:
32-
#
33-
# ```
34-
# unknown <- iconv(string, from = Encoding(string), to = "", mark = FALSE)
35-
# ```
36-
37-
list(utf8 = utf8, latin1 = latin1)
29+
list(utf8 = utf8, unknown = unknown, latin1 = latin1)
3830
}
3931
expect_utf8_encoded <- function(object) {
4032
expect_identical(Encoding(object), rep("UTF-8", length(object)))

0 commit comments

Comments
 (0)