Skip to content

Commit 361a904

Browse files
authored
Add chr_transform argument to vec_order_radix() (#1334)
* Implement C side of `string_key` support * Implement R side of `string_key` support, with documentation * Add `string_key` tests * Reorder loop to make reprotection easier * Use a protection index to avoid conditional protection * Tweak string-key error messages * Rename `string_key` to `chr_transform` * Use `proxy_chr_transform()` to apply character transforms up front * Update documentation regarding the name change * Mention common transformation functions in `chr_transform` param section
1 parent 6b04f16 commit 361a904

File tree

8 files changed

+254
-33
lines changed

8 files changed

+254
-33
lines changed

R/order-radix.R

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,16 @@
1919
#' unless `base::order(method = "radix")` is explicitly set, which also uses
2020
#' the C-locale. While sorting with the C-locale can be useful for
2121
#' algorithmic efficiency, in many real world uses it can be the cause of
22-
#' data analysis mistakes.
22+
#' data analysis mistakes. To balance these trade-offs, you can supply a
23+
#' `chr_transform` to transform character vectors into an alternative
24+
#' representation that orders in the C-locale in a less surprising way. For
25+
#' example, providing [base::tolower()] as a transform will order the original
26+
#' vector in a case-insensitive manner. Locale-aware ordering can be achieved
27+
#' by providing `stringi::stri_sort_key()` as a transform, setting the
28+
#' collation options as appropriate for your locale.
2329
#'
24-
#' Character vectors are always translated to UTF-8 before ordering.
30+
#' Character vectors are always translated to UTF-8 before ordering, and before
31+
#' any transform is applied by `chr_transform`.
2532
#'
2633
#' @param x A vector
2734
#' @param direction Direction to sort in.
@@ -36,6 +43,19 @@
3643
#' - For data frames, a length `1` or `ncol(x)` character vector containing
3744
#' only `"largest"` or `"smallest"`, specifying how `NA`s should be treated
3845
#' in each column.
46+
#' @param chr_transform Transformation of character vectors for sorting in
47+
#' alternate locales.
48+
#' - If `NULL`, no transformation is done.
49+
#' - Otherwise, this must be a function of one argument. The function will be
50+
#' invoked with `x`, if it is a character vector, after it has been
51+
#' translated to UTF-8, and should return a character vector with the same
52+
#' length as `x`, also encoded as UTF-8.
53+
#' - For data frames, `chr_transform` will be applied to all character
54+
#' columns.
55+
#'
56+
#' Common transformation functions include: `tolower()` for case-insensitive
57+
#' ordering and `stringi::str_sort_key()` for locale-aware ordering. See the
58+
#' Details section for more information.
3959
#' @return
4060
#' * `vec_order()` an integer vector the same size as `x`.
4161
#' * `vec_sort()` a vector with the same size and type as `x`.
@@ -67,9 +87,21 @@
6787
#' direction = c("desc", "asc"),
6888
#' na_value = c("largest", "smallest")
6989
#' )
90+
#'
91+
#' # Character vectors are ordered in the C locale, which orders capital letters
92+
#' # below lowercase ones
93+
#' y <- c("B", "A", "a")
94+
#' vec_sort(y)
95+
#'
96+
#' # To order in a case-insensitive manner, provide a `chr_transform` that
97+
#' # transforms the strings to all lowercase
98+
#' vec_sort(y, chr_transform = tolower)
7099
#' @noRd
71-
vec_order_radix <- function(x, direction = "asc", na_value = "largest") {
72-
.Call(vctrs_order, x, direction, na_value)
100+
vec_order_radix <- function(x,
101+
direction = "asc",
102+
na_value = "largest",
103+
chr_transform = NULL) {
104+
.Call(vctrs_order, x, direction, na_value, chr_transform)
73105
}
74106

75107
#' Identify ordered groups
@@ -104,6 +136,9 @@ vec_order_radix <- function(x, direction = "asc", na_value = "largest") {
104136
#'
105137
#' vec_group_loc(df)
106138
#' @noRd
107-
vec_order_locs <- function(x, direction = "asc", na_value = "largest") {
108-
.Call(vctrs_order_locs, x, direction, na_value)
139+
vec_order_locs <- function(x,
140+
direction = "asc",
141+
na_value = "largest",
142+
chr_transform = NULL) {
143+
.Call(vctrs_order_locs, x, direction, na_value, chr_transform)
109144
}

src/init.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,8 @@ extern SEXP vctrs_slice_complete(SEXP);
129129
extern SEXP vctrs_locate_complete(SEXP);
130130
extern SEXP vctrs_detect_complete(SEXP);
131131
extern SEXP vctrs_normalize_encoding(SEXP);
132-
extern SEXP vctrs_order(SEXP, SEXP, SEXP);
133-
extern SEXP vctrs_order_locs(SEXP, SEXP, SEXP);
132+
extern SEXP vctrs_order(SEXP, SEXP, SEXP, SEXP);
133+
extern SEXP vctrs_order_locs(SEXP, SEXP, SEXP, SEXP);
134134
extern SEXP vctrs_unrep(SEXP);
135135
extern SEXP vctrs_fill_missing(SEXP, SEXP, SEXP);
136136
extern SEXP vctrs_chr_paste_prefix(SEXP, SEXP, SEXP);
@@ -280,8 +280,8 @@ static const R_CallMethodDef CallEntries[] = {
280280
{"vctrs_locate_complete", (DL_FUNC) &vctrs_locate_complete, 1},
281281
{"vctrs_detect_complete", (DL_FUNC) &vctrs_detect_complete, 1},
282282
{"vctrs_normalize_encoding", (DL_FUNC) &vctrs_normalize_encoding, 1},
283-
{"vctrs_order", (DL_FUNC) &vctrs_order, 3},
284-
{"vctrs_order_locs", (DL_FUNC) &vctrs_order_locs, 3},
283+
{"vctrs_order", (DL_FUNC) &vctrs_order, 4},
284+
{"vctrs_order_locs", (DL_FUNC) &vctrs_order_locs, 4},
285285
{"vctrs_unrep", (DL_FUNC) &vctrs_unrep, 1},
286286
{"vctrs_fill_missing", (DL_FUNC) &vctrs_fill_missing, 3},
287287
{"vctrs_chr_paste_prefix", (DL_FUNC) &vctrs_chr_paste_prefix, 3},

src/order-radix.c

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "order-groups.h"
2020
#include "order-truelength.h"
2121
#include "order-sortedness.h"
22+
#include "order-transform.h"
2223

2324
// -----------------------------------------------------------------------------
2425

@@ -183,46 +184,50 @@
183184

184185
// -----------------------------------------------------------------------------
185186

186-
static SEXP vec_order(SEXP x, SEXP decreasing, SEXP na_last);
187+
static SEXP vec_order(SEXP x, SEXP decreasing, SEXP na_last, SEXP chr_transform);
187188

188189
// [[ register() ]]
189-
SEXP vctrs_order(SEXP x, SEXP direction, SEXP na_value) {
190+
SEXP vctrs_order(SEXP x, SEXP direction, SEXP na_value, SEXP chr_transform) {
190191
SEXP decreasing = PROTECT(parse_direction(direction));
191192
SEXP na_last = PROTECT(parse_na_value(na_value));
192193

193-
SEXP out = vec_order(x, decreasing, na_last);
194+
SEXP out = vec_order(x, decreasing, na_last, chr_transform);
194195

195196
UNPROTECT(2);
196197
return out;
197198
}
198199

199200

200-
static SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations);
201+
static SEXP vec_order_impl(SEXP x,
202+
SEXP decreasing,
203+
SEXP na_last,
204+
SEXP chr_transform,
205+
bool locations);
201206

202207
static
203-
SEXP vec_order(SEXP x, SEXP decreasing, SEXP na_last) {
204-
return vec_order_impl(x, decreasing, na_last, false);
208+
SEXP vec_order(SEXP x, SEXP decreasing, SEXP na_last, SEXP chr_transform) {
209+
return vec_order_impl(x, decreasing, na_last, chr_transform, false);
205210
}
206211

207212
// -----------------------------------------------------------------------------
208213

209-
static SEXP vec_order_locs(SEXP x, SEXP decreasing, SEXP na_last);
214+
static SEXP vec_order_locs(SEXP x, SEXP decreasing, SEXP na_last, SEXP chr_transform);
210215

211216
// [[ register() ]]
212-
SEXP vctrs_order_locs(SEXP x, SEXP direction, SEXP na_value) {
217+
SEXP vctrs_order_locs(SEXP x, SEXP direction, SEXP na_value, SEXP chr_transform) {
213218
SEXP decreasing = PROTECT(parse_direction(direction));
214219
SEXP na_last = PROTECT(parse_na_value(na_value));
215220

216-
SEXP out = vec_order_locs(x, decreasing, na_last);
221+
SEXP out = vec_order_locs(x, decreasing, na_last, chr_transform);
217222

218223
UNPROTECT(2);
219224
return out;
220225
}
221226

222227

223228
static
224-
SEXP vec_order_locs(SEXP x, SEXP decreasing, SEXP na_last) {
225-
return vec_order_impl(x, decreasing, na_last, true);
229+
SEXP vec_order_locs(SEXP x, SEXP decreasing, SEXP na_last, SEXP chr_transform) {
230+
return vec_order_impl(x, decreasing, na_last, chr_transform, true);
226231
}
227232

228233
// -----------------------------------------------------------------------------
@@ -258,7 +263,7 @@ static void vec_order_switch(SEXP x,
258263
* the locations in `x` corresponding to each key.
259264
*/
260265
static
261-
SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations) {
266+
SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, SEXP chr_transform, bool locations) {
262267
int n_prot = 0;
263268
int* p_n_prot = &n_prot;
264269

@@ -269,6 +274,7 @@ SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations) {
269274

270275
SEXP proxy = PROTECT_N(vec_proxy_order(x), p_n_prot);
271276
proxy = PROTECT_N(vec_normalize_encoding(proxy), p_n_prot);
277+
proxy = PROTECT_N(proxy_chr_transform(proxy, chr_transform), p_n_prot);
272278

273279
r_ssize size = vec_size(proxy);
274280
const enum vctrs_type type = vec_proxy_typeof(proxy);
@@ -3526,7 +3532,14 @@ void df_order_internal(SEXP x,
35263532

35273533
// Iterate over remaining columns by group chunk
35283534
for (r_ssize i = 1; i < n_cols; ++i) {
3529-
col = VECTOR_ELT(x, i);
3535+
// Get the number of group chunks from previous column group info
3536+
struct group_info* p_group_info_pre = groups_current(p_group_infos);
3537+
r_ssize n_groups = p_group_info_pre->n_groups;
3538+
3539+
// If there were no ties, we are completely done
3540+
if (n_groups == size) {
3541+
break;
3542+
}
35303543

35313544
if (!recycle_decreasing) {
35323545
col_decreasing = p_decreasing[i];
@@ -3541,15 +3554,7 @@ void df_order_internal(SEXP x,
35413554
// processed at least one column.
35423555
int* p_o_col = p_order->p_data;
35433556

3544-
// Get the number of group chunks from previous column group info
3545-
struct group_info* p_group_info_pre = groups_current(p_group_infos);
3546-
r_ssize n_groups = p_group_info_pre->n_groups;
3547-
3548-
// If there were no ties, we are completely done
3549-
if (n_groups == size) {
3550-
break;
3551-
}
3552-
3557+
col = VECTOR_ELT(x, i);
35533558
type = vec_proxy_typeof(col);
35543559

35553560
// If we are on the rerun pass, flip this back off so the
@@ -3558,7 +3563,7 @@ void df_order_internal(SEXP x,
35583563
rerun_complex = rerun_complex ? false : true;
35593564
}
35603565

3561-
// Pre sort unique characters once for the whole column
3566+
// Pre-sort unique characters once for the whole column
35623567
if (type == vctrs_type_character) {
35633568
const SEXP* p_col = STRING_PTR_RO(col);
35643569

src/order-transform.c

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
/*
2+
* The implementation of vec_order() is based on data.table’s forder() and their
3+
* earlier contribution to R’s order(). See LICENSE.note for more information.
4+
*
5+
* This Source Code Form is subject to the terms of the Mozilla Public
6+
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
7+
* You can obtain one at https://mozilla.org/MPL/2.0/.
8+
*
9+
* Copyright (c) 2020, RStudio
10+
* Copyright (c) 2020, Data table team
11+
*/
12+
13+
#include "order-transform.h"
14+
#include "utils.h"
15+
16+
// -----------------------------------------------------------------------------
17+
18+
static SEXP chr_apply_transform(SEXP x, SEXP chr_transform);
19+
static SEXP df_apply_transform(SEXP x, SEXP chr_transform);
20+
21+
// [[ include("order-transform.h") ]]
22+
SEXP proxy_chr_transform(SEXP proxy, SEXP chr_transform) {
23+
if (chr_transform == r_null) {
24+
return proxy;
25+
}
26+
27+
chr_transform = PROTECT(r_as_function(chr_transform, "chr_transform"));
28+
29+
SEXP out;
30+
31+
switch (vec_proxy_typeof(proxy)) {
32+
case vctrs_type_character: out = chr_apply_transform(proxy, chr_transform); break;
33+
case vctrs_type_dataframe: out = df_apply_transform(proxy, chr_transform); break;
34+
default: out = proxy;
35+
}
36+
37+
UNPROTECT(1);
38+
return out;
39+
}
40+
41+
// -----------------------------------------------------------------------------
42+
43+
static
44+
SEXP chr_apply_transform(SEXP x, SEXP chr_transform) {
45+
// Don't use vctrs dispatch utils because we match argument positionally
46+
SEXP call = PROTECT(Rf_lang2(syms_chr_transform, syms_x));
47+
48+
SEXP mask = PROTECT(r_new_environment(R_GlobalEnv));
49+
Rf_defineVar(syms_chr_transform, chr_transform, mask);
50+
Rf_defineVar(syms_x, x, mask);
51+
52+
SEXP out = PROTECT(Rf_eval(call, mask));
53+
54+
if (vec_typeof(out) != vctrs_type_character) {
55+
Rf_errorcall(
56+
R_NilValue,
57+
"`chr_transform` must return a character vector."
58+
);
59+
}
60+
61+
R_len_t x_size = vec_size(x);
62+
R_len_t out_size = vec_size(out);
63+
64+
if (x_size != out_size) {
65+
Rf_errorcall(
66+
R_NilValue,
67+
"`chr_transform` must return a vector of the same length (%i, not %i).",
68+
x_size,
69+
out_size
70+
);
71+
}
72+
73+
UNPROTECT(3);
74+
return out;
75+
}
76+
77+
// -----------------------------------------------------------------------------
78+
79+
static
80+
SEXP df_apply_transform(SEXP x, SEXP chr_transform) {
81+
const r_ssize n_cols = r_length(x);
82+
const SEXP* v_x = VECTOR_PTR_RO(x);
83+
84+
r_ssize i = 0;
85+
86+
for (; i < n_cols; ++i) {
87+
SEXP col = v_x[i];
88+
if (vec_proxy_typeof(col) == vctrs_type_character) {
89+
break;
90+
}
91+
}
92+
93+
if (i == n_cols) {
94+
// No character columns
95+
return x;
96+
}
97+
98+
SEXP out = PROTECT(r_clone_referenced(x));
99+
100+
for (; i < n_cols; ++i) {
101+
SEXP col = v_x[i];
102+
103+
if (vec_proxy_typeof(col) != vctrs_type_character) {
104+
continue;
105+
}
106+
107+
col = chr_apply_transform(col, chr_transform);
108+
SET_VECTOR_ELT(out, i, col);
109+
}
110+
111+
UNPROTECT(1);
112+
return out;
113+
}

src/order-transform.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* The implementation of vec_order() is based on data.table’s forder() and their
3+
* earlier contribution to R’s order(). See LICENSE.note for more information.
4+
*
5+
* This Source Code Form is subject to the terms of the Mozilla Public
6+
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
7+
* You can obtain one at https://mozilla.org/MPL/2.0/.
8+
*
9+
* Copyright (c) 2020, RStudio
10+
* Copyright (c) 2020, Data table team
11+
*/
12+
13+
#ifndef VCTRS_ORDER_TRANSFORM_H
14+
#define VCTRS_ORDER_TRANSFORM_H
15+
16+
#include "vctrs.h"
17+
18+
// -----------------------------------------------------------------------------
19+
20+
/*
21+
* `proxy_chr_transform()` iterates over `proxy`, applying `chr_transform`
22+
* on any character vectors that it detects.
23+
*
24+
* It expects that:
25+
* - If `proxy` is a data frame, it has been flattened by its corresponding
26+
* `vec_proxy_*()` function.
27+
* - All character vectors in `proxy` have already been normalized to UTF-8
28+
* by `vec_normalize_encoding()`.
29+
*/
30+
SEXP proxy_chr_transform(SEXP proxy, SEXP chr_transform);
31+
32+
// -----------------------------------------------------------------------------
33+
#endif

src/utils.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1835,6 +1835,7 @@ SEXP syms_vctrs_common_class_fallback = NULL;
18351835
SEXP syms_fallback_class = NULL;
18361836
SEXP syms_abort = NULL;
18371837
SEXP syms_message = NULL;
1838+
SEXP syms_chr_transform = NULL;
18381839

18391840
SEXP fns_bracket = NULL;
18401841
SEXP fns_quote = NULL;
@@ -2110,6 +2111,7 @@ void vctrs_init_utils(SEXP ns) {
21102111
syms_fallback_class = Rf_install("fallback_class");
21112112
syms_abort = Rf_install("abort");
21122113
syms_message = Rf_install("message");
2114+
syms_chr_transform = Rf_install("chr_transform");
21132115

21142116
fns_bracket = Rf_findVar(syms_bracket, R_BaseEnv);
21152117
fns_quote = Rf_findVar(Rf_install("quote"), R_BaseEnv);

src/utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,7 @@ extern SEXP syms_vctrs_common_class_fallback;
631631
extern SEXP syms_fallback_class;
632632
extern SEXP syms_abort;
633633
extern SEXP syms_message;
634+
extern SEXP syms_chr_transform;
634635

635636
static const char * const c_strs_vctrs_common_class_fallback = "vctrs:::common_class_fallback";
636637

0 commit comments

Comments
 (0)