Skip to content

Commit 6b04f16

Browse files
authored
Delegate encoding normalization to vec_normalize_encoding() (#1328)
1 parent 3cbcabb commit 6b04f16

File tree

8 files changed: 8 additions and 176 deletions.

src/order-radix.c

Lines changed: 8 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
#include "utils.h"
1515
#include "lazy.h"
1616
#include "type-data-frame.h"
17+
#include "translate.h"
1718
#include "order-radix.h"
1819
#include "order-groups.h"
1920
#include "order-truelength.h"
@@ -118,8 +119,9 @@
118119
* - `chr_order_insertion()` - Used when `x` is small.
119120
*
120121
* - `chr_order_radix()` - Same principle as integer/double ordering, but
121-
* we iterate 1 character at a time. We assume a C locale here, and any
122-
* non-ASCII and non-UTF8 strings are translated to UTF8.
122+
* we iterate 1 character at a time. We assume a C locale here. Any non-ASCII
123+
* and non-UTF-8 strings are translated up front by
124+
* `vec_normalize_encoding()`.
123125
*
124126
* -----------------------------------------------------------------------------
125127
* Logicals
@@ -246,7 +248,6 @@ static void vec_order_switch(SEXP x,
246248
struct lazy_raw* p_lazy_bytes,
247249
struct lazy_raw* p_lazy_counts,
248250
struct group_infos* p_group_infos,
249-
struct lazy_chr* p_lazy_x_reencoded,
250251
struct truelength_info* p_truelength_info);
251252

252253
/*
@@ -267,6 +268,7 @@ SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations) {
267268
na_last = VECTOR_ELT(args, 1);
268269

269270
SEXP proxy = PROTECT_N(vec_proxy_order(x), p_n_prot);
271+
proxy = PROTECT_N(vec_normalize_encoding(proxy), p_n_prot);
270272

271273
r_ssize size = vec_size(proxy);
272274
const enum vctrs_type type = vec_proxy_typeof(proxy);
@@ -325,9 +327,6 @@ SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations) {
325327
struct truelength_info* p_truelength_info = new_truelength_info(size);
326328
PROTECT_TRUELENGTH_INFO(p_truelength_info, p_n_prot);
327329

328-
struct lazy_chr* p_lazy_x_reencoded = new_lazy_chr(size);
329-
PROTECT_LAZY_VEC(p_lazy_x_reencoded, p_n_prot);
330-
331330
struct order* p_order = new_order(size);
332331
PROTECT_ORDER(p_order, p_n_prot);
333332

@@ -344,7 +343,6 @@ SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations) {
344343
p_lazy_bytes,
345344
p_lazy_counts,
346345
p_group_infos,
347-
p_lazy_x_reencoded,
348346
p_truelength_info
349347
);
350348

@@ -430,7 +428,6 @@ static void df_order(SEXP x,
430428
struct lazy_raw* p_lazy_bytes,
431429
struct lazy_raw* p_lazy_counts,
432430
struct group_infos* p_group_infos,
433-
struct lazy_chr* p_lazy_x_reencoded,
434431
struct truelength_info* p_truelength_info);
435432

436433
static void vec_order_base_switch(SEXP x,
@@ -445,7 +442,6 @@ static void vec_order_base_switch(SEXP x,
445442
struct lazy_raw* p_lazy_bytes,
446443
struct lazy_raw* p_lazy_counts,
447444
struct group_infos* p_group_infos,
448-
struct lazy_chr* p_lazy_x_reencoded,
449445
struct truelength_info* p_truelength_info);
450446

451447
static
@@ -461,7 +457,6 @@ void vec_order_switch(SEXP x,
461457
struct lazy_raw* p_lazy_bytes,
462458
struct lazy_raw* p_lazy_counts,
463459
struct group_infos* p_group_infos,
464-
struct lazy_chr* p_lazy_x_reencoded,
465460
struct truelength_info* p_truelength_info) {
466461
if (type == vctrs_type_dataframe) {
467462
df_order(
@@ -476,7 +471,6 @@ void vec_order_switch(SEXP x,
476471
p_lazy_bytes,
477472
p_lazy_counts,
478473
p_group_infos,
479-
p_lazy_x_reencoded,
480474
p_truelength_info
481475
);
482476

@@ -515,7 +509,6 @@ void vec_order_switch(SEXP x,
515509
p_lazy_bytes,
516510
p_lazy_counts,
517511
p_group_infos,
518-
p_lazy_x_reencoded,
519512
p_truelength_info
520513
);
521514
}
@@ -581,7 +574,6 @@ static void chr_order(SEXP x,
581574
struct lazy_raw* p_lazy_bytes,
582575
struct lazy_raw* p_lazy_counts,
583576
struct group_infos* p_group_infos,
584-
struct lazy_chr* p_lazy_x_reencoded,
585577
struct truelength_info* p_truelength_info);
586578

587579
// Used on bare vectors and the first column of data frame `x`s
@@ -598,7 +590,6 @@ void vec_order_base_switch(SEXP x,
598590
struct lazy_raw* p_lazy_bytes,
599591
struct lazy_raw* p_lazy_counts,
600592
struct group_infos* p_group_infos,
601-
struct lazy_chr* p_lazy_x_reencoded,
602593
struct truelength_info* p_truelength_info) {
603594
switch (type) {
604595
case vctrs_type_integer: {
@@ -682,7 +673,6 @@ void vec_order_base_switch(SEXP x,
682673
p_lazy_bytes,
683674
p_lazy_counts,
684675
p_group_infos,
685-
p_lazy_x_reencoded,
686676
p_truelength_info
687677
);
688678

@@ -2590,7 +2580,6 @@ static void chr_mark_sorted_uniques(const SEXP* p_x,
25902580
r_ssize size,
25912581
struct lazy_raw* p_lazy_x_aux,
25922582
struct lazy_raw* p_lazy_bytes,
2593-
struct lazy_chr* p_lazy_x_reencoded,
25942583
struct truelength_info* p_truelength_info);
25952584

25962585
static inline void chr_extract_ordering(const SEXP* p_x, r_ssize size, int* p_x_aux);
@@ -2609,12 +2598,10 @@ static void chr_order_radix(const r_ssize size,
26092598
* `chr_order_chunk()` assumes `p_x` is modifiable by reference. It also
26102599
* assumes that `chr_mark_sorted_uniques()` has already been called. For data
26112600
* frame columns where `chr_order_chunk()` is called on each group chunk,
2612-
* `chr_mark_sorted_uniques()` is only called once on the entire column. It
2613-
* also assumes that `p_x` has already been re-encoded as UTF-8 if required.
2601+
* `chr_mark_sorted_uniques()` is only called once on the entire column.
26142602
*
26152603
* `chr_order()` assumes `x` is user input which cannot be modified.
2616-
* It copies `x` into another SEXP that can be modified directly and re-encodes
2617-
* as UTF-8 if required.
2604+
* It copies `x` into another SEXP that can be modified directly.
26182605
*
26192606
* `chr_order_chunk()` essentially calls `int_order_chunk()`, however we can't
26202607
* call it directly because we don't have access to all the required arguments.
@@ -2641,17 +2628,11 @@ void chr_order_chunk(bool decreasing,
26412628
struct group_infos* p_group_infos) {
26422629
void* p_x_chunk = p_lazy_x_chunk->p_data;
26432630

2644-
// Don't check encoding on `p_x_chunk` data. In `df_order()`, we already
2645-
// ran `chr_mark_sorted_uniques()` which told `df_order()` whether or not
2646-
// to re-encode as it created `p_x_chunk`
2647-
bool check_encoding = false;
2648-
26492631
const enum vctrs_sortedness sortedness = chr_sortedness(
26502632
p_x_chunk,
26512633
size,
26522634
decreasing,
26532635
na_last,
2654-
check_encoding,
26552636
p_group_infos
26562637
);
26572638

@@ -2699,7 +2680,6 @@ struct chr_order_info {
26992680
struct lazy_raw* p_lazy_bytes;
27002681
struct lazy_raw* p_lazy_counts;
27012682
struct group_infos* p_group_infos;
2702-
struct lazy_chr* p_lazy_x_reencoded;
27032683
struct truelength_info* p_truelength_info;
27042684
};
27052685

@@ -2728,7 +2708,6 @@ void chr_order(SEXP x,
27282708
struct lazy_raw* p_lazy_bytes,
27292709
struct lazy_raw* p_lazy_counts,
27302710
struct group_infos* p_group_infos,
2731-
struct lazy_chr* p_lazy_x_reencoded,
27322711
struct truelength_info* p_truelength_info) {
27332712
struct chr_order_info info = {
27342713
.x = x,
@@ -2742,7 +2721,6 @@ void chr_order(SEXP x,
27422721
.p_lazy_bytes = p_lazy_bytes,
27432722
.p_lazy_counts = p_lazy_counts,
27442723
.p_group_infos = p_group_infos,
2745-
.p_lazy_x_reencoded = p_lazy_x_reencoded,
27462724
.p_truelength_info = p_truelength_info
27472725
};
27482726

@@ -2769,7 +2747,6 @@ static void chr_order_internal(SEXP x,
27692747
struct lazy_raw* p_lazy_bytes,
27702748
struct lazy_raw* p_lazy_counts,
27712749
struct group_infos* p_group_infos,
2772-
struct lazy_chr* p_lazy_x_reencoded,
27732750
struct truelength_info* p_truelength_info);
27742751

27752752
static
@@ -2788,7 +2765,6 @@ SEXP chr_order_exec(void* p_data) {
27882765
p_info->p_lazy_bytes,
27892766
p_info->p_lazy_counts,
27902767
p_info->p_group_infos,
2791-
p_info->p_lazy_x_reencoded,
27922768
p_info->p_truelength_info
27932769
);
27942770

@@ -2813,19 +2789,14 @@ void chr_order_internal(SEXP x,
28132789
struct lazy_raw* p_lazy_bytes,
28142790
struct lazy_raw* p_lazy_counts,
28152791
struct group_infos* p_group_infos,
2816-
struct lazy_chr* p_lazy_x_reencoded,
28172792
struct truelength_info* p_truelength_info) {
28182793
const SEXP* p_x = STRING_PTR_RO(x);
28192794

2820-
// Check encodings when determining sortedness of user input
2821-
bool check_encoding = true;
2822-
28232795
const enum vctrs_sortedness sortedness = chr_sortedness(
28242796
p_x,
28252797
size,
28262798
decreasing,
28272799
na_last,
2828-
check_encoding,
28292800
p_group_infos
28302801
);
28312802

@@ -2839,23 +2810,15 @@ void chr_order_internal(SEXP x,
28392810

28402811
// Sort unique strings and mark their truelengths with ordering.
28412812
// Use `p_lazy_x_chunk` as auxiliary memory for `chr_order_radix()` so we
2842-
// hopefully don't have to also allocate `p_lazy_x_aux`. If re-encoding
2843-
// is required, it stores the results in `p_lazy_x_reencoded`.
2813+
// hopefully don't have to also allocate `p_lazy_x_aux`.
28442814
chr_mark_sorted_uniques(
28452815
p_x,
28462816
size,
28472817
p_lazy_x_chunk,
28482818
p_lazy_bytes,
2849-
p_lazy_x_reencoded,
28502819
p_truelength_info
28512820
);
28522821

2853-
// If we re-encoded, then the vector to extract the ordering from is in
2854-
// `p_lazy_x_reencoded`.
2855-
if (p_truelength_info->reencode) {
2856-
p_x = p_lazy_x_reencoded->p_data;
2857-
}
2858-
28592822
void* p_x_chunk = init_lazy_raw(p_lazy_x_chunk);
28602823

28612824
// Move integer ordering into `p_x_chunk`.
@@ -2929,12 +2892,6 @@ static void chr_mark_uniques(const SEXP* p_x,
29292892
* through `p_x` and just pluck off the TRUELENGTH value, which will be an
29302893
* integer proxy for the value's ordering.
29312894
*
2932-
* We optimize heavily for the ASCII / UTF-8 case by checking the encodings of
2933-
* only the uniques. If any uniques need re-encoding, we recompute the unique
2934-
* strings again on the entire vector, this time with reencoding. This gives
2935-
* a nice speed boost for the most common case of all ASCII/UTF-8 because
2936-
* checking encodings is expensive.
2937-
*
29382895
* `truelength_save()` also saves the unique strings and their original
29392896
* TRUELENGTH values so they can be reset after each column with
29402897
* `truelength_reset()`.
@@ -2944,34 +2901,9 @@ void chr_mark_sorted_uniques(const SEXP* p_x,
29442901
r_ssize size,
29452902
struct lazy_raw* p_lazy_x_aux,
29462903
struct lazy_raw* p_lazy_bytes,
2947-
struct lazy_chr* p_lazy_x_reencoded,
29482904
struct truelength_info* p_truelength_info) {
29492905
chr_mark_uniques(p_x, size, p_truelength_info);
29502906

2951-
// Check if any uniques need reencoding
2952-
bool reencode = p_chr_any_reencode(
2953-
p_truelength_info->p_uniques,
2954-
p_truelength_info->size_used
2955-
);
2956-
2957-
// Rerun marking of unique values if any needed reencoding
2958-
// (some characters might translate to the same UTF-8 character)
2959-
if (reencode) {
2960-
// Reset existing uniques before rerun
2961-
truelength_reset(p_truelength_info);
2962-
2963-
// Initialize container for re-encoded result
2964-
init_lazy_chr(p_lazy_x_reencoded);
2965-
2966-
p_chr_copy_with_reencode(p_x, p_lazy_x_reencoded->data, size);
2967-
2968-
// Tell `df_order()` and `chr_order()` we re-encoded
2969-
p_truelength_info->reencode = true;
2970-
2971-
// Re-mark uniques on re-encoded vector
2972-
chr_mark_uniques(p_lazy_x_reencoded->p_data, size, p_truelength_info);
2973-
}
2974-
29752907
r_ssize n_uniques = p_truelength_info->size_used;
29762908

29772909
SEXP* p_x_aux = (SEXP*) init_lazy_raw(p_lazy_x_aux);
@@ -3352,7 +3284,6 @@ struct df_order_info {
33523284
struct lazy_raw* p_lazy_bytes;
33533285
struct lazy_raw* p_lazy_counts;
33543286
struct group_infos* p_group_infos;
3355-
struct lazy_chr* p_lazy_x_reencoded;
33563287
struct truelength_info* p_truelength_info;
33573288
};
33583289

@@ -3393,7 +3324,6 @@ void df_order(SEXP x,
33933324
struct lazy_raw* p_lazy_bytes,
33943325
struct lazy_raw* p_lazy_counts,
33953326
struct group_infos* p_group_infos,
3396-
struct lazy_chr* p_lazy_x_reencoded,
33973327
struct truelength_info* p_truelength_info) {
33983328
struct df_order_info info = {
33993329
.x = x,
@@ -3407,7 +3337,6 @@ void df_order(SEXP x,
34073337
.p_lazy_bytes = p_lazy_bytes,
34083338
.p_lazy_counts = p_lazy_counts,
34093339
.p_group_infos = p_group_infos,
3410-
.p_lazy_x_reencoded = p_lazy_x_reencoded,
34113340
.p_truelength_info = p_truelength_info
34123341
};
34133342

@@ -3434,7 +3363,6 @@ static void df_order_internal(SEXP x,
34343363
struct lazy_raw* p_lazy_bytes,
34353364
struct lazy_raw* p_lazy_counts,
34363365
struct group_infos* p_group_infos,
3437-
struct lazy_chr* p_lazy_x_reencoded,
34383366
struct truelength_info* p_truelength_info);
34393367

34403368
static
@@ -3453,7 +3381,6 @@ SEXP df_order_exec(void* p_data) {
34533381
p_info->p_lazy_bytes,
34543382
p_info->p_lazy_counts,
34553383
p_info->p_group_infos,
3456-
p_info->p_lazy_x_reencoded,
34573384
p_info->p_truelength_info
34583385
);
34593386

@@ -3526,7 +3453,6 @@ void df_order_internal(SEXP x,
35263453
struct lazy_raw* p_lazy_bytes,
35273454
struct lazy_raw* p_lazy_counts,
35283455
struct group_infos* p_group_infos,
3529-
struct lazy_chr* p_lazy_x_reencoded,
35303456
struct truelength_info* p_truelength_info) {
35313457
r_ssize n_cols = r_length(x);
35323458

@@ -3590,7 +3516,6 @@ void df_order_internal(SEXP x,
35903516
p_lazy_bytes,
35913517
p_lazy_counts,
35923518
p_group_infos,
3593-
p_lazy_x_reencoded,
35943519
p_truelength_info
35953520
);
35963521

@@ -3642,15 +3567,8 @@ void df_order_internal(SEXP x,
36423567
size,
36433568
p_lazy_x_aux,
36443569
p_lazy_bytes,
3645-
p_lazy_x_reencoded,
36463570
p_truelength_info
36473571
);
3648-
3649-
// If re-encoding was required, the re-encoded column is stored
3650-
// in `p_lazy_x_reencoded`.
3651-
if (p_truelength_info->reencode) {
3652-
col = p_lazy_x_reencoded->data;
3653-
}
36543572
}
36553573

36563574
// Turn off group tracking if:

0 commit comments

Comments
 (0)