1414#include "utils.h"
1515#include "lazy.h"
1616#include "type-data-frame.h"
17+ #include "translate.h"
1718#include "order-radix.h"
1819#include "order-groups.h"
1920#include "order-truelength.h"
118119 * - `chr_order_insertion()` - Used when `x` is small.
119120 *
120121 * - `chr_order_radix()` - Same principle as integer/double ordering, but
121- * we iterate 1 character at a time. We assume a C locale here, and any
122- * non-ASCII and non-UTF8 strings are translated to UTF8.
122+ * we iterate 1 character at a time. We assume a C locale here. Any strings
123+ * that are neither ASCII nor UTF-8 are translated to UTF-8 up front by
124+ * `vec_normalize_encoding()`.
123125 *
124126 * -----------------------------------------------------------------------------
125127 * Logicals
@@ -246,7 +248,6 @@ static void vec_order_switch(SEXP x,
246248 struct lazy_raw * p_lazy_bytes ,
247249 struct lazy_raw * p_lazy_counts ,
248250 struct group_infos * p_group_infos ,
249- struct lazy_chr * p_lazy_x_reencoded ,
250251 struct truelength_info * p_truelength_info );
251252
252253/*
@@ -267,6 +268,7 @@ SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations) {
267268 na_last = VECTOR_ELT (args , 1 );
268269
269270 SEXP proxy = PROTECT_N (vec_proxy_order (x ), p_n_prot );
271+ proxy = PROTECT_N (vec_normalize_encoding (proxy ), p_n_prot );
270272
271273 r_ssize size = vec_size (proxy );
272274 const enum vctrs_type type = vec_proxy_typeof (proxy );
@@ -325,9 +327,6 @@ SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations) {
325327 struct truelength_info * p_truelength_info = new_truelength_info (size );
326328 PROTECT_TRUELENGTH_INFO (p_truelength_info , p_n_prot );
327329
328- struct lazy_chr * p_lazy_x_reencoded = new_lazy_chr (size );
329- PROTECT_LAZY_VEC (p_lazy_x_reencoded , p_n_prot );
330-
331330 struct order * p_order = new_order (size );
332331 PROTECT_ORDER (p_order , p_n_prot );
333332
@@ -344,7 +343,6 @@ SEXP vec_order_impl(SEXP x, SEXP decreasing, SEXP na_last, bool locations) {
344343 p_lazy_bytes ,
345344 p_lazy_counts ,
346345 p_group_infos ,
347- p_lazy_x_reencoded ,
348346 p_truelength_info
349347 );
350348
@@ -430,7 +428,6 @@ static void df_order(SEXP x,
430428 struct lazy_raw * p_lazy_bytes ,
431429 struct lazy_raw * p_lazy_counts ,
432430 struct group_infos * p_group_infos ,
433- struct lazy_chr * p_lazy_x_reencoded ,
434431 struct truelength_info * p_truelength_info );
435432
436433static void vec_order_base_switch (SEXP x ,
@@ -445,7 +442,6 @@ static void vec_order_base_switch(SEXP x,
445442 struct lazy_raw * p_lazy_bytes ,
446443 struct lazy_raw * p_lazy_counts ,
447444 struct group_infos * p_group_infos ,
448- struct lazy_chr * p_lazy_x_reencoded ,
449445 struct truelength_info * p_truelength_info );
450446
451447static
@@ -461,7 +457,6 @@ void vec_order_switch(SEXP x,
461457 struct lazy_raw * p_lazy_bytes ,
462458 struct lazy_raw * p_lazy_counts ,
463459 struct group_infos * p_group_infos ,
464- struct lazy_chr * p_lazy_x_reencoded ,
465460 struct truelength_info * p_truelength_info ) {
466461 if (type == vctrs_type_dataframe ) {
467462 df_order (
@@ -476,7 +471,6 @@ void vec_order_switch(SEXP x,
476471 p_lazy_bytes ,
477472 p_lazy_counts ,
478473 p_group_infos ,
479- p_lazy_x_reencoded ,
480474 p_truelength_info
481475 );
482476
@@ -515,7 +509,6 @@ void vec_order_switch(SEXP x,
515509 p_lazy_bytes ,
516510 p_lazy_counts ,
517511 p_group_infos ,
518- p_lazy_x_reencoded ,
519512 p_truelength_info
520513 );
521514}
@@ -581,7 +574,6 @@ static void chr_order(SEXP x,
581574 struct lazy_raw * p_lazy_bytes ,
582575 struct lazy_raw * p_lazy_counts ,
583576 struct group_infos * p_group_infos ,
584- struct lazy_chr * p_lazy_x_reencoded ,
585577 struct truelength_info * p_truelength_info );
586578
587579// Used on bare vectors and the first column of data frame `x`s
@@ -598,7 +590,6 @@ void vec_order_base_switch(SEXP x,
598590 struct lazy_raw * p_lazy_bytes ,
599591 struct lazy_raw * p_lazy_counts ,
600592 struct group_infos * p_group_infos ,
601- struct lazy_chr * p_lazy_x_reencoded ,
602593 struct truelength_info * p_truelength_info ) {
603594 switch (type ) {
604595 case vctrs_type_integer : {
@@ -682,7 +673,6 @@ void vec_order_base_switch(SEXP x,
682673 p_lazy_bytes ,
683674 p_lazy_counts ,
684675 p_group_infos ,
685- p_lazy_x_reencoded ,
686676 p_truelength_info
687677 );
688678
@@ -2590,7 +2580,6 @@ static void chr_mark_sorted_uniques(const SEXP* p_x,
25902580 r_ssize size ,
25912581 struct lazy_raw * p_lazy_x_aux ,
25922582 struct lazy_raw * p_lazy_bytes ,
2593- struct lazy_chr * p_lazy_x_reencoded ,
25942583 struct truelength_info * p_truelength_info );
25952584
25962585static inline void chr_extract_ordering (const SEXP * p_x , r_ssize size , int * p_x_aux );
@@ -2609,12 +2598,10 @@ static void chr_order_radix(const r_ssize size,
26092598 * `chr_order_chunk()` assumes `p_x` is modifiable by reference. It also
26102599 * assumes that `chr_mark_sorted_uniques()` has already been called. For data
26112600 * frame columns where `chr_order_chunk()` is called on each group chunk,
2612- * `chr_mark_sorted_uniques()` is only called once on the entire column. It
2613- * also assumes that `p_x` has already been re-encoded as UTF-8 if required.
2601+ * `chr_mark_sorted_uniques()` is only called once on the entire column.
26142602 *
26152603 * `chr_order()` assumes `x` is user input which cannot be modified.
2616- * It copies `x` into another SEXP that can be modified directly and re-encodes
2617- * as UTF-8 if required.
2604+ * It copies `x` into another SEXP that can be modified directly.
26182605 *
26192606 * `chr_order_chunk()` essentially calls `int_order_chunk()`, however we can't
26202607 * call it directly because we don't have access to all the required arguments.
@@ -2641,17 +2628,11 @@ void chr_order_chunk(bool decreasing,
26412628 struct group_infos * p_group_infos ) {
26422629 void * p_x_chunk = p_lazy_x_chunk -> p_data ;
26432630
2644- // Don't check encoding on `p_x_chunk` data. In `df_order()`, we already
2645- // ran `chr_mark_sorted_uniques()` which told `df_order()` whether or not
2646- // to re-encode as it created `p_x_chunk`
2647- bool check_encoding = false;
2648-
26492631 const enum vctrs_sortedness sortedness = chr_sortedness (
26502632 p_x_chunk ,
26512633 size ,
26522634 decreasing ,
26532635 na_last ,
2654- check_encoding ,
26552636 p_group_infos
26562637 );
26572638
@@ -2699,7 +2680,6 @@ struct chr_order_info {
26992680 struct lazy_raw * p_lazy_bytes ;
27002681 struct lazy_raw * p_lazy_counts ;
27012682 struct group_infos * p_group_infos ;
2702- struct lazy_chr * p_lazy_x_reencoded ;
27032683 struct truelength_info * p_truelength_info ;
27042684};
27052685
@@ -2728,7 +2708,6 @@ void chr_order(SEXP x,
27282708 struct lazy_raw * p_lazy_bytes ,
27292709 struct lazy_raw * p_lazy_counts ,
27302710 struct group_infos * p_group_infos ,
2731- struct lazy_chr * p_lazy_x_reencoded ,
27322711 struct truelength_info * p_truelength_info ) {
27332712 struct chr_order_info info = {
27342713 .x = x ,
@@ -2742,7 +2721,6 @@ void chr_order(SEXP x,
27422721 .p_lazy_bytes = p_lazy_bytes ,
27432722 .p_lazy_counts = p_lazy_counts ,
27442723 .p_group_infos = p_group_infos ,
2745- .p_lazy_x_reencoded = p_lazy_x_reencoded ,
27462724 .p_truelength_info = p_truelength_info
27472725 };
27482726
@@ -2769,7 +2747,6 @@ static void chr_order_internal(SEXP x,
27692747 struct lazy_raw * p_lazy_bytes ,
27702748 struct lazy_raw * p_lazy_counts ,
27712749 struct group_infos * p_group_infos ,
2772- struct lazy_chr * p_lazy_x_reencoded ,
27732750 struct truelength_info * p_truelength_info );
27742751
27752752static
@@ -2788,7 +2765,6 @@ SEXP chr_order_exec(void* p_data) {
27882765 p_info -> p_lazy_bytes ,
27892766 p_info -> p_lazy_counts ,
27902767 p_info -> p_group_infos ,
2791- p_info -> p_lazy_x_reencoded ,
27922768 p_info -> p_truelength_info
27932769 );
27942770
@@ -2813,19 +2789,14 @@ void chr_order_internal(SEXP x,
28132789 struct lazy_raw * p_lazy_bytes ,
28142790 struct lazy_raw * p_lazy_counts ,
28152791 struct group_infos * p_group_infos ,
2816- struct lazy_chr * p_lazy_x_reencoded ,
28172792 struct truelength_info * p_truelength_info ) {
28182793 const SEXP * p_x = STRING_PTR_RO (x );
28192794
2820- // Check encodings when determining sortedness of user input
2821- bool check_encoding = true;
2822-
28232795 const enum vctrs_sortedness sortedness = chr_sortedness (
28242796 p_x ,
28252797 size ,
28262798 decreasing ,
28272799 na_last ,
2828- check_encoding ,
28292800 p_group_infos
28302801 );
28312802
@@ -2839,23 +2810,15 @@ void chr_order_internal(SEXP x,
28392810
28402811 // Sort unique strings and mark their truelengths with ordering.
28412812 // Use `p_lazy_x_chunk` as auxiliary memory for `chr_order_radix()` so we
2842- // hopefully don't have to also allocate `p_lazy_x_aux`. If re-encoding
2843- // is required, it stores the results in `p_lazy_x_reencoded`.
2813+ // hopefully don't have to also allocate `p_lazy_x_aux`.
28442814 chr_mark_sorted_uniques (
28452815 p_x ,
28462816 size ,
28472817 p_lazy_x_chunk ,
28482818 p_lazy_bytes ,
2849- p_lazy_x_reencoded ,
28502819 p_truelength_info
28512820 );
28522821
2853- // If we re-encoded, then the vector to extract the ordering from is in
2854- // `p_lazy_x_reencoded`.
2855- if (p_truelength_info -> reencode ) {
2856- p_x = p_lazy_x_reencoded -> p_data ;
2857- }
2858-
28592822 void * p_x_chunk = init_lazy_raw (p_lazy_x_chunk );
28602823
28612824 // Move integer ordering into `p_x_chunk`.
@@ -2929,12 +2892,6 @@ static void chr_mark_uniques(const SEXP* p_x,
29292892 * through `p_x` and just pluck off the TRUELENGTH value, which will be an
29302893 * integer proxy for the value's ordering.
29312894 *
2932- * We optimize heavily for the ASCII / UTF-8 case by checking the encodings of
2933- * only the uniques. If any uniques need re-encoding, we recompute the unique
2934- * strings again on the entire vector, this time with reencoding. This gives
2935- * a nice speed boost for the most common case of all ASCII/UTF-8 because
2936- * checking encodings is expensive.
2937- *
29382895 * `truelength_save()` also saves the unique strings and their original
29392896 * TRUELENGTH values so they can be reset after each column with
29402897 * `truelength_reset()`.
@@ -2944,34 +2901,9 @@ void chr_mark_sorted_uniques(const SEXP* p_x,
29442901 r_ssize size ,
29452902 struct lazy_raw * p_lazy_x_aux ,
29462903 struct lazy_raw * p_lazy_bytes ,
2947- struct lazy_chr * p_lazy_x_reencoded ,
29482904 struct truelength_info * p_truelength_info ) {
29492905 chr_mark_uniques (p_x , size , p_truelength_info );
29502906
2951- // Check if any uniques need reencoding
2952- bool reencode = p_chr_any_reencode (
2953- p_truelength_info -> p_uniques ,
2954- p_truelength_info -> size_used
2955- );
2956-
2957- // Rerun marking of unique values if any needed reencoding
2958- // (some characters might translate to the same UTF-8 character)
2959- if (reencode ) {
2960- // Reset existing uniques before rerun
2961- truelength_reset (p_truelength_info );
2962-
2963- // Initialize container for re-encoded result
2964- init_lazy_chr (p_lazy_x_reencoded );
2965-
2966- p_chr_copy_with_reencode (p_x , p_lazy_x_reencoded -> data , size );
2967-
2968- // Tell `df_order()` and `chr_order()` we re-encoded
2969- p_truelength_info -> reencode = true;
2970-
2971- // Re-mark uniques on re-encoded vector
2972- chr_mark_uniques (p_lazy_x_reencoded -> p_data , size , p_truelength_info );
2973- }
2974-
29752907 r_ssize n_uniques = p_truelength_info -> size_used ;
29762908
29772909 SEXP * p_x_aux = (SEXP * ) init_lazy_raw (p_lazy_x_aux );
@@ -3352,7 +3284,6 @@ struct df_order_info {
33523284 struct lazy_raw * p_lazy_bytes ;
33533285 struct lazy_raw * p_lazy_counts ;
33543286 struct group_infos * p_group_infos ;
3355- struct lazy_chr * p_lazy_x_reencoded ;
33563287 struct truelength_info * p_truelength_info ;
33573288};
33583289
@@ -3393,7 +3324,6 @@ void df_order(SEXP x,
33933324 struct lazy_raw * p_lazy_bytes ,
33943325 struct lazy_raw * p_lazy_counts ,
33953326 struct group_infos * p_group_infos ,
3396- struct lazy_chr * p_lazy_x_reencoded ,
33973327 struct truelength_info * p_truelength_info ) {
33983328 struct df_order_info info = {
33993329 .x = x ,
@@ -3407,7 +3337,6 @@ void df_order(SEXP x,
34073337 .p_lazy_bytes = p_lazy_bytes ,
34083338 .p_lazy_counts = p_lazy_counts ,
34093339 .p_group_infos = p_group_infos ,
3410- .p_lazy_x_reencoded = p_lazy_x_reencoded ,
34113340 .p_truelength_info = p_truelength_info
34123341 };
34133342
@@ -3434,7 +3363,6 @@ static void df_order_internal(SEXP x,
34343363 struct lazy_raw * p_lazy_bytes ,
34353364 struct lazy_raw * p_lazy_counts ,
34363365 struct group_infos * p_group_infos ,
3437- struct lazy_chr * p_lazy_x_reencoded ,
34383366 struct truelength_info * p_truelength_info );
34393367
34403368static
@@ -3453,7 +3381,6 @@ SEXP df_order_exec(void* p_data) {
34533381 p_info -> p_lazy_bytes ,
34543382 p_info -> p_lazy_counts ,
34553383 p_info -> p_group_infos ,
3456- p_info -> p_lazy_x_reencoded ,
34573384 p_info -> p_truelength_info
34583385 );
34593386
@@ -3526,7 +3453,6 @@ void df_order_internal(SEXP x,
35263453 struct lazy_raw * p_lazy_bytes ,
35273454 struct lazy_raw * p_lazy_counts ,
35283455 struct group_infos * p_group_infos ,
3529- struct lazy_chr * p_lazy_x_reencoded ,
35303456 struct truelength_info * p_truelength_info ) {
35313457 r_ssize n_cols = r_length (x );
35323458
@@ -3590,7 +3516,6 @@ void df_order_internal(SEXP x,
35903516 p_lazy_bytes ,
35913517 p_lazy_counts ,
35923518 p_group_infos ,
3593- p_lazy_x_reencoded ,
35943519 p_truelength_info
35953520 );
35963521
@@ -3642,15 +3567,8 @@ void df_order_internal(SEXP x,
36423567 size ,
36433568 p_lazy_x_aux ,
36443569 p_lazy_bytes ,
3645- p_lazy_x_reencoded ,
36463570 p_truelength_info
36473571 );
3648-
3649- // If re-encoding was required, the re-encoded column is stored
3650- // in `p_lazy_x_reencoded`.
3651- if (p_truelength_info -> reencode ) {
3652- col = p_lazy_x_reencoded -> data ;
3653- }
36543572 }
36553573
36563574 // Turn off group tracking if:
0 commit comments