@@ -2481,14 +2481,13 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
2481
2481
dr -> state = (c == '\0' ? START_RECORD : EAT_CRNL );
2482
2482
}
2483
2483
else if (c == dialect -> quotechar && dialect -> quoting != QUOTE_NONE ) {
2484
- // start quoted field
2485
2484
dr -> state = IN_QUOTED_FIELD ;
2486
2485
}
2487
- else if (c == dialect -> escapechar ) { // possible escaped character
2486
+ else if (c == dialect -> escapechar ) {
2488
2487
dr -> state = ESCAPED_CHAR ;
2489
2488
}
2490
2489
else if (c == ' ' && dialect -> skipinitialspace );
2491
- else if (c == dialect -> delimiter ) { // save empty field
2490
+ else if (c == dialect -> delimiter ) { // end of a field
2492
2491
if (AK_DR_close_field (dr , cpg )) return -1 ;
2493
2492
}
2494
2493
else { // begin new unquoted field
@@ -2515,7 +2514,7 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
2515
2514
if (AK_DR_close_field (dr , cpg )) return -1 ;
2516
2515
dr -> state = (c == '\0' ? START_RECORD : EAT_CRNL );
2517
2516
}
2518
- else if (c == dialect -> escapechar ) { // possible escaped character
2517
+ else if (c == dialect -> escapechar ) {
2519
2518
dr -> state = ESCAPED_CHAR ;
2520
2519
}
2521
2520
else if (c == dialect -> delimiter ) { // save field - wait for new field
@@ -2566,8 +2565,7 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
2566
2565
}
2567
2566
else { // illegal
2568
2567
PyErr_Format (PyExc_RuntimeError , "'%c' expected after '%c'" ,
2569
- dialect -> delimiter ,
2570
- dialect -> quotechar );
2568
+ dialect -> delimiter , dialect -> quotechar );
2571
2569
return -1 ;
2572
2570
}
2573
2571
break ;
@@ -2943,61 +2941,212 @@ iterable_str_to_array_1d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwarg
2943
2941
return AK_IterableStrToArray1D (iterable , dtype_specifier , tsep , decc );
2944
2942
}
2945
2943
2944
+ static char * split_after_count_kwarg_names [] = { // keyword list handed to PyArg_ParseTupleAndKeywords by split_after_count(); order must match the format string "O|$OiOOOOO"
2945
+ "string" , // "O": the only required argument (precedes "|" in the format)
2946
+ "delimiter" , // "O": first name after "|$" in the format, so optional; this and all later names are keyword-only
2947
+ "count" , // "i": parsed directly into a C int
2948
+ "doublequote" , // "O"
2949
+ "escapechar" , // "O"
2950
+ "quotechar" , // "O"
2951
+ "quoting" , // "O"
2952
+ "strict" , // "O"
2953
+ NULL // sentinel: the keyword list must be NULL-terminated
2954
+ };
2946
2955
2947
2956
static PyObject *
2948
- split_after_count (PyObject * Py_UNUSED (m ), PyObject * args )
2957
+ split_after_count (PyObject * Py_UNUSED (m ), PyObject * args , PyObject * kwargs )
2949
2958
{
2950
2959
PyObject * string = NULL ;
2951
2960
PyObject * delimiter = NULL ;
2952
2961
int count = 0 ;
2962
+ PyObject * doublequote = NULL ;
2963
+ PyObject * escapechar = NULL ;
2964
+ PyObject * quotechar = NULL ;
2965
+ PyObject * quoting = NULL ;
2966
+ PyObject * strict = NULL ;
2953
2967
2954
- if (!PyArg_ParseTuple (args ,
2955
- "OOi:split_after_count" ,
2968
+ if (!PyArg_ParseTupleAndKeywords (args , kwargs ,
2969
+ "O|$OiOOOOO:split_after_count" ,
2970
+ split_after_count_kwarg_names ,
2956
2971
& string ,
2972
+ // kwarg-only
2957
2973
& delimiter ,
2958
- & count )) {
2974
+ & count ,
2975
+ & doublequote ,
2976
+ & escapechar ,
2977
+ & quotechar ,
2978
+ & quoting ,
2979
+ & strict
2980
+ )) {
2959
2981
return NULL ;
2960
2982
}
2961
2983
2962
2984
if (!PyUnicode_Check (string )) {
2963
- PyErr_Format (PyExc_RuntimeError ,
2985
+ PyErr_Format (PyExc_ValueError ,
2964
2986
"a string is required, not %.200s" ,
2965
2987
Py_TYPE (string )-> tp_name
2966
2988
);
2967
2989
return NULL ;
2968
2990
}
2969
-
2970
2991
if (count <= 0 ) {
2971
- PyErr_Format (PyExc_RuntimeError ,
2992
+ PyErr_Format (PyExc_ValueError ,
2972
2993
"count must be greater than zero, not %i" ,
2973
2994
count
2974
2995
);
2975
2996
return NULL ;
2976
2997
}
2977
2998
2978
- Py_UCS4 delim_char ;
2999
+ AK_Dialect dialect ;
3000
+
2979
3001
if (AK_set_char (
2980
3002
"delimiter" ,
2981
- & delim_char ,
3003
+ & dialect . delimiter ,
2982
3004
delimiter ,
2983
- '\0' )) return NULL ;
3005
+ ',' )) return NULL ;
3006
+
3007
+ if (AK_set_bool (
3008
+ "doublequote" ,
3009
+ & dialect .doublequote ,
3010
+ doublequote ,
3011
+ true)) return NULL ;
3012
+
3013
+ if (AK_set_char (
3014
+ "escapechar" ,
3015
+ & dialect .escapechar ,
3016
+ escapechar ,
3017
+ 0 )) return NULL ;
3018
+
3019
+ if (AK_set_char (
3020
+ "quotechar" ,
3021
+ & dialect .quotechar ,
3022
+ quotechar ,
3023
+ '"' )) return NULL ;
3024
+
3025
+ if (AK_set_int (
3026
+ "quoting" ,
3027
+ & dialect .quoting ,
3028
+ quoting ,
3029
+ QUOTE_MINIMAL )) return NULL ;
3030
+
3031
+ if (AK_set_bool (
3032
+ "strict" ,
3033
+ & dialect .strict ,
3034
+ strict ,
3035
+ false)) return NULL ;
2984
3036
2985
3037
unsigned int kind = PyUnicode_KIND (string );
2986
3038
const void * data = PyUnicode_DATA (string );
2987
3039
Py_ssize_t pos = 0 ;
2988
3040
Py_ssize_t delim_count = 0 ;
2989
3041
Py_ssize_t linelen = PyUnicode_GET_LENGTH (string );
2990
3042
Py_UCS4 c ;
3043
+ AK_DelimitedReaderState state = START_RECORD ;
2991
3044
2992
3045
while (pos < linelen ) {
2993
3046
c = PyUnicode_READ (kind , data , pos );
2994
- if (c == delim_char ) {
2995
- delim_count ++ ;
2996
- if (delim_count == count ) {
2997
- break ; // to not include delim at transition
2998
- // do not increment pos so as to exclude in left
3047
+
3048
+ switch (state ) {
3049
+ case START_RECORD : // start of record
3050
+ if (c == '\0' ) // empty line
3051
+ break ;
3052
+ else if (c == '\n' || c == '\r' ) {
3053
+ state = EAT_CRNL ;
3054
+ break ;
3055
+ }
3056
+ state = START_FIELD ; // normal character
3057
+ // fallthru
3058
+ case START_FIELD : // expecting field
3059
+ if (c == '\n' || c == '\r' || c == '\0' ) {
3060
+ state = (c == '\0' ? START_RECORD : EAT_CRNL );
3061
+ }
3062
+ else if (c == dialect .quotechar && dialect .quoting != QUOTE_NONE ) {
3063
+ state = IN_QUOTED_FIELD ;
3064
+ }
3065
+ else if (c == dialect .escapechar ) {
3066
+ state = ESCAPED_CHAR ;
3067
+ }
3068
+ else if (c == dialect .delimiter ) { // end of a field
3069
+ delim_count += 1 ;
3070
+ }
3071
+ else {
3072
+ state = IN_FIELD ;
3073
+ }
3074
+ break ;
3075
+ case ESCAPED_CHAR :
3076
+ if (c == '\n' || c == '\r' ) {
3077
+ state = AFTER_ESCAPED_CRNL ;
3078
+ break ;
3079
+ }
3080
+ if (c == '\0' )
3081
+ c = '\n' ;
3082
+ state = IN_FIELD ;
3083
+ break ;
3084
+ case AFTER_ESCAPED_CRNL :
3085
+ if (c == '\0' ) break ;
3086
+ // fallthru
3087
+ case IN_FIELD : // in unquoted field
3088
+ if (c == '\n' || c == '\r' || c == '\0' ) { // end of line
3089
+ state = (c == '\0' ? START_RECORD : EAT_CRNL );
3090
+ }
3091
+ else if (c == dialect .escapechar ) {
3092
+ state = ESCAPED_CHAR ;
3093
+ }
3094
+ else if (c == dialect .delimiter ) {
3095
+ delim_count += 1 ;
3096
+ state = START_FIELD ;
3097
+ }
3098
+ break ;
3099
+ case IN_QUOTED_FIELD : // in quoted field
3100
+ if (c == '\0' );
3101
+ else if (c == dialect .escapechar ) {
3102
+ state = ESCAPE_IN_QUOTED_FIELD ;
3103
+ }
3104
+ else if (c == dialect .quotechar && dialect .quoting != QUOTE_NONE ) {
3105
+ state = (dialect .doublequote ? QUOTE_IN_QUOTED_FIELD : IN_FIELD );
2999
3106
}
3107
+ break ;
3108
+ case ESCAPE_IN_QUOTED_FIELD :
3109
+ if (c == '\0' ) {
3110
+ c = '\n' ;
3111
+ }
3112
+ state = IN_QUOTED_FIELD ;
3113
+ break ;
3114
+ case QUOTE_IN_QUOTED_FIELD :
3115
+ // doublequote - seen a quote in a quoted field
3116
+ if (dialect .quoting != QUOTE_NONE && c == dialect .quotechar ) {
3117
+ state = IN_QUOTED_FIELD ;
3118
+ }
3119
+ else if (c == dialect .delimiter ) {
3120
+ delim_count += 1 ;
3121
+ state = START_FIELD ;
3122
+ }
3123
+ else if (c == '\n' || c == '\r' || c == '\0' ) {
3124
+ state = (c == '\0' ? START_RECORD : EAT_CRNL );
3125
+ }
3126
+ else if (!dialect .strict ) {
3127
+ state = IN_FIELD ;
3128
+ }
3129
+ else { // illegal
3130
+ PyErr_Format (PyExc_RuntimeError , "'%c' expected after '%c'" ,
3131
+ dialect .delimiter , dialect .quotechar );
3132
+ return NULL ;
3133
+ }
3134
+ break ;
3135
+ case EAT_CRNL :
3136
+ if (c == '\n' || c == '\r' );
3137
+ else if (c == '\0' )
3138
+ state = START_RECORD ;
3139
+ else {
3140
+ PyErr_Format (PyExc_RuntimeError ,
3141
+ "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?" );
3142
+ return NULL ;
3143
+ }
3144
+ break ;
3000
3145
}
3146
+ if (delim_count == count ) {
3147
+ break ; // to not include delim at transition
3148
+ }
3149
+ // NOTE: must break before the increment when finding match
3001
3150
pos ++ ;
3002
3151
}
3003
3152
@@ -3010,7 +3159,7 @@ split_after_count(PyObject *Py_UNUSED(m), PyObject *args)
3010
3159
}
3011
3160
3012
3161
3013
-
3162
+ // A fast counter of unsized iterators
3014
3163
static PyObject *
3015
3164
count_iteration (PyObject * Py_UNUSED (m ), PyObject * iterable )
3016
3165
{
@@ -3883,7 +4032,10 @@ static PyMethodDef arraykit_methods[] = {
3883
4032
(PyCFunction )iterable_str_to_array_1d ,
3884
4033
METH_VARARGS | METH_KEYWORDS ,
3885
4034
NULL },
3886
- {"split_after_count" , split_after_count , METH_VARARGS , NULL },
4035
+ {"split_after_count" ,
4036
+ (PyCFunction )split_after_count ,
4037
+ METH_VARARGS | METH_KEYWORDS ,
4038
+ NULL },
3887
4039
{"count_iteration" , count_iteration , METH_O , NULL },
3888
4040
{"isna_element" , isna_element , METH_O , NULL },
3889
4041
{"dtype_from_element" , dtype_from_element , METH_O , NULL },
0 commit comments