Skip to content

Commit ec59575

Browse files
authored
Merge pull request #79 from static-frame/78/split-after-count
`split_after_count` enhancement
2 parents f511362 + 5c2f1ea commit ec59575

File tree

4 files changed

+309
-68
lines changed

4 files changed

+309
-68
lines changed

src/__init__.pyi

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ def iterable_str_to_array_1d(
2929
) -> np.ndarray: ...
3030

3131
def delimited_to_arrays(
32-
__file_like: tp.Iterable[str],
32+
file_like: tp.Iterable[str],
3333
*,
3434
axis: int = 0,
3535
dtypes: tp.Optional[tp.Callable[[int], tp.Any]] = None,
@@ -47,8 +47,14 @@ def delimited_to_arrays(
4747

4848
def split_after_count(
4949
string: str,
50-
delimiter: str,
51-
count: int,
50+
*,
51+
delimiter: str = ',',
52+
count: int = 0,
53+
doublequote: bool = True,
54+
escapechar: str = '',
55+
quotechar: str = '"',
56+
quoting: int = 0,
57+
strict: bool = False,
5258
) -> tp.Tuple[str, str]: ...
5359

5460
def count_iteration(__iterable: tp.Iterable) -> int: ...

src/_arraykit.c

Lines changed: 175 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -2481,14 +2481,13 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
24812481
dr->state = (c == '\0' ? START_RECORD : EAT_CRNL);
24822482
}
24832483
else if (c == dialect->quotechar && dialect->quoting != QUOTE_NONE) {
2484-
// start quoted field
24852484
dr->state = IN_QUOTED_FIELD;
24862485
}
2487-
else if (c == dialect->escapechar) { // possible escaped character
2486+
else if (c == dialect->escapechar) {
24882487
dr->state = ESCAPED_CHAR;
24892488
}
24902489
else if (c == ' ' && dialect->skipinitialspace);
2491-
else if (c == dialect->delimiter) { // save empty field
2490+
else if (c == dialect->delimiter) { // end of a field
24922491
if (AK_DR_close_field(dr, cpg)) return -1;
24932492
}
24942493
else { // begin new unquoted field
@@ -2515,7 +2514,7 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
25152514
if (AK_DR_close_field(dr, cpg)) return -1;
25162515
dr->state = (c == '\0' ? START_RECORD : EAT_CRNL);
25172516
}
2518-
else if (c == dialect->escapechar) { // possible escaped character
2517+
else if (c == dialect->escapechar) {
25192518
dr->state = ESCAPED_CHAR;
25202519
}
25212520
else if (c == dialect->delimiter) { // save field - wait for new field
@@ -2566,8 +2565,7 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
25662565
}
25672566
else { // illegal
25682567
PyErr_Format(PyExc_RuntimeError, "'%c' expected after '%c'",
2569-
dialect->delimiter,
2570-
dialect->quotechar);
2568+
dialect->delimiter, dialect->quotechar);
25712569
return -1;
25722570
}
25732571
break;
@@ -2943,61 +2941,212 @@ iterable_str_to_array_1d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwarg
29432941
return AK_IterableStrToArray1D(iterable, dtype_specifier, tsep, decc);
29442942
}
29452943

2944+
static char *split_after_count_kwarg_names[] = {
2945+
"string",
2946+
"delimiter",
2947+
"count",
2948+
"doublequote",
2949+
"escapechar",
2950+
"quotechar",
2951+
"quoting",
2952+
"strict",
2953+
NULL
2954+
};
29462955

29472956
static PyObject *
2948-
split_after_count(PyObject *Py_UNUSED(m), PyObject *args)
2957+
split_after_count(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
29492958
{
29502959
PyObject *string = NULL;
29512960
PyObject *delimiter = NULL;
29522961
int count = 0;
2962+
PyObject *doublequote = NULL;
2963+
PyObject *escapechar = NULL;
2964+
PyObject *quotechar = NULL;
2965+
PyObject *quoting = NULL;
2966+
PyObject *strict = NULL;
29532967

2954-
if (!PyArg_ParseTuple(args,
2955-
"OOi:split_after_count",
2968+
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
2969+
"O|$OiOOOOO:split_after_count",
2970+
split_after_count_kwarg_names,
29562971
&string,
2972+
// kwarg-only
29572973
&delimiter,
2958-
&count)) {
2974+
&count,
2975+
&doublequote,
2976+
&escapechar,
2977+
&quotechar,
2978+
&quoting,
2979+
&strict
2980+
)) {
29592981
return NULL;
29602982
}
29612983

29622984
if (!PyUnicode_Check(string)) {
2963-
PyErr_Format(PyExc_RuntimeError,
2985+
PyErr_Format(PyExc_ValueError,
29642986
"a string is required, not %.200s",
29652987
Py_TYPE(string)->tp_name
29662988
);
29672989
return NULL;
29682990
}
2969-
29702991
if (count <= 0) {
2971-
PyErr_Format(PyExc_RuntimeError,
2992+
PyErr_Format(PyExc_ValueError,
29722993
"count must be greater than zero, not %i",
29732994
count
29742995
);
29752996
return NULL;
29762997
}
29772998

2978-
Py_UCS4 delim_char;
2999+
AK_Dialect dialect;
3000+
29793001
if (AK_set_char(
29803002
"delimiter",
2981-
&delim_char,
3003+
&dialect.delimiter,
29823004
delimiter,
2983-
'\0')) return NULL;
3005+
',')) return NULL;
3006+
3007+
if (AK_set_bool(
3008+
"doublequote",
3009+
&dialect.doublequote,
3010+
doublequote,
3011+
true)) return NULL;
3012+
3013+
if (AK_set_char(
3014+
"escapechar",
3015+
&dialect.escapechar,
3016+
escapechar,
3017+
0)) return NULL;
3018+
3019+
if (AK_set_char(
3020+
"quotechar",
3021+
&dialect.quotechar,
3022+
quotechar,
3023+
'"')) return NULL;
3024+
3025+
if (AK_set_int(
3026+
"quoting",
3027+
&dialect.quoting,
3028+
quoting,
3029+
QUOTE_MINIMAL)) return NULL;
3030+
3031+
if (AK_set_bool(
3032+
"strict",
3033+
&dialect.strict,
3034+
strict,
3035+
false)) return NULL;
29843036

29853037
unsigned int kind = PyUnicode_KIND(string);
29863038
const void *data = PyUnicode_DATA(string);
29873039
Py_ssize_t pos = 0;
29883040
Py_ssize_t delim_count = 0;
29893041
Py_ssize_t linelen = PyUnicode_GET_LENGTH(string);
29903042
Py_UCS4 c;
3043+
AK_DelimitedReaderState state = START_RECORD;
29913044

29923045
while (pos < linelen) {
29933046
c = PyUnicode_READ(kind, data, pos);
2994-
if (c == delim_char) {
2995-
delim_count++;
2996-
if (delim_count == count) {
2997-
break; // to not include delim at transition
2998-
// do not increment pos so as to exclude in left
3047+
3048+
switch (state) {
3049+
case START_RECORD: // start of record
3050+
if (c == '\0') // empty line
3051+
break;
3052+
else if (c == '\n' || c == '\r') {
3053+
state = EAT_CRNL;
3054+
break;
3055+
}
3056+
state = START_FIELD; // normal character
3057+
// fallthru
3058+
case START_FIELD: // expecting field
3059+
if (c == '\n' || c == '\r' || c == '\0') {
3060+
state = (c == '\0' ? START_RECORD : EAT_CRNL);
3061+
}
3062+
else if (c == dialect.quotechar && dialect.quoting != QUOTE_NONE) {
3063+
state = IN_QUOTED_FIELD;
3064+
}
3065+
else if (c == dialect.escapechar) {
3066+
state = ESCAPED_CHAR;
3067+
}
3068+
else if (c == dialect.delimiter) { // end of a field
3069+
delim_count += 1;
3070+
}
3071+
else {
3072+
state = IN_FIELD;
3073+
}
3074+
break;
3075+
case ESCAPED_CHAR:
3076+
if (c == '\n' || c=='\r') {
3077+
state = AFTER_ESCAPED_CRNL;
3078+
break;
3079+
}
3080+
if (c == '\0')
3081+
c = '\n';
3082+
state = IN_FIELD;
3083+
break;
3084+
case AFTER_ESCAPED_CRNL:
3085+
if (c == '\0') break;
3086+
// fallthru
3087+
case IN_FIELD: // in unquoted field
3088+
if (c == '\n' || c == '\r' || c == '\0') { // end of line
3089+
state = (c == '\0' ? START_RECORD : EAT_CRNL);
3090+
}
3091+
else if (c == dialect.escapechar) {
3092+
state = ESCAPED_CHAR;
3093+
}
3094+
else if (c == dialect.delimiter) {
3095+
delim_count += 1;
3096+
state = START_FIELD;
3097+
}
3098+
break;
3099+
case IN_QUOTED_FIELD: // in quoted field
3100+
if (c == '\0');
3101+
else if (c == dialect.escapechar) {
3102+
state = ESCAPE_IN_QUOTED_FIELD;
3103+
}
3104+
else if (c == dialect.quotechar && dialect.quoting != QUOTE_NONE) {
3105+
state = (dialect.doublequote ? QUOTE_IN_QUOTED_FIELD : IN_FIELD);
29993106
}
3107+
break;
3108+
case ESCAPE_IN_QUOTED_FIELD:
3109+
if (c == '\0') {
3110+
c = '\n';
3111+
}
3112+
state = IN_QUOTED_FIELD;
3113+
break;
3114+
case QUOTE_IN_QUOTED_FIELD:
3115+
// doublequote - seen a quote in a quoted field
3116+
if (dialect.quoting != QUOTE_NONE && c == dialect.quotechar) {
3117+
state = IN_QUOTED_FIELD;
3118+
}
3119+
else if (c == dialect.delimiter) {
3120+
delim_count += 1;
3121+
state = START_FIELD;
3122+
}
3123+
else if (c == '\n' || c == '\r' || c == '\0') {
3124+
state = (c == '\0' ? START_RECORD : EAT_CRNL);
3125+
}
3126+
else if (!dialect.strict) {
3127+
state = IN_FIELD;
3128+
}
3129+
else { // illegal
3130+
PyErr_Format(PyExc_RuntimeError, "'%c' expected after '%c'",
3131+
dialect.delimiter, dialect.quotechar);
3132+
return NULL;
3133+
}
3134+
break;
3135+
case EAT_CRNL:
3136+
if (c == '\n' || c == '\r');
3137+
else if (c == '\0')
3138+
state = START_RECORD;
3139+
else {
3140+
PyErr_Format(PyExc_RuntimeError,
3141+
"new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
3142+
return NULL;
3143+
}
3144+
break;
30003145
}
3146+
if (delim_count == count) {
3147+
break; // to not include delim at transition
3148+
}
3149+
// NOTE: must break before the increment when finding match
30013150
pos++;
30023151
}
30033152

@@ -3010,7 +3159,7 @@ split_after_count(PyObject *Py_UNUSED(m), PyObject *args)
30103159
}
30113160

30123161

3013-
3162+
// A fast counter of unsized iterators
30143163
static PyObject *
30153164
count_iteration(PyObject *Py_UNUSED(m), PyObject *iterable)
30163165
{
@@ -3883,7 +4032,10 @@ static PyMethodDef arraykit_methods[] = {
38834032
(PyCFunction)iterable_str_to_array_1d,
38844033
METH_VARARGS | METH_KEYWORDS,
38854034
NULL},
3886-
{"split_after_count", split_after_count, METH_VARARGS, NULL},
4035+
{"split_after_count",
4036+
(PyCFunction)split_after_count,
4037+
METH_VARARGS | METH_KEYWORDS,
4038+
NULL},
38874039
{"count_iteration", count_iteration, METH_O, NULL},
38884040
{"isna_element", isna_element, METH_O, NULL},
38894041
{"dtype_from_element", dtype_from_element, METH_O, NULL},

0 commit comments

Comments
 (0)