Skip to content

Commit fd6c666

Browse files
authored
Merge pull request #71 from static-frame/70/dta-improvements
`delimited_to_arrays` improvements
2 parents 5493235 + 747776d commit fd6c666

File tree

2 files changed

+77
-51
lines changed

2 files changed

+77
-51
lines changed

src/_arraykit.c

Lines changed: 14 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -714,8 +714,10 @@ AK_TP_resolve_field(AK_TypeParser* tp,
714714
if (tp->count_digit == 0) return TPS_STRING;
715715
// int
716716
if (tp->count_j == 0 &&
717-
tp->count_e == 0 &&
717+
tp->count_sign <= 1 &&
718+
tp->last_sign_pos <= 0 &&
718719
tp->count_decimal == 0 &&
720+
tp->count_e == 0 &&
719721
tp->count_paren_close == 0 &&
720722
tp->count_paren_open == 0 &&
721723
tp->count_nan == 0 &&
@@ -1283,9 +1285,6 @@ AK_CPL_Free(AK_CodePointLine* cpl)
12831285
{
12841286
PyMem_Free(cpl->buffer);
12851287
PyMem_Free(cpl->offsets);
1286-
// if (cpl->field) {
1287-
// PyMem_Free(cpl->field);
1288-
// }
12891288
if (cpl->type_parser) { // can exclude the check
12901289
PyMem_Free(cpl->type_parser);
12911290
}
@@ -1469,32 +1468,6 @@ AK_CPL_CurrentAdvance(AK_CodePointLine* cpl)
14691468
}
14701469

14711470
//------------------------------------------------------------------------------
1472-
// Set the CPL field to the characters accumulated in the CPL's buffer. This is only used for field converters that need a char* as an input argument. This has to be dynamically allocated and cleaned up appropriately.
1473-
// static inline char*
1474-
// AK_CPL_current_to_field(AK_CodePointLine* cpl)
1475-
// {
1476-
// // NOTE: we assume this is only called after offset_max is complete, and that this is only called once per CPL; we set it to the maximum size on first usage and then overwrite context on each subsequent usage.
1477-
// if (cpl->field == NULL) {
1478-
// // create a NULL-terminated string; need one more for string terminator
1479-
// cpl->field = (char*)PyMem_Malloc(sizeof(char) * (cpl->offset_max + 1));
1480-
// if (cpl->field == NULL) return (char*)PyErr_NoMemory();
1481-
// }
1482-
// Py_UCS4 *p = cpl->buffer_current_ptr;
1483-
// Py_UCS4 *end = p + cpl->offsets[cpl->offsets_current_index];
1484-
1485-
// // get pointer to field buffer to write to
1486-
// char *t = cpl->field;
1487-
// while (p < end) {
1488-
// if (AK_is_space(*p)) {
1489-
// ++p;
1490-
// continue;
1491-
// }
1492-
// *t++ = (char)*p++;
1493-
// }
1494-
// *t = '\0'; // must be NULL-terminated string
1495-
// return cpl->field;
1496-
// }
1497-
14981471
// This will take any case of "TRUE" as True, while marking everything else as False; this is the same approach taken with genfromtxt when the dtype is given as bool. This will not fail for invalid true or false strings.
14991472
static inline bool
15001473
AK_CPL_current_to_bool(AK_CodePointLine* cpl) {
@@ -2065,7 +2038,7 @@ AK_line_select_keep(
20652038
}
20662039

20672040
//------------------------------------------------------------------------------
2068-
// CodePointGrid Type, New, Destrctor
2041+
// CodePointGrid Type, New, Destructor
20692042

20702043
typedef struct AK_CodePointGrid {
20712044
Py_ssize_t lines_count; // accumulated number of lines
@@ -2464,11 +2437,11 @@ typedef struct AK_DelimitedReader{
24642437
AK_Dialect *dialect;
24652438
AK_DelimitedReaderState state;
24662439
Py_ssize_t field_len;
2467-
Py_ssize_t record_number;
2468-
Py_ssize_t record_iter_number;
2469-
Py_ssize_t field_number;
2440+
Py_ssize_t record_number; // total records loaded
2441+
Py_ssize_t record_iter_number; // records iterated (counting exclusion)
2442+
Py_ssize_t field_number; // field in current record, reset for each record
24702443
int axis;
2471-
Py_ssize_t *axis_pos;
2444+
Py_ssize_t *axis_pos; // points to either record_number or field_number
24722445
} AK_DelimitedReader;
24732446

24742447
// Called once at the close of each field in a line. Returns 0 on success, -1 on failure
@@ -2687,7 +2660,7 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
26872660
return -1;
26882661
case 0:
26892662
Py_DECREF(record);
2690-
return 1; // skip, process more lines
2663+
return 1; // skip, process more records
26912664
}
26922665
// NOTE: record_number should reflect the processed line count, and exclude any skipped lines. The value is initialized to -1 such that the first line is number 0
26932666
++dr->record_number;
@@ -2721,9 +2694,10 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
27212694
static void
27222695
AK_DR_Free(AK_DelimitedReader *dr)
27232696
{
2724-
AK_Dialect_Free(dr->dialect);
2725-
dr->dialect = NULL;
2726-
Py_CLEAR(dr->input_iter);
2697+
if (dr->dialect) {
2698+
AK_Dialect_Free(dr->dialect);
2699+
}
2700+
Py_XDECREF(dr->input_iter); // might already be NULL
27272701
PyMem_Free(dr);
27282702
}
27292703

@@ -2755,6 +2729,7 @@ AK_DR_New(PyObject *iterable,
27552729

27562730
dr->record_number = -1;
27572731
dr->record_iter_number = -1;
2732+
dr->dialect = NULL; // init in case input_iter fails to init
27582733

27592734
dr->input_iter = PyObject_GetIter(iterable); // new ref, decref in free
27602735
if (dr->input_iter == NULL) {
@@ -2770,7 +2745,6 @@ AK_DR_New(PyObject *iterable,
27702745
quoting,
27712746
skipinitialspace,
27722747
strict);
2773-
27742748
if (dr->dialect == NULL) {
27752749
AK_DR_Free(dr);
27762750
return NULL;
@@ -2870,7 +2844,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
28702844
PyErr_SetString(PyExc_TypeError, "line_select must be a callable or None");
28712845
return NULL;
28722846
}
2873-
Py_XINCREF(line_select);
28742847

28752848
if ((axis < 0) || (axis > 1)) {
28762849
PyErr_SetString(PyExc_ValueError, "axis must be 0 or 1");
@@ -2886,7 +2859,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
28862859
skipinitialspace,
28872860
strict);
28882861
if (dr == NULL) { // can happen due to validation of dialect parameters
2889-
Py_XDECREF(line_select);
28902862
return NULL;
28912863
}
28922864

@@ -2896,7 +2868,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
28962868
&tsep,
28972869
thousandschar,
28982870
'\0')) {
2899-
Py_XDECREF(line_select);
29002871
AK_DR_Free(dr);
29012872
return NULL; // default is off (skips evaluation)
29022873
}
@@ -2906,15 +2877,13 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
29062877
&decc,
29072878
decimalchar,
29082879
'.')) {
2909-
Py_XDECREF(line_select);
29102880
AK_DR_Free(dr);
29112881
return NULL;
29122882
}
29132883

29142884
// dtypes inc / dec ref bound within CPG life
29152885
AK_CodePointGrid* cpg = AK_CPG_New(dtypes, tsep, decc);
29162886
if (cpg == NULL) { // error will be set
2917-
Py_XDECREF(line_select);
29182887
AK_DR_Free(dr);
29192888
return NULL;
29202889
}
@@ -2929,7 +2898,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
29292898
break;
29302899
}
29312900
else if (status == -1) {
2932-
Py_XDECREF(line_select);
29332901
AK_DR_Free(dr);
29342902
AK_CPG_Free(cpg);
29352903
return NULL;
@@ -2938,17 +2906,12 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
29382906
}
29392907
AK_DR_Free(dr);
29402908

2941-
29422909
PyObject* arrays = AK_CPG_ToArrayList(cpg, axis, line_select, tsep, decc);
29432910
// NOTE: do not need to check if arrays is NULL as we will return NULL anyway
2944-
2945-
Py_XDECREF(line_select);
29462911
AK_CPG_Free(cpg); // will free reference to dtypes
2947-
29482912
return arrays; // could be NULL
29492913
}
29502914

2951-
29522915
static char *iterable_str_to_array_1d_kwarg_names[] = {
29532916
"iterable",
29542917
"dtype",

test/test_delimited_to_arrays.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ def test_iterable_str_to_array_1d_int_13(self) -> None:
156156
with self.assertRaises(TypeError):
157157
a1 = iterable_str_to_array_1d(['3.000', '4.000', '1.000'], dtype=int, thousandschar=',')
158158

159+
159160
#---------------------------------------------------------------------------
160161

161162
def test_iterable_str_to_array_1d_uint_1(self) -> None:
@@ -703,6 +704,42 @@ def test_delimited_to_arrays_parse_i(self) -> None:
703704
post2 = delimited_to_arrays(msg, axis=1, skipinitialspace=True)
704705
self.assertEqual([a.tolist() for a in post2], [['a', 'b'], [10, 20], ['foo', 'c']])
705706

707+
def test_delimited_to_arrays_parse_j(self) -> None:
708+
msg = [
709+
'2021,2021-04-01,4',
710+
'2022,2022-05-01,3',
711+
]
712+
post1 = delimited_to_arrays(msg, axis=1, skipinitialspace=False)
713+
self.assertEqual([a.tolist() for a in post1], [[2021, 2022], ['2021-04-01', '2022-05-01'], [4, 3]])
714+
715+
716+
def test_delimited_to_arrays_parse_k(self) -> None:
717+
msg = [
718+
'2021,2021-04,4',
719+
'2022,2022-05,3',
720+
]
721+
post1 = delimited_to_arrays(msg, axis=1, skipinitialspace=False)
722+
self.assertEqual([a.tolist() for a in post1], [[2021, 2022], ['2021-04', '2022-05'], [4, 3]])
723+
724+
725+
def test_delimited_to_arrays_parse_l(self) -> None:
726+
msg = [
727+
'1,2,3',
728+
'2-,2-0,-3',
729+
]
730+
post1 = delimited_to_arrays(msg, axis=1, skipinitialspace=False)
731+
self.assertEqual([a.tolist() for a in post1], [['1', '2-'], ['2', '2-0'], [3, -3]])
732+
733+
def test_delimited_to_arrays_parse_m(self) -> None:
734+
msg = [
735+
' 1, 2,3',
736+
' 2-, 2-0, -3',
737+
]
738+
post1 = delimited_to_arrays(msg, axis=1, skipinitialspace=False)
739+
self.assertEqual([a.tolist() for a in post1], [[' 1', ' 2-'], [' 2', ' 2-0'], [3, -3]])
740+
741+
742+
# import ipdb; ipdb.set_trace()
706743

707744
#---------------------------------------------------------------------------
708745
def test_delimited_to_arrays_float_a(self) -> None:
@@ -1008,6 +1045,32 @@ def test_delimited_to_arrays_decimalchar_b(self) -> None:
10081045
[[1000, 2000, 4000], [4.0, 5.055, 6000.155]])
10091046

10101047

1048+
#---------------------------------------------------------------------------
1049+
def test_delimited_to_arrays_file_like_a(self) -> None:
1050+
def records():
1051+
msg = [
1052+
'1000;4',
1053+
'2000;5055',
1054+
]
1055+
yield from msg
1056+
1057+
with self.assertRaises(TypeError):
1058+
_ = delimited_to_arrays(records,
1059+
axis=1,
1060+
delimiter=';',
1061+
)
1062+
1063+
def test_delimited_to_arrays_file_like_b(self) -> None:
1064+
1065+
with self.assertRaises(TypeError):
1066+
_ = delimited_to_arrays(3,
1067+
axis=1,
1068+
delimiter=';',
1069+
dtypes=lambda x: int,
1070+
)
1071+
1072+
1073+
10111074
#---------------------------------------------------------------------------
10121075
def test_delimited_to_arrays_compare_int_a(self) -> None:
10131076
# genfromtxt might translate an empty field to -1 or 0

0 commit comments

Comments
 (0)