Merge pull request #71 from static-frame/70/dta-improvements

flexatone · web-flow · commit fd6c6664c10e · 2022-10-19T15:11:01.000-07:00
`delimited_to_arrays` improvements
diff --git a/src/_arraykit.c b/src/_arraykit.c
@@ -714,8 +714,10 @@ AK_TP_resolve_field(AK_TypeParser* tp,
         if (tp->count_digit == 0) return TPS_STRING;
         // int
         if (tp->count_j == 0 &&
-                tp->count_e == 0 &&
+                tp->count_sign <= 1 &&
+                tp->last_sign_pos <= 0 &&
                 tp->count_decimal == 0 &&
+                tp->count_e == 0 &&
                 tp->count_paren_close == 0 &&
                 tp->count_paren_open == 0 &&
                 tp->count_nan == 0 &&
@@ -1283,9 +1285,6 @@ AK_CPL_Free(AK_CodePointLine* cpl)
 {
     PyMem_Free(cpl->buffer);
     PyMem_Free(cpl->offsets);
-    // if (cpl->field) {
-    //     PyMem_Free(cpl->field);
-    // }
     if (cpl->type_parser) { // can exclude the check
         PyMem_Free(cpl->type_parser);
     }
@@ -1469,32 +1468,6 @@ AK_CPL_CurrentAdvance(AK_CodePointLine* cpl)
 }
 
 //------------------------------------------------------------------------------
-// Set the CPL field to the characters accumulated in the CPL's buffer. This is only used for field converters that need a char* as an input argument. This has to be dynamically allocated and cleaned up appropriately.
-// static inline char*
-// AK_CPL_current_to_field(AK_CodePointLine* cpl)
-// {
-//     // NOTE: we assume this is only called after offset_max is complete, and that this is only called once per CPL; we set it to the maximum size on first usage and then overwrite context on each subsequent usage.
-//     if (cpl->field == NULL) {
-//         // create a NULL-terminated string; need one more for string terminator
-//         cpl->field = (char*)PyMem_Malloc(sizeof(char) * (cpl->offset_max + 1));
-//         if (cpl->field == NULL) return (char*)PyErr_NoMemory();
-//     }
-//     Py_UCS4 *p = cpl->buffer_current_ptr;
-//     Py_UCS4 *end = p + cpl->offsets[cpl->offsets_current_index];
-
-//     // get pointer to field buffer to write to
-//     char *t = cpl->field;
-//     while (p < end) {
-//         if (AK_is_space(*p)) {
-//             ++p;
-//             continue;
-//         }
-//         *t++ = (char)*p++;
-//     }
-//     *t = '\0'; // must be NULL-terminated string
-//     return cpl->field;
-// }
-
 // This will take any case of "TRUE" as True, while marking everything else as False; this is the same approach taken with genfromtxt when the dtype is given as bool. This will not fail for invalid true or false strings.
 static inline bool
 AK_CPL_current_to_bool(AK_CodePointLine* cpl) {
@@ -2065,7 +2038,7 @@ AK_line_select_keep(
 }
 
 //------------------------------------------------------------------------------
-// CodePointGrid Type, New, Destrctor
+// CodePointGrid Type, New, Destructor
 
 typedef struct AK_CodePointGrid {
     Py_ssize_t lines_count;    // accumulated number of lines
@@ -2464,11 +2437,11 @@ typedef struct AK_DelimitedReader{
     AK_Dialect *dialect;
     AK_DelimitedReaderState state;
     Py_ssize_t field_len;
-    Py_ssize_t record_number;
-    Py_ssize_t record_iter_number;
-    Py_ssize_t field_number;
+    Py_ssize_t record_number; // total records loaded
+    Py_ssize_t record_iter_number; // records iterated (counting excluded records)
+    Py_ssize_t field_number; // field in current record, reset for each record
     int axis;
-    Py_ssize_t *axis_pos;
+    Py_ssize_t *axis_pos; // points to either record_number or field_number
 } AK_DelimitedReader;
 
 // Called once at the close of each field in a line. Returns 0 on success, -1 on failure
@@ -2687,7 +2660,7 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
                 return -1;
             case 0:
                 Py_DECREF(record);
-                return 1; // skip, process more lines
+                return 1; // skip, process more records
         }
         // NOTE: record_number should reflect the processed line count, and exlude any skipped lines. The value is initialized to -1 such the first line is number 0
         ++dr->record_number;
@@ -2721,9 +2694,10 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
 static void
 AK_DR_Free(AK_DelimitedReader *dr)
 {
-    AK_Dialect_Free(dr->dialect);
-    dr->dialect = NULL;
-    Py_CLEAR(dr->input_iter);
+    if (dr->dialect) {
+        AK_Dialect_Free(dr->dialect);
+    }
+    Py_XDECREF(dr->input_iter); // might already be NULL
     PyMem_Free(dr);
 }
 
@@ -2755,6 +2729,7 @@ AK_DR_New(PyObject *iterable,
 
     dr->record_number = -1;
     dr->record_iter_number = -1;
+    dr->dialect = NULL; // init in case input_iter fails to init
 
     dr->input_iter = PyObject_GetIter(iterable); // new ref, decref in free
     if (dr->input_iter == NULL) {
@@ -2770,7 +2745,6 @@ AK_DR_New(PyObject *iterable,
             quoting,
             skipinitialspace,
             strict);
-
     if (dr->dialect == NULL) {
         AK_DR_Free(dr);
         return NULL;
@@ -2870,7 +2844,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
         PyErr_SetString(PyExc_TypeError, "line_select must be a callable or None");
         return NULL;
     }
-    Py_XINCREF(line_select);
 
     if ((axis < 0) || (axis > 1)) {
         PyErr_SetString(PyExc_ValueError, "axis must be 0 or 1");
@@ -2886,7 +2859,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
             skipinitialspace,
             strict);
     if (dr == NULL) { // can happen due to validation of dialect parameters
-        Py_XDECREF(line_select);
         return NULL;
     }
 
@@ -2896,7 +2868,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
             &tsep,
             thousandschar,
             '\0')) {
-        Py_XDECREF(line_select);
         AK_DR_Free(dr);
         return NULL; // default is off (skips evaluation)
     }
@@ -2906,15 +2877,13 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
             &decc,
             decimalchar,
             '.')) {
-        Py_XDECREF(line_select);
         AK_DR_Free(dr);
         return NULL;
     }
 
     // dtypes inc / dec ref bound within CPG life
     AK_CodePointGrid* cpg = AK_CPG_New(dtypes, tsep, decc);
     if (cpg == NULL) { // error will be set
-        Py_XDECREF(line_select);
         AK_DR_Free(dr);
         return NULL;
     }
@@ -2929,7 +2898,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
             break;
         }
         else if (status == -1) {
-            Py_XDECREF(line_select);
             AK_DR_Free(dr);
             AK_CPG_Free(cpg);
             return NULL;
@@ -2938,17 +2906,12 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
     }
     AK_DR_Free(dr);
 
-
     PyObject* arrays = AK_CPG_ToArrayList(cpg, axis, line_select, tsep, decc);
     // NOTE: do not need to check if arrays is NULL as we will return NULL anyway
-
-    Py_XDECREF(line_select);
     AK_CPG_Free(cpg); // will free reference to dtypes
-
     return arrays; // could be NULL
 }
 
-
 static char *iterable_str_to_array_1d_kwarg_names[] = {
     "iterable",
     "dtype",
diff --git a/test/test_delimited_to_arrays.py b/test/test_delimited_to_arrays.py
@@ -156,6 +156,7 @@ def test_iterable_str_to_array_1d_int_13(self) -> None:
         with self.assertRaises(TypeError):
             a1 = iterable_str_to_array_1d(['3.000', '4.000', '1.000'], dtype=int, thousandschar=',')
 
+
     #---------------------------------------------------------------------------
 
     def test_iterable_str_to_array_1d_uint_1(self) -> None:
@@ -703,6 +704,42 @@ def test_delimited_to_arrays_parse_i(self) -> None:
         post2 = delimited_to_arrays(msg, axis=1, skipinitialspace=True)
         self.assertEqual([a.tolist() for a in post2], [['a', 'b'], [10, 20], ['foo', 'c']])
 
+    def test_delimited_to_arrays_parse_j(self) -> None:
+        msg = [
+            '2021,2021-04-01,4',
+            '2022,2022-05-01,3',
+            ]
+        post1 = delimited_to_arrays(msg, axis=1, skipinitialspace=False)
+        self.assertEqual([a.tolist() for a in post1], [[2021, 2022], ['2021-04-01', '2022-05-01'], [4, 3]])
+
+
+    def test_delimited_to_arrays_parse_k(self) -> None:
+        msg = [
+            '2021,2021-04,4',
+            '2022,2022-05,3',
+            ]
+        post1 = delimited_to_arrays(msg, axis=1, skipinitialspace=False)
+        self.assertEqual([a.tolist() for a in post1], [[2021, 2022], ['2021-04', '2022-05'], [4, 3]])
+
+
+    def test_delimited_to_arrays_parse_l(self) -> None:
+        msg = [
+            '1,2,3',
+            '2-,2-0,-3',
+            ]
+        post1 = delimited_to_arrays(msg, axis=1, skipinitialspace=False)
+        self.assertEqual([a.tolist() for a in post1], [['1', '2-'], ['2', '2-0'], [3, -3]])
+
+    def test_delimited_to_arrays_parse_m(self) -> None:
+        msg = [
+            '  1,   2,3',
+            ' 2-, 2-0, -3',
+            ]
+        post1 = delimited_to_arrays(msg, axis=1, skipinitialspace=False)
+        self.assertEqual([a.tolist() for a in post1], [['  1', ' 2-'], ['   2', ' 2-0'], [3, -3]])
+
+
+        # import ipdb; ipdb.set_trace()
 
     #---------------------------------------------------------------------------
     def test_delimited_to_arrays_float_a(self) -> None:
@@ -1008,6 +1045,32 @@ def test_delimited_to_arrays_decimalchar_b(self) -> None:
             [[1000, 2000, 4000], [4.0, 5.055, 6000.155]])
 
 
+    #---------------------------------------------------------------------------
+    def test_delimited_to_arrays_file_like_a(self) -> None:
+        def records():
+            msg = [
+                '1000;4',
+                '2000;5055',
+            ]
+            yield from msg
+
+        with self.assertRaises(TypeError):
+            _ = delimited_to_arrays(records,
+                    axis=1,
+                    delimiter=';',
+                    )
+
+    def test_delimited_to_arrays_file_like_b(self) -> None:
+
+        with self.assertRaises(TypeError):
+            _ = delimited_to_arrays(3,
+                    axis=1,
+                    delimiter=';',
+                    dtypes=lambda x: int,
+                    )
+
+
+
     #---------------------------------------------------------------------------
     def test_delimited_to_arrays_compare_int_a(self) -> None:
         # genfromtxt might translate an empty field to -1 or 0