Skip to content

Commit 7046524

Browse files
authored
Merge pull request #73 from static-frame/72/split-after-count
2 parents fd6c666 + e3d3503 commit 7046524

File tree

6 files changed

+220
-4
lines changed

6 files changed

+220
-4
lines changed

performance/__main__.py

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@
2525
from performance.reference.util import isna_element as isna_element_ref
2626
from performance.reference.util import get_new_indexers_and_screen_ak
2727
from performance.reference.util import get_new_indexers_and_screen_ref
28+
from performance.reference.util import split_after_count as split_after_count_ref
29+
from performance.reference.util import count_iteration as count_iteration_ref
2830

2931
from performance.reference.array_go import ArrayGO as ArrayGOREF
3032

@@ -41,6 +43,8 @@
4143
from arraykit import array_deepcopy as array_deepcopy_ak
4244
from arraykit import delimited_to_arrays as delimited_to_arrays_ak
4345
from arraykit import isna_element as isna_element_ak
46+
from arraykit import split_after_count as split_after_count_ak
47+
from arraykit import count_iteration as count_iteration_ak
4448

4549
from arraykit import ArrayGO as ArrayGOAK
4650

@@ -695,6 +699,43 @@ class GetNewIndexersAndScreenPerfREF(GetNewIndexersAndScreenPerf):
695699
entry = staticmethod(get_new_indexers_and_screen_ref)
696700

697701

702+
703+
704+
#-------------------------------------------------------------------------------
705+
class SplitAfterCount(Perf):
    """Benchmark base: split a long comma-delimited string after 20 delimiters.

    Concrete subclasses supply ``entry``, the three-argument callable
    ``(string, delimiter, count)`` being exercised.
    """
    NUMBER = 200_000

    def __init__(self):
        # the field "abcd" followed by a comma, repeated 1000 times
        self.string = 'abcd,' * 1000

    def main(self):
        # the result is intentionally discarded; only the call matters here
        _ = self.entry(self.string, ',', 20)
713+
714+
class SplitAfterCountAK(SplitAfterCount):
    """SplitAfterCount bound to ``split_after_count_ak`` (imported from arraykit)."""
    entry = staticmethod(split_after_count_ak)
716+
717+
class SplitAfterCountREF(SplitAfterCount):
    """SplitAfterCount bound to ``split_after_count_ref`` (the reference version)."""
    entry = staticmethod(split_after_count_ref)
719+
720+
721+
#-------------------------------------------------------------------------------
722+
class CountIterations(Perf):
    """Benchmark base: count the lines of a 10,000-line in-memory text stream.

    Concrete subclasses supply ``entry``, the one-argument callable being
    exercised.
    """
    NUMBER = 10_000

    def __init__(self):
        lines = ['abcd'] * 10_000
        self.strio = io.StringIO('\n'.join(lines))

    def main(self):
        # the result is intentionally discarded; only the call matters here
        _ = self.entry(self.strio)
        # rewind so the next call iterates the stream from the start again
        self.strio.seek(0)
731+
732+
class CountIterationsAK(CountIterations):
    """CountIterations bound to ``count_iteration_ak`` (imported from arraykit)."""
    entry = staticmethod(count_iteration_ak)
734+
735+
class CountIterationsREF(CountIterations):
    """CountIterations bound to ``count_iteration_ref`` (the reference version)."""
    entry = staticmethod(count_iteration_ref)
737+
738+
698739
#-------------------------------------------------------------------------------
699740

700741
def get_arg_parser():

performance/reference/util.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,18 @@ def get_new_indexers_and_screen_ak(
248248
return np.unique(indexers, return_inverse=True)
249249

250250
return ak_routine(indexers, positions)
251+
252+
253+
def split_after_count(string: str, delimiter: str, count: int) -> tp.Tuple[str, str]:
    """Split ``string`` at the ``count``-th occurrence of ``delimiter``.

    Pure-Python reference for the C implementation of the same name.

    Args:
        string: The text to split.
        delimiter: The separator to count.
        count: The 1-based occurrence of ``delimiter`` at which to split.

    Returns:
        A ``(left, right)`` pair. The delimiter at the split point is in
        neither part. If ``string`` holds fewer than ``count`` delimiters,
        ``left`` is the whole string and ``right`` is empty.

    Raises:
        RuntimeError: If ``count`` is not greater than zero.
    """
    if count <= 0:
        raise RuntimeError(f'count must be greater than zero, not {count}')

    parts = string.split(delimiter, maxsplit=count)
    if len(parts) <= count:
        # the count-th delimiter was never reached: nothing falls to the right
        return string, ''

    *left, right = parts
    # rejoin with the caller's delimiter, not a hard-coded comma
    return delimiter.join(left), right
256+
257+
def count_iteration(iterable: tp.Iterable):
    """Return how many items iterating ``iterable`` yields.

    The iterable is consumed in the process.
    """
    return sum(1 for _ in iterable)
262+
263+
264+
265+

src/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,5 @@
1919
from ._arraykit import delimited_to_arrays as delimited_to_arrays
2020
from ._arraykit import iterable_str_to_array_1d as iterable_str_to_array_1d
2121
from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen
22+
from ._arraykit import split_after_count as split_after_count
23+
from ._arraykit import count_iteration as count_iteration

src/__init__.pyi

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@ def delimited_to_arrays(
4545
decimalchar: str = '.',
4646
) -> tp.List[np.array]: ...
4747

48+
# Split a string at the count-th occurrence of a single-character delimiter,
# returning (left, right) without that delimiter. The C implementation parses
# its arguments positionally only, so all three parameters are declared
# positional-only using this file's double-underscore convention.
def split_after_count(
        __string: str,
        __delimiter: str,
        __count: int,
        ) -> tp.Tuple[str, str]: ...
54+
55+
# Return the number of items yielded by fully iterating the argument.
def count_iteration(__iterable: tp.Iterable) -> int: ...
56+
4857
def immutable_filter(__array: np.ndarray) -> np.ndarray: ...
4958
def mloc(__array: np.ndarray) -> int: ...
5059
def name_filter(__name: tp.Hashable) -> tp.Hashable: ...
@@ -57,4 +66,4 @@ def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
5766
def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
5867
def isna_element(__value: tp.Any) -> bool: ...
5968
def dtype_from_element(__value: tp.Optional[tp.Hashable]) -> np.dtype: ...
60-
def get_new_indexers_and_screen(indexers: np.ndarray, positions: np.ndarray) -> tp.Tuple[np.ndarray, np.ndarray]: ...
69+
def get_new_indexers_and_screen(indexers: np.ndarray, positions: np.ndarray) -> tp.Tuple[np.ndarray, np.ndarray]: ...

src/_arraykit.c

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1441,10 +1441,10 @@ AK_CPL_FromIterable(PyObject* iterable, bool type_parse, Py_UCS4 tsep, Py_UCS4 d
14411441
}
14421442
Py_DECREF(field);
14431443
}
1444+
Py_DECREF(iter);
14441445
if (PyErr_Occurred()) {
14451446
return NULL;
14461447
}
1447-
Py_DECREF(iter);
14481448
return cpl;
14491449
}
14501450

@@ -2955,6 +2955,97 @@ iterable_str_to_array_1d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwarg
29552955
return AK_IterableStrToArray1D(iterable, dtype_specifier, tsep, decc);
29562956
}
29572957

2958+
2959+
// Split `string` at the `count`-th occurrence of the single character
// `delimiter`. Returns a new 2-tuple (left, right); the delimiter at the split
// point is in neither part. If fewer than `count` delimiters are found, left
// is the entire string and right is empty. Sets RuntimeError and returns NULL
// if `string` is not a str or if `count` is not greater than zero; also
// returns NULL if AK_set_char rejects `delimiter`.
static PyObject *
split_after_count(PyObject *Py_UNUSED(m), PyObject *args)
{
    PyObject *string = NULL;
    PyObject *delimiter = NULL;
    int count = 0;

    if (!PyArg_ParseTuple(args,
            "OOi:split_after_count",
            &string,
            &delimiter,
            &count)) {
        return NULL;
    }

    if (!PyUnicode_Check(string)) {
        PyErr_Format(PyExc_RuntimeError,
                "a string is required, not %.200s",
                Py_TYPE(string)->tp_name
                );
        return NULL;
    }

    if (count <= 0) {
        PyErr_Format(PyExc_RuntimeError,
                "count must be greater than zero, not %i",
                count
                );
        return NULL;
    }

    Py_UCS4 delim_char;
    if (AK_set_char(
            "delimiter",
            &delim_char,
            delimiter,
            '\0')) return NULL;

    unsigned int kind = PyUnicode_KIND(string);
    const void *data = PyUnicode_DATA(string);
    Py_ssize_t pos = 0;
    Py_ssize_t delim_count = 0;
    Py_ssize_t linelen = PyUnicode_GET_LENGTH(string);
    Py_UCS4 c;

    // Advance pos to the count-th delimiter, or to linelen if the string
    // holds fewer than count delimiters.
    while (pos < linelen) {
        c = PyUnicode_READ(kind, data, pos);
        if (c == delim_char) {
            delim_count++;
            if (delim_count == count) {
                // leave pos on the delimiter so that left excludes it
                break;
            }
        }
        pos++;
    }

    // Skip the delimiter when one was found; otherwise right is empty.
    Py_ssize_t right_start = (pos < linelen) ? pos + 1 : linelen;

    PyObject *left = PyUnicode_Substring(string, 0, pos);
    if (left == NULL) {
        return NULL;
    }
    PyObject *right = PyUnicode_Substring(string, right_start, linelen);
    if (right == NULL) {
        Py_DECREF(left);
        return NULL;
    }
    // PyTuple_Pack takes its own references; release ours whether or not it
    // succeeded.
    PyObject *result = PyTuple_Pack(2, left, right);
    Py_DECREF(left);
    Py_DECREF(right);
    return result;
}
3023+
3024+
3025+
3026+
// Return, as a Python int, the number of items yielded by iterating
// `iterable`. Returns NULL with an exception set if `iterable` cannot be
// iterated or if iteration raises.
static PyObject *
count_iteration(PyObject *Py_UNUSED(m), PyObject *iterable)
{
    PyObject *iter = PyObject_GetIter(iterable);
    if (iter == NULL) return NULL;

    // Py_ssize_t rather than int: a plain int counter overflows (undefined
    // behaviour) once an iterable yields more than INT_MAX items.
    Py_ssize_t count = 0;
    PyObject *v;

    while ((v = PyIter_Next(iter))) {
        count++;
        Py_DECREF(v);
    }
    Py_DECREF(iter);
    // PyIter_Next returns NULL both on exhaustion and on error; tell them apart
    if (PyErr_Occurred()) {
        return NULL;
    }
    return PyLong_FromSsize_t(count);
}
3047+
3048+
29583049
//------------------------------------------------------------------------------
29593050

29603051
// Return the integer version of the pointer to underlying data-buffer of array.
@@ -3087,7 +3178,7 @@ resolve_dtype(PyObject *Py_UNUSED(m), PyObject *args)
30873178
{
30883179
PyArray_Descr *d1, *d2;
30893180
if (!PyArg_ParseTuple(args, "O!O!:resolve_dtype",
3090-
&PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2))
3181+
&PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2))
30913182
{
30923183
return NULL;
30933184
}
@@ -3817,6 +3908,8 @@ static PyMethodDef arraykit_methods[] = {
38173908
(PyCFunction)iterable_str_to_array_1d,
38183909
METH_VARARGS | METH_KEYWORDS,
38193910
NULL},
3911+
{"split_after_count", split_after_count, METH_VARARGS, NULL},
3912+
{"count_iteration", count_iteration, METH_O, NULL},
38203913
{"isna_element", isna_element, METH_O, NULL},
38213914
{"dtype_from_element", dtype_from_element, METH_O, NULL},
38223915
{"get_new_indexers_and_screen",

test/test_util.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import datetime
44
import unittest
55
import warnings
6-
6+
from io import StringIO
77
import numpy as np # type: ignore
88

99
from arraykit import resolve_dtype
@@ -17,6 +17,9 @@
1717
from arraykit import array_deepcopy
1818
from arraykit import isna_element
1919
from arraykit import dtype_from_element
20+
from arraykit import split_after_count
21+
from arraykit import count_iteration
22+
2023
from performance.reference.util import get_new_indexers_and_screen_ak as get_new_indexers_and_screen_full
2124
from arraykit import get_new_indexers_and_screen
2225

@@ -442,5 +445,58 @@ def test_get_new_indexers_and_screen_b(self) -> None:
442445
assert tuple(map(list, postB)) == (list(indexersB), list(indexersB))
443446

444447

448+
#---------------------------------------------------------------------------
449+
def test_split_after_count_a(self) -> None:
    # splitting at the second comma drops that comma from both parts
    result = split_after_count('a,b,c,d,e', ',', 2)
    self.assertEqual((result[0], result[1]), ('a,b', 'c,d,e'))
453+
454+
def test_split_after_count_b(self) -> None:
    # splitting at the last comma leaves only the final field on the right
    result = split_after_count('a,b,c,d,e', ',', 4)
    self.assertEqual((result[0], result[1]), ('a,b,c,d', 'e'))
458+
459+
def test_split_after_count_c(self) -> None:
    # count is one more than the number of commas: the string stays whole
    result = split_after_count('a,b,c,d,e', ',', 5)
    self.assertEqual((result[0], result[1]), ('a,b,c,d,e', ''))
463+
464+
def test_split_after_count_d(self) -> None:
    # no delimiter present at all: the string stays whole on the left
    result = split_after_count('a', ',', 5)
    self.assertEqual((result[0], result[1]), ('a', ''))
468+
469+
def test_split_after_count_e(self) -> None:
    # a count of zero is rejected
    self.assertRaises(RuntimeError, split_after_count, 'a,', ',', 0)
472+
473+
def test_split_after_count_f(self) -> None:
    # a trailing delimiter at the split point yields an empty right part
    result = split_after_count('a,', ',', 1)
    self.assertEqual((result[0], result[1]), ('a', ''))
477+
478+
def test_split_after_count_g(self) -> None:
    # a string that is only the delimiter yields two empty parts
    result = split_after_count(',', ',', 1)
    self.assertEqual((result[0], result[1]), ('', ''))
482+
483+
def test_split_after_count_h(self) -> None:
    # a delimiter that never occurs leaves the string whole on the left
    result = split_after_count('a,b,c,d,e', '|', 5)
    self.assertEqual((result[0], result[1]), ('a,b,c,d,e', ''))
487+
488+
489+
#---------------------------------------------------------------------------
490+
def test_count_iteration_a(self) -> None:
    # a four-element tuple yields a count of four
    values = ('a', 'b', 'c', 'd')
    self.assertEqual(count_iteration(values), 4)
493+
494+
def test_count_iteration_b(self) -> None:
    # iterating a text stream yields one item per line; five lines here
    stream = StringIO(',1,a,b\n-,1,43,54\nX,2,1,3\nY,1,8,10\n-,2,6,20')
    self.assertEqual(count_iteration(stream), 5)
498+
499+
500+
445501
if __name__ == '__main__':
446502
unittest.main()

0 commit comments

Comments
 (0)