Skip to content

Commit 3175546

Browse files
authored
Merge pull request #164 from static-frame/162/nonzero-1d-noncontiguous
`nonzero_1d` handling for contiguous values
2 parents 982dc84 + b82f284 commit 3175546

File tree

3 files changed

+88
-106
lines changed

3 files changed

+88
-106
lines changed

doc/articles/nonzero_1d.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __call__(self):
4848

4949

5050
#-------------------------------------------------------------------------------
51-
NUMBER = 400
51+
NUMBER = 100
5252

5353
def seconds_to_display(seconds: float) -> str:
5454
seconds /= NUMBER

src/_arraykit.c

Lines changed: 71 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -3535,7 +3535,7 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
35353535
//------------------------------------------------------------------------------
35363536
// general utility
35373537

3538-
#define NONZERO_APPEND_INDEX { \
3538+
#define NONZERO_APPEND_INDEX_RELATIVE { \
35393539
if (AK_UNLIKELY(count == capacity)) { \
35403540
capacity <<= 1; \
35413541
indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
@@ -3546,7 +3546,19 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
35463546
indices[count++] = p - p_start; \
35473547
} \
35483548

3549+
#define NONZERO_APPEND_INDEX_ABSOLUTE { \
3550+
if (AK_UNLIKELY(count == capacity)) { \
3551+
capacity <<= 1; \
3552+
indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
3553+
if (indices == NULL) { \
3554+
return NULL; \
3555+
} \
3556+
} \
3557+
indices[count++] = i; \
3558+
} \
3559+
35493560
// Given a Boolean, contiguous 1D array, return the index positions in an int64 array.
3561+
// Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing less optimizations does help. Using bit masks does not improve perforamnce over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy befits from first counting the nonzeros, then allocating only enough data for the expexted number.
35503562
static inline PyObject*
35513563
AK_nonzero_1d(PyArrayObject* array) {
35523564
// the maxiumum number of indices we could return is the size of the array; if this is under a certain number, probably better to just allocate that rather than reallocate
@@ -3566,112 +3578,71 @@ AK_nonzero_1d(PyArrayObject* array) {
35663578
Py_ssize_t capacity = count_max < 1024 ? count_max : count_max / 8;
35673579
npy_int64* indices = (npy_int64*)malloc(sizeof(npy_int64) * capacity);
35683580

3569-
// array is contiguous, 1d, boolean
3570-
npy_bool* p_start = (npy_bool*)PyArray_DATA(array);
3571-
npy_bool* p = p_start;
3572-
npy_bool* p_end = p + count_max;
3573-
npy_bool* p_end_roll = p_end - size_div.rem;
3574-
35753581
NPY_BEGIN_THREADS_DEF;
35763582
NPY_BEGIN_THREADS;
3577-
// Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing less optimizations does help.
3578-
// Using bit masks does not improve perforamnce over pointer arithmetic.
3579-
// Prescanning for all empty is very effective.
35803583

3581-
while (p < p_end_roll) {
3582-
if (*(npy_uint64*)p == 0) {
3583-
p += 8; // no true within this 8 byte roll region
3584-
continue;
3584+
if (PyArray_IS_C_CONTIGUOUS(array)) {
3585+
npy_bool* p_start = (npy_bool*)PyArray_DATA(array);
3586+
npy_bool* p = p_start;
3587+
npy_bool* p_end = p + count_max;
3588+
npy_bool* p_end_roll = p_end - size_div.rem;
3589+
3590+
while (p < p_end_roll) {
3591+
if (*(npy_uint64*)p == 0) {
3592+
p += 8; // no true within this 8 byte roll region
3593+
continue;
3594+
}
3595+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3596+
p++;
3597+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3598+
p++;
3599+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3600+
p++;
3601+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3602+
p++;
3603+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3604+
p++;
3605+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3606+
p++;
3607+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3608+
p++;
3609+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3610+
p++;
3611+
}
3612+
while (p < p_end) {
3613+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3614+
p++;
35853615
}
3586-
if (*p) {NONZERO_APPEND_INDEX;}
3587-
p++;
3588-
if (*p) {NONZERO_APPEND_INDEX;}
3589-
p++;
3590-
if (*p) {NONZERO_APPEND_INDEX;}
3591-
p++;
3592-
if (*p) {NONZERO_APPEND_INDEX;}
3593-
p++;
3594-
if (*p) {NONZERO_APPEND_INDEX;}
3595-
p++;
3596-
if (*p) {NONZERO_APPEND_INDEX;}
3597-
p++;
3598-
if (*p) {NONZERO_APPEND_INDEX;}
3599-
p++;
3600-
if (*p) {NONZERO_APPEND_INDEX;}
3601-
p++;
36023616
}
3603-
while (p < p_end) {
3604-
if (*p) {NONZERO_APPEND_INDEX;}
3605-
p++;
3617+
else {
3618+
npy_intp i = 0; // position within Boolean array
3619+
npy_intp i_end = count_max;
3620+
npy_intp i_end_roll = count_max - size_div.rem;
3621+
while (i < i_end_roll) {
3622+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3623+
i++;
3624+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3625+
i++;
3626+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3627+
i++;
3628+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3629+
i++;
3630+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3631+
i++;
3632+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3633+
i++;
3634+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3635+
i++;
3636+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3637+
i++;
3638+
}
3639+
while (i < i_end) {
3640+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3641+
i++;
3642+
}
36063643
}
36073644
NPY_END_THREADS;
36083645

3609-
// npy_uint64 roll;
3610-
// while (p < p_end_roll) {
3611-
// roll = *(npy_uint64*)p;
3612-
// if (roll == 0) {
3613-
// p += 8; // no true within this 8 byte roll region
3614-
// continue;
3615-
// }
3616-
// // this order depends on byte order
3617-
// if (roll & 0xFF) {NONZERO_APPEND_OFFSET(0);}
3618-
// if (roll & 0xFF00) {NONZERO_APPEND_OFFSET(1);}
3619-
// if (roll & 0xFF0000) {NONZERO_APPEND_OFFSET(2);}
3620-
// if (roll & 0xFF000000) {NONZERO_APPEND_OFFSET(3);}
3621-
// if (roll & 0xFF00000000) {NONZERO_APPEND_OFFSET(4);}
3622-
// if (roll & 0xFF0000000000) {NONZERO_APPEND_OFFSET(5);}
3623-
// if (roll & 0xFF000000000000) {NONZERO_APPEND_OFFSET(6);}
3624-
// if (roll & 0xFF00000000000000) {NONZERO_APPEND_OFFSET(7);}
3625-
// p += 8;
3626-
// }
3627-
// while (p < p_end) {
3628-
// if (*p) {NONZERO_APPEND_OFFSET(0);}
3629-
// p++;
3630-
// }
3631-
3632-
3633-
// while (p < p_end_roll) {
3634-
// if (*(npy_uint64*)p == 0) {
3635-
// p += 8; // no true within this roll region
3636-
// continue;
3637-
// }
3638-
// if (AK_UNLIKELY(count + 8 >= capacity)) {
3639-
// capacity <<= 1;
3640-
// indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);
3641-
// if (indices == NULL) {
3642-
// return NULL;
3643-
// }
3644-
// }
3645-
// if (*p) {indices[count++] = p - p_start;}
3646-
// p++;
3647-
// if (*p) {indices[count++] = p - p_start;}
3648-
// p++;
3649-
// if (*p) {indices[count++] = p - p_start;}
3650-
// p++;
3651-
// if (*p) {indices[count++] = p - p_start;}
3652-
// p++;
3653-
// if (*p) {indices[count++] = p - p_start;}
3654-
// p++;
3655-
// if (*p) {indices[count++] = p - p_start;}
3656-
// p++;
3657-
// if (*p) {indices[count++] = p - p_start;}
3658-
// p++;
3659-
// if (*p) {indices[count++] = p - p_start;}
3660-
// p++;
3661-
// }
3662-
// // at most three more indices remain
3663-
// if (AK_UNLIKELY(count + 7 >= capacity)) {
3664-
// capacity <<= 1;
3665-
// indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);
3666-
// if (indices == NULL) {
3667-
// return NULL;
3668-
// }
3669-
// }
3670-
// while (p < p_end) {
3671-
// if (*p) {indices[count++] = p - p_start;}
3672-
// p++;
3673-
// }
3674-
36753646
npy_intp dims = {count};
36763647
final = PyArray_SimpleNewFromData(1, &dims, NPY_INT64, (void*)indices);
36773648
if (!final) {
@@ -3683,7 +3654,7 @@ AK_nonzero_1d(PyArrayObject* array) {
36833654
PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
36843655
return final;
36853656
}
3686-
#undef NONZERO_APPEND_INDEX
3657+
#undef NONZERO_APPEND_INDEX_RELATIVE
36873658

36883659
static PyObject*
36893660
nonzero_1d(PyObject *Py_UNUSED(m), PyObject *a) {
@@ -3697,10 +3668,6 @@ nonzero_1d(PyObject *Py_UNUSED(m), PyObject *a) {
36973668
PyErr_SetString(PyExc_ValueError, "Array must be of type bool");
36983669
return NULL;
36993670
}
3700-
if (!PyArray_IS_C_CONTIGUOUS(array)) {
3701-
PyErr_SetString(PyExc_ValueError, "Array must be contiguous");
3702-
return NULL;
3703-
}
37043671
return AK_nonzero_1d(array);
37053672
}
37063673

test/test_nonzero_1d.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,19 @@ def test_nonzero_1d_e(self) -> None:
8383
a1[999] = True
8484
self.assertEqual(nonzero_1d(a1).tolist(), [999, 9_999_999])
8585
a1[0] = True
86-
self.assertEqual(nonzero_1d(a1).tolist(), [0, 999, 9_999_999])
86+
self.assertEqual(nonzero_1d(a1).tolist(), [0, 999, 9_999_999])
87+
88+
def test_nonzero_1d_f(self) -> None:
89+
# non-contiguous
90+
a1 = np.arange(40).reshape(10, 4) % 3 == 0
91+
a2 = a1[:, 3]
92+
self.assertEqual(nonzero_1d(a2).tolist(), [0, 3, 6, 9])
93+
94+
a3 = a1[:, 1]
95+
self.assertEqual(nonzero_1d(a3).tolist(), [2, 5, 8])
96+
97+
def test_nonzero_1d_g(self) -> None:
98+
a1 = np.arange(20).reshape(4, 5) % 3 == 0
99+
a2 = a1[:, 4]
100+
# array([False, True, False, False])
101+
self.assertEqual(nonzero_1d(a2).tolist(), [1])

0 commit comments

Comments
 (0)