Skip to content

Commit 1ad85c0

Browse files
committed
further studies
1 parent f812aa2 commit 1ad85c0

File tree

1 file changed

+30
-48
lines changed

1 file changed

+30
-48
lines changed

src/_arraykit.c

Lines changed: 30 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -3546,7 +3546,7 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
35463546
indices[count++] = p - p_start; \
35473547
} \
35483548

3549-
#define NONZERO_APPEND_OFFSET(offset) { \
3549+
#define NONZERO_APPEND_OFFSET(offset) do { \
35503550
if (AK_UNLIKELY(count == capacity)) { \
35513551
capacity <<= 1; \
35523552
indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
@@ -3555,7 +3555,7 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
35553555
} \
35563556
} \
35573557
indices[count++] = p + offset - p_start; \
3558-
} \
3558+
} while(0) \
35593559

35603560

35613561
// Given a Boolean, contiguous 1D array, return the index positions in an int64 array.
@@ -3574,7 +3574,7 @@ AK_nonzero_1d(PyArrayObject* array) {
35743574
lldiv_t size_div = lldiv((long long)count_max, 8); // quot, rem
35753575

35763576
Py_ssize_t count = 0;
3577-
// the maximum number of collected integers is equal to or less than count_max; for small count_max, we can just set that value; for large c
3577+
// the maximum number of collected integers is equal to or less than count_max; for small count_max, we can just set that value; for large size, we set it to half the size
35783578
Py_ssize_t capacity = count_max < 1024 ? count_max : count_max / 2;
35793579

35803580
npy_int64* indices = (npy_int64*)malloc(sizeof(npy_int64) * capacity);
@@ -3585,52 +3585,10 @@ AK_nonzero_1d(PyArrayObject* array) {
35853585
npy_bool* p_end = p + count_max;
35863586
npy_bool* p_end_roll = p_end - size_div.rem;
35873587

3588-
// while (p < p_end_roll) {
3589-
// if (*(npy_uint64*)p == 0) {
3590-
// p += 8; // no true within this 8 byte roll region
3591-
// continue;
3592-
// }
3593-
// if (*p) {indices[count++] = p - p_start;}
3594-
// p++;
3595-
// if (*p) {indices[count++] = p - p_start;}
3596-
// p++;
3597-
// if (*p) {indices[count++] = p - p_start;}
3598-
// p++;
3599-
// if (*p) {indices[count++] = p - p_start;}
3600-
// p++;
3601-
// if (*p) {indices[count++] = p - p_start;}
3602-
// p++;
3603-
// if (*p) {indices[count++] = p - p_start;}
3604-
// p++;
3605-
// if (*p) {indices[count++] = p - p_start;}
3606-
// p++;
3607-
// if (*p) {indices[count++] = p - p_start;}
3608-
// p++;
3609-
// }
3610-
// // at most seven more elements remain (the remainder of count_max / 8)
3611-
// while (p < p_end) {
3612-
// if (*p) {indices[count++] = p - p_start;}
3613-
// p++;
3614-
// }
36153588

3616-
// while (p < p_end_roll) {
3617-
// npy_uint64 roll = *(npy_uint64*)p;
3618-
// if (roll == 0) {
3619-
// p += 8; // no true within this 8 byte roll region
3620-
// continue;
3621-
// }
3622-
// if (roll >> 56 & 0xFF) {NONZERO_APPEND_OFFSET(0);}
3623-
// if (roll >> 48 & 0xFF) {NONZERO_APPEND_OFFSET(1);}
3624-
// if (roll >> 40 & 0xFF) {NONZERO_APPEND_OFFSET(2);}
3625-
// if (roll >> 32 & 0xFF) {NONZERO_APPEND_OFFSET(3);}
3626-
// if (roll >> 24 & 0xFF) {NONZERO_APPEND_OFFSET(4);}
3627-
// if (roll >> 16 & 0xFF) {NONZERO_APPEND_OFFSET(5);}
3628-
// if (roll >> 8 & 0xFF) {NONZERO_APPEND_OFFSET(6);}
3629-
// if (roll >> 0 & 0xFF) {NONZERO_APPEND_OFFSET(7);}
3630-
// while (p < p_end) {
3631-
// if (*p) {NONZERO_APPEND_OFFSET(0);}
3632-
// p++;
3633-
// }
3589+
// Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing fewer optimizations does help.
3590+
// Doing esoteric things with bit masks does not generally improve performance.
3591+
// Prescanning for all empty is very effective.
36343592

36353593
while (p < p_end_roll) {
36363594
if (*(npy_uint64*)p == 0) {
@@ -3659,6 +3617,30 @@ AK_nonzero_1d(PyArrayObject* array) {
36593617
p++;
36603618
}
36613619

3620+
// npy_uint64 roll;
3621+
// while (p < p_end_roll) {
3622+
// roll = *(npy_uint64*)p;
3623+
// if (roll == 0) {
3624+
// p += 8; // no true within this 8 byte roll region
3625+
// continue;
3626+
// }
3627+
// // this order depends on byte order
3628+
// if (roll & 0xFF) {NONZERO_APPEND_OFFSET(0);}
3629+
// if (roll & 0xFF00) {NONZERO_APPEND_OFFSET(1);}
3630+
// if (roll & 0xFF0000) {NONZERO_APPEND_OFFSET(2);}
3631+
// if (roll & 0xFF000000) {NONZERO_APPEND_OFFSET(3);}
3632+
// if (roll & 0xFF00000000) {NONZERO_APPEND_OFFSET(4);}
3633+
// if (roll & 0xFF0000000000) {NONZERO_APPEND_OFFSET(5);}
3634+
// if (roll & 0xFF000000000000) {NONZERO_APPEND_OFFSET(6);}
3635+
// if (roll & 0xFF00000000000000) {NONZERO_APPEND_OFFSET(7);}
3636+
// p += 8;
3637+
// }
3638+
// while (p < p_end) {
3639+
// if (*p) {NONZERO_APPEND_OFFSET(0);}
3640+
// p++;
3641+
// }
3642+
3643+
36623644
// while (p < p_end_roll) {
36633645
// if (*(npy_uint64*)p == 0) {
36643646
// p += 8; // no true within this roll region

0 commit comments

Comments
 (0)