You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
3553
+
if (indices == NULL) { \
3554
+
return NULL; \
3555
+
} \
3556
+
} \
3557
+
indices[count++] = i; \
3558
+
} \
3559
+
3549
3560
// Given a Boolean, contiguous 1D array, return the index positions in an int64 array.
3561
+
// Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing less optimizations does help. Using bit masks does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number.
3550
3562
staticinlinePyObject*
3551
3563
AK_nonzero_1d(PyArrayObject*array) {
3552
3564
// the maximum number of indices we could return is the size of the array; if this is under a certain number, probably better to just allocate that rather than reallocate
// Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing less optimizations does help.
3578
-
// Using bit masks does not improve performance over pointer arithmetic.
3579
-
// Prescanning for all empty is very effective.
3580
3583
3581
-
while (p<p_end_roll) {
3582
-
if (*(npy_uint64*)p==0) {
3583
-
p+=8; // no true within this 8 byte roll region
3584
-
continue;
3584
+
if (PyArray_IS_C_CONTIGUOUS(array)) {
3585
+
npy_bool*p_start= (npy_bool*)PyArray_DATA(array);
3586
+
npy_bool*p=p_start;
3587
+
npy_bool*p_end=p+count_max;
3588
+
npy_bool*p_end_roll=p_end-size_div.rem;
3589
+
3590
+
while (p<p_end_roll) {
3591
+
if (*(npy_uint64*)p==0) {
3592
+
p+=8; // no true within this 8 byte roll region
3593
+
continue;
3594
+
}
3595
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3596
+
p++;
3597
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3598
+
p++;
3599
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3600
+
p++;
3601
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3602
+
p++;
3603
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3604
+
p++;
3605
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3606
+
p++;
3607
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3608
+
p++;
3609
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3610
+
p++;
3611
+
}
3612
+
while (p<p_end) {
3613
+
if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3614
+
p++;
3585
3615
}
3586
-
if (*p) {NONZERO_APPEND_INDEX;}
3587
-
p++;
3588
-
if (*p) {NONZERO_APPEND_INDEX;}
3589
-
p++;
3590
-
if (*p) {NONZERO_APPEND_INDEX;}
3591
-
p++;
3592
-
if (*p) {NONZERO_APPEND_INDEX;}
3593
-
p++;
3594
-
if (*p) {NONZERO_APPEND_INDEX;}
3595
-
p++;
3596
-
if (*p) {NONZERO_APPEND_INDEX;}
3597
-
p++;
3598
-
if (*p) {NONZERO_APPEND_INDEX;}
3599
-
p++;
3600
-
if (*p) {NONZERO_APPEND_INDEX;}
3601
-
p++;
3602
3616
}
3603
-
while (p<p_end) {
3604
-
if (*p) {NONZERO_APPEND_INDEX;}
3605
-
p++;
3617
+
else {
3618
+
npy_intpi=0; // position within Boolean array
3619
+
npy_intpi_end=count_max;
3620
+
npy_intpi_end_roll=count_max-size_div.rem;
3621
+
while (i<i_end_roll) {
3622
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3623
+
i++;
3624
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3625
+
i++;
3626
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3627
+
i++;
3628
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3629
+
i++;
3630
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3631
+
i++;
3632
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3633
+
i++;
3634
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3635
+
i++;
3636
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3637
+
i++;
3638
+
}
3639
+
while (i<i_end) {
3640
+
if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3641
+
i++;
3642
+
}
3606
3643
}
3607
3644
NPY_END_THREADS;
3608
3645
3609
-
// npy_uint64 roll;
3610
-
// while (p < p_end_roll) {
3611
-
// roll = *(npy_uint64*)p;
3612
-
// if (roll == 0) {
3613
-
// p += 8; // no true within this 8 byte roll region
3614
-
// continue;
3615
-
// }
3616
-
// // this order depends on byte order
3617
-
// if (roll & 0xFF) {NONZERO_APPEND_OFFSET(0);}
3618
-
// if (roll & 0xFF00) {NONZERO_APPEND_OFFSET(1);}
3619
-
// if (roll & 0xFF0000) {NONZERO_APPEND_OFFSET(2);}
3620
-
// if (roll & 0xFF000000) {NONZERO_APPEND_OFFSET(3);}
3621
-
// if (roll & 0xFF00000000) {NONZERO_APPEND_OFFSET(4);}
3622
-
// if (roll & 0xFF0000000000) {NONZERO_APPEND_OFFSET(5);}
3623
-
// if (roll & 0xFF000000000000) {NONZERO_APPEND_OFFSET(6);}
3624
-
// if (roll & 0xFF00000000000000) {NONZERO_APPEND_OFFSET(7);}
3625
-
// p += 8;
3626
-
// }
3627
-
// while (p < p_end) {
3628
-
// if (*p) {NONZERO_APPEND_OFFSET(0);}
3629
-
// p++;
3630
-
// }
3631
-
3632
-
3633
-
// while (p < p_end_roll) {
3634
-
// if (*(npy_uint64*)p == 0) {
3635
-
// p += 8; // no true within this roll region
3636
-
// continue;
3637
-
// }
3638
-
// if (AK_UNLIKELY(count + 8 >= capacity)) {
3639
-
// capacity <<= 1;
3640
-
// indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);
3641
-
// if (indices == NULL) {
3642
-
// return NULL;
3643
-
// }
3644
-
// }
3645
-
// if (*p) {indices[count++] = p - p_start;}
3646
-
// p++;
3647
-
// if (*p) {indices[count++] = p - p_start;}
3648
-
// p++;
3649
-
// if (*p) {indices[count++] = p - p_start;}
3650
-
// p++;
3651
-
// if (*p) {indices[count++] = p - p_start;}
3652
-
// p++;
3653
-
// if (*p) {indices[count++] = p - p_start;}
3654
-
// p++;
3655
-
// if (*p) {indices[count++] = p - p_start;}
3656
-
// p++;
3657
-
// if (*p) {indices[count++] = p - p_start;}
3658
-
// p++;
3659
-
// if (*p) {indices[count++] = p - p_start;}
3660
-
// p++;
3661
-
// }
3662
-
// // at most three more indices remain
3663
-
// if (AK_UNLIKELY(count + 7 >= capacity)) {
3664
-
// capacity <<= 1;
3665
-
// indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);
0 commit comments