@@ -576,30 +576,23 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw
576
576
577
577
num_unique = len(positions)
578
578
element_locations = np.full(num_unique, num_unique, dtype=np.int64)
579
+ order_found = np.full(num_unique, num_unique, dtype=np.int64)
579
580
new_indexers = np.empty(len(indexers), dtype=np.int64)
580
581
581
582
num_found = 0
582
583
583
584
for i, element in enumerate(indexers):
584
585
if element_locations[element] == num_unique:
585
586
element_locations[element] = num_found
587
+ order_found[num_found] = element
586
588
num_found += 1
587
589
588
590
if num_found == num_unique:
589
591
return positions, indexers
590
592
591
593
new_indexers[i] = element_locations[element]
592
594
593
- return element_locations, new_indexers
594
- # ...
595
- # NOTE: These return values will be used in a Python wrapper like this:
596
- found_mask = element_locations != num_unique
597
-
598
- found_element_locations = element_locations[found_mask]
599
- order_found = np.argsort(found_element_locations)
600
-
601
- found_positions = positions[found_mask]
602
- return found_positions[order_found], new_indexers
595
+ return order_found[:num_found], new_indexers
603
596
*/
604
597
PyArrayObject * indexers ;
605
598
PyArrayObject * positions ;
@@ -644,46 +637,61 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw
644
637
}
645
638
646
639
npy_intp dims = {num_unique };
647
- PyArrayObject * element_locations = (PyArrayObject * )PyArray_Empty (
648
- 1 , // ndim
649
- & dims , // shape
650
- PyArray_DescrFromType ( NPY_INT64 ), // dtype
651
- 0 // fortran
640
+ PyArrayObject * element_locations = (PyArrayObject * )PyArray_EMPTY (
641
+ 1 , // ndim
642
+ & dims , // shape
643
+ NPY_INT64 , // dtype
644
+ 0 // fortran
652
645
);
653
646
if (element_locations == NULL ) {
654
647
return NULL ;
655
648
}
656
649
657
- PyObject * num_unique_pyint = PyLong_FromLong (num_unique );
658
- if (num_unique_pyint == NULL ) {
650
+ PyArrayObject * order_found = (PyArrayObject * )PyArray_EMPTY (
651
+ 1 , // ndim
652
+ & dims , // shape
653
+ NPY_INT64 , // dtype
654
+ 0 // fortran
655
+ );
656
+ if (order_found == NULL ) {
659
657
Py_DECREF (element_locations );
660
658
return NULL ;
661
659
}
662
660
663
- // We use ``num_unique`` to signal that we haven't found the element yet
661
+ PyObject * num_unique_pyint = PyLong_FromLong (num_unique );
662
+ if (num_unique_pyint == NULL ) {
663
+ goto fail ;
664
+ }
665
+
666
+ // We use ``num_unique`` here to signal that we haven't found the element yet
664
667
// This works, because each element must be 0 < num_unique.
665
668
int fill_success = PyArray_FillWithScalar (element_locations , num_unique_pyint );
669
+ if (fill_success != 0 ) {
670
+ Py_DECREF (num_unique_pyint );
671
+ goto fail ;
672
+ }
673
+
674
+ fill_success = PyArray_FillWithScalar (order_found , num_unique_pyint );
666
675
Py_DECREF (num_unique_pyint );
667
676
if (fill_success != 0 ) {
668
- Py_DECREF (element_locations );
669
- return NULL ;
677
+ goto fail ;
670
678
}
671
679
672
- PyArrayObject * new_indexers = (PyArrayObject * )PyArray_Empty (
673
- 1 , // ndim
674
- PyArray_DIMS (indexers ), // shape
675
- PyArray_DescrFromType ( NPY_INT64 ), // dtype
676
- 0 // fortran
680
+ PyArrayObject * new_indexers = (PyArrayObject * )PyArray_EMPTY (
681
+ 1 , // ndim
682
+ PyArray_DIMS (indexers ), // shape
683
+ NPY_INT64 , // dtype
684
+ 0 // fortran
677
685
);
678
686
if (new_indexers == NULL ) {
679
- Py_DECREF (element_locations );
680
- return NULL ;
687
+ goto fail ;
681
688
}
682
689
683
690
// We know that our incoming dtypes are all int64! This is a safe cast.
684
691
// Plus, it's easier (and less error prone) to work with native C-arrays
685
692
// over using numpy's iteration APIs.
686
693
npy_int64 * element_location_values = (npy_int64 * )PyArray_DATA (element_locations );
694
+ npy_int64 * order_found_values = (npy_int64 * )PyArray_DATA (order_found );
687
695
npy_int64 * new_indexers_values = (npy_int64 * )PyArray_DATA (new_indexers );
688
696
689
697
// Now, implement the core algorithm by looping over the ``indexers``.
@@ -698,25 +706,27 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw
698
706
NULL // dtype
699
707
);
700
708
if (indexer_iter == NULL ) {
701
- Py_DECREF (element_locations );
702
709
Py_DECREF (new_indexers );
703
- return NULL ;
710
+ goto fail ;
704
711
}
705
712
706
713
// The iternext function gets stored in a local variable so it can be called repeatedly in an efficient manner.
707
714
NpyIter_IterNextFunc * indexer_iternext = NpyIter_GetIterNext (indexer_iter , NULL );
708
715
if (indexer_iternext == NULL ) {
709
716
NpyIter_Deallocate (indexer_iter );
710
- Py_DECREF (element_locations );
711
717
Py_DECREF (new_indexers );
712
- return NULL ;
718
+ goto fail ;
713
719
}
714
720
715
721
// All of these will be updated by the iterator
716
722
char * * dataptr = NpyIter_GetDataPtrArray (indexer_iter );
717
723
npy_intp * strideptr = NpyIter_GetInnerStrideArray (indexer_iter );
718
724
npy_intp * innersizeptr = NpyIter_GetInnerLoopSizePtr (indexer_iter );
719
725
726
+ // No gil is required from here on!
727
+ NPY_BEGIN_THREADS_DEF ;
728
+ NPY_BEGIN_THREADS ;
729
+
720
730
size_t i = 0 ;
721
731
npy_int64 num_found = 0 ;
722
732
do {
@@ -731,17 +741,15 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw
731
741
732
742
if (element_location_values [element ] == num_unique ) {
733
743
element_location_values [element ] = num_found ;
744
+ order_found_values [num_found ] = element ;
734
745
++ num_found ;
735
746
736
747
if (num_found == num_unique ) {
737
748
// This insight is core to the performance of the algorithm.
738
749
// If we have found every possible indexer, we can simply return
739
750
// back the inputs! Essentially, we can observe on <= single pass
740
751
// that we have the opportunity for re-use
741
- NpyIter_Deallocate (indexer_iter );
742
- Py_DECREF (element_locations );
743
- Py_DECREF (new_indexers );
744
- return PyTuple_Pack (2 , indexers , positions );
752
+ goto finish_early ;
745
753
}
746
754
}
747
755
@@ -754,12 +762,37 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw
754
762
// Increment the iterator to the next inner loop
755
763
} while (indexer_iternext (indexer_iter ));
756
764
757
- NpyIter_Deallocate ( indexer_iter ) ;
765
+ NPY_END_THREADS ;
758
766
759
- PyObject * result = PyTuple_Pack ( 2 , new_indexers , element_locations );
767
+ NpyIter_Deallocate ( indexer_iter );
760
768
Py_DECREF (element_locations );
769
+
770
+ // new_positions = order_found[:num_unique]
771
+ PyObject * new_positions = PySequence_GetSlice (order_found , 0 , num_found );
772
+ Py_DECREF (order_found );
773
+ if (new_positions == NULL ) {
774
+ return NULL ;
775
+ }
776
+
777
+ // return new_positions, new_indexers
778
+ PyObject * result = PyTuple_Pack (2 , new_positions , new_indexers );
761
779
Py_DECREF (new_indexers );
780
+ Py_DECREF (new_positions );
762
781
return result ;
782
+
783
+ finish_early :
784
+ NPY_END_THREADS ;
785
+
786
+ NpyIter_Deallocate (indexer_iter );
787
+ Py_DECREF (element_locations );
788
+ Py_DECREF (order_found );
789
+ Py_DECREF (new_indexers );
790
+ return PyTuple_Pack (2 , positions , indexers );
791
+
792
+ fail :
793
+ Py_DECREF (element_locations );
794
+ Py_DECREF (order_found );
795
+ return NULL ;
763
796
}
764
797
765
798
//------------------------------------------------------------------------------
0 commit comments