Skip to content

Commit 483ed88

Browse files
authored
Merge pull request #63 from chaburkland/burkland/new_indexers_from_indexer_subset
Add get_new_indexers_and_screen function
2 parents b3624a5 + 5ea3ca5 commit 483ed88

File tree

6 files changed

+508
-14
lines changed

6 files changed

+508
-14
lines changed

performance/__main__.py

Lines changed: 104 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
import collections
1+
from collections import namedtuple
22
import datetime
33
import timeit
44
import argparse
5+
import typing as tp
6+
from enum import Enum
57

68
import numpy as np
79

@@ -17,6 +19,8 @@
1719
from performance.reference.util import dtype_from_element as dtype_from_element_ref
1820
from performance.reference.util import array_deepcopy as array_deepcopy_ref
1921
from performance.reference.util import isna_element as isna_element_ref
22+
from performance.reference.util import get_new_indexers_and_screen_ak
23+
from performance.reference.util import get_new_indexers_and_screen_ref
2024

2125
from performance.reference.array_go import ArrayGO as ArrayGOREF
2226

@@ -258,7 +262,7 @@ class DtypeFromElementPerf(Perf):
258262
NUMBER = 1000
259263

260264
def __init__(self):
261-
NT = collections.namedtuple('NT', tuple('abc'))
265+
NT = namedtuple('NT', tuple('abc'))
262266

263267
self.values = [
264268
np.longlong(-1), np.int_(-1), np.intc(-1), np.short(-1), np.byte(-1),
@@ -359,6 +363,103 @@ class IsNaElementPerfREF(IsNaElementPerf):
359363
entry = staticmethod(isna_element_ref)
360364

361365

366+
#-------------------------------------------------------------------------------
367+
class GetNewIndexersAndScreenPerf(Perf):
    """Benchmark scenarios for a ``get_new_indexers_and_screen`` implementation.

    Subclasses provide ``entry``, the callable under test. Each scenario
    method selects a subset of the prepared cases and calls
    ``entry(indexers=..., positions=...)`` once per selected case.
    """

    # Each name matches a scenario method defined on this class.
    FUNCTIONS = (
        "ordered",
        "unordered",
        "tiled",
        "repeat",
        "quick_exit",
        "late_exit",
        "small",
        "large",
    )
    NUMBER = 5

    # Labels stored in Key.type1 (TILED / REPEATED) and Key.type2
    # (ORDERED / UNORDERED).
    TILED = "tiled"
    REPEATED = "repeated"
    ORDERED = "ordered"
    UNORDERED = "unordered"

    class Key(tp.NamedTuple):
        # Identifies one prepared case; the scenario methods filter on these.
        type1: str
        type2: str
        increment: int
        scale: int

    def __init__(self):
        """Build every (indexers, positions) case, keyed by ``Key``."""
        size = 500_000
        numbers = np.arange(size, dtype=np.int64)
        all_positions = np.arange(size, dtype=np.int64)

        # Fixed seed so the shuffled arrays are the same on every run.
        np.random.seed(0)

        self.cases: tp.Dict[self.Key, tp.Tuple[np.ndarray, np.ndarray]] = {}

        for scale in (5, 50, 500, 5_000, 50_000):
            distinct = numbers[:scale]
            copies = size // scale

            tiled = np.tile(distinct, copies)
            repeated = np.repeat(distinct, copies)
            tiled_shuffled = tiled.copy()
            repeated_shuffled = repeated.copy()
            # Tiled is shuffled before repeated; the order determines which
            # part of the seeded random stream each array consumes.
            np.random.shuffle(tiled_shuffled)
            np.random.shuffle(repeated_shuffled)

            variants = (
                (self.TILED, self.ORDERED, tiled),
                (self.REPEATED, self.ORDERED, repeated),
                (self.TILED, self.UNORDERED, tiled_shuffled),
                (self.REPEATED, self.UNORDERED, repeated_shuffled),
            )

            # The positions prefix grows tenfold per step, starting at
            # ``scale`` and stopping once it would exceed ``size``.
            increment = scale
            while increment <= size:
                positions = all_positions[:increment]
                for layout, ordering, indexers in variants:
                    case_key = self.Key(
                        type1=layout,
                        type2=ordering,
                        increment=increment,
                        scale=scale,
                    )
                    self.cases[case_key] = (indexers, positions)
                increment *= 10

    def evaluate_cases_by_condition(self, condition):
        """Call ``self.entry`` on every case whose key satisfies ``condition``."""
        for case_key, arrays in self.cases.items():
            if not condition(case_key):
                continue
            indexers, positions = arrays
            self.entry(indexers=indexers, positions=positions)

    def ordered(self):
        """Run the cases whose indexers were left unshuffled."""
        self.evaluate_cases_by_condition(lambda k: k.type2 == self.ORDERED)

    def unordered(self):
        """Run the cases whose indexers were shuffled."""
        self.evaluate_cases_by_condition(lambda k: k.type2 == self.UNORDERED)

    def tiled(self):
        """Run the cases built with ``np.tile``."""
        self.evaluate_cases_by_condition(lambda k: k.type1 == self.TILED)

    def repeat(self):
        """Run the cases built with ``np.repeat``."""
        self.evaluate_cases_by_condition(lambda k: k.type1 == self.REPEATED)

    def quick_exit(self):
        """Run the cases where ``increment`` equals ``scale``."""
        self.evaluate_cases_by_condition(lambda k: k.increment == k.scale)

    def late_exit(self):
        """Run the cases where ``increment`` is greater than ``scale``."""
        self.evaluate_cases_by_condition(lambda k: k.increment > k.scale)

    def small(self):
        """Run the cases with ``scale`` of at most 500."""
        self.evaluate_cases_by_condition(lambda k: k.scale <= 500)

    def large(self):
        """Run the cases with ``scale`` above 500."""
        self.evaluate_cases_by_condition(lambda k: k.scale > 500)
453+
454+
455+
class GetNewIndexersAndScreenPerfAK(GetNewIndexersAndScreenPerf):
    """Runs the shared scenarios against ``get_new_indexers_and_screen_ak``."""

    entry = staticmethod(get_new_indexers_and_screen_ak)
457+
458+
459+
class GetNewIndexersAndScreenPerfREF(GetNewIndexersAndScreenPerf):
    """Runs the shared scenarios against ``get_new_indexers_and_screen_ref``."""

    entry = staticmethod(get_new_indexers_and_screen_ref)
461+
462+
362463
#-------------------------------------------------------------------------------
363464

364465
def get_arg_parser():
@@ -399,7 +500,7 @@ def main():
399500
number=cls_runner.NUMBER)
400501
records.append((cls_perf.__name__, func_attr, results['ak'], results['ref'], results['ref'] / results['ak']))
401502

402-
width = 24
503+
width = 32
403504
for record in records:
404505
print(''.join(
405506
(r.ljust(width) if isinstance(r, str) else str(round(r, 8)).ljust(width)) for r in record

performance/reference/util.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,35 @@ def dtype_from_element(value: tp.Optional[tp.Hashable]) -> np.dtype:
216216
# NOTE: calling array and getting dtype on np.nan is faster than combining isinstance, isnan calls
217217
return np.array(value).dtype
218218

219+
220+
def get_new_indexers_and_screen_ref(
        indexers: np.ndarray,
        positions: np.ndarray,
        ) -> tp.Tuple[np.ndarray, np.ndarray]:
    """Pure-NumPy reference: renumber ``indexers`` against its sorted unique values.

    Args:
        indexers: Array of values to renumber (assumed 1D -- TODO confirm
            against callers).
        positions: Not read by this implementation; it is accepted so the
            signature matches ``get_new_indexers_and_screen_ak``.

    Returns:
        A ``(new_indexers, unique_values)`` pair. ``new_indexers`` is a
        read-only array with the shape of ``indexers`` in which every element
        is replaced by the index of its value within ``unique_values``;
        ``unique_values`` holds the distinct values of ``indexers`` in sorted
        order. Both are empty when ``indexers`` is empty.
    """
    # NOTE(review): the fallback in get_new_indexers_and_screen_ak returns
    # np.unique(..., return_inverse=True), i.e. (unique, inverse) -- the
    # opposite order to this function. Confirm which order callers expect.
    order = indexers.argsort()
    sorted_indexers = indexers[order]

    # mask[i] is True where a new distinct value starts in the sorted array.
    mask = np.empty(sorted_indexers.shape, dtype=DTYPE_BOOL)
    if mask.size:
        # Guarded so that an empty input does not raise IndexError.
        mask[0] = True
    mask[1:] = sorted_indexers[1:] != sorted_indexers[:-1]

    # The running count of distinct values gives each sorted element its new
    # index; scattering through ``order`` restores the original element order.
    new_indexers = np.empty(mask.shape, dtype=DTYPE_INT_DEFAULT)
    new_indexers[order] = np.cumsum(mask) - 1
    new_indexers.flags.writeable = False

    return new_indexers, sorted_indexers[mask]
239+
240+
241+
def get_new_indexers_and_screen_ak(
        indexers: np.ndarray,
        positions: np.ndarray,
        ) -> tp.Tuple[np.ndarray, np.ndarray]:
    """Call arraykit's ``get_new_indexers_and_screen`` with a NumPy fallback.

    When ``positions`` is longer than ``indexers`` the arraykit routine is
    bypassed and ``np.unique(indexers, return_inverse=True)`` is returned
    instead; otherwise the arraykit result is returned unchanged.
    """
    # Imported on every call, before the length check, as in the original.
    from arraykit import get_new_indexers_and_screen as ak_routine

    if len(positions) <= len(indexers):
        return ak_routine(indexers, positions)

    return np.unique(indexers, return_inverse=True)

src/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@
1616
from ._arraykit import resolve_dtype_iter as resolve_dtype_iter
1717
from ._arraykit import isna_element as isna_element
1818
from ._arraykit import dtype_from_element as dtype_from_element
19+
from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen

src/__init__.pyi

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ __version__: str
88

99
class ArrayGO:
1010

11-
values: np.array
11+
values: np.ndarray
1212
def __init__(
1313
self, iterable: tp.Iterable[object], *, own_iterable: bool = ...
1414
) -> None: ...
@@ -20,16 +20,16 @@ class ArrayGO:
2020
def copy(self: _T) -> _T: ...
2121
def extend(self, __values: tp.Iterable[object]) -> None: ...
2222

23-
def immutable_filter(__array: np.array) -> np.array: ...
24-
def mloc(__array: np.array) -> int: ...
23+
def immutable_filter(__array: np.ndarray) -> np.ndarray: ...
24+
def mloc(__array: np.ndarray) -> int: ...
2525
def name_filter(__name: tp.Hashable) -> tp.Hashable: ...
26-
def shape_filter(__array: np.array) -> np.ndarray: ...
27-
def column_2d_filter(__array: np.array) -> np.ndarray: ...
28-
def column_1d_filter(__array: np.array) -> np.ndarray: ...
29-
def row_1d_filter(__array: np.array) -> np.ndarray: ...
30-
def array_deepcopy(__array: np.array, memo: tp.Dict[int, tp.Any]) -> np.ndarray: ...
26+
def shape_filter(__array: np.ndarray) -> np.ndarray: ...
27+
def column_2d_filter(__array: np.ndarray) -> np.ndarray: ...
28+
def column_1d_filter(__array: np.ndarray) -> np.ndarray: ...
29+
def row_1d_filter(__array: np.ndarray) -> np.ndarray: ...
30+
def array_deepcopy(__array: np.ndarray, memo: tp.Dict[int, tp.Any]) -> np.ndarray: ...
3131
def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
3232
def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
3333
def isna_element(__value: tp.Any) -> bool: ...
3434
def dtype_from_element(__value: tp.Optional[tp.Hashable]) -> np.dtype: ...
35-
35+
def get_new_indexers_and_screen(__indexers: np.ndarray, __positions: np.ndarray) -> tp.Tuple[np.ndarray, np.ndarray]: ...

0 commit comments

Comments
 (0)