
Commit b0edba7

Merge pull request #124 from static-frame/123/indices-to-contig
2 parents b1069ea + 3bdf248

File tree

6 files changed: +821, -172 lines changed


doc/articles/block_index.py

Lines changed: 11 additions & 44 deletions
@@ -8,54 +8,22 @@
 from itertools import repeat
 import pickle
 
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
 from arraykit import BlockIndex
 # from arraykit import ErrorInitTypeBlocks
-from arraykit import shape_filter
-from arraykit import resolve_dtype
 
 import arraykit as ak
 
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
 sys.path.append(os.getcwd())
 
+from performance.reference.block_index import from_blocks
+
 
-def from_blocks(
-        raw_blocks: tp.Iterable[np.ndarray],
-        ):
-    index: tp.List[tp.Tuple[int, int]] = [] # columns position to blocks key
-    block_count = 0
-    row_count = None
-    column_count = 0
-    dtype = None
-
-    for block in raw_blocks:
-        if not block.__class__ is np.ndarray:
-            raise ErrorInitTypeBlocks(f'found non array block: {block}')
-        if block.ndim > 2:
-            raise ErrorInitTypeBlocks(f'cannot include array with {block.ndim} dimensions')
-
-        r, c = shape_filter(block)
-
-        if row_count is not None and r != row_count: #type: ignore [unreachable]
-            raise ErrorInitTypeBlocks(f'mismatched row count: {r}: {row_count}')
-        else:
-            row_count = r
-        if c == 0:
-            continue
-
-        if dtype is None:
-            dtype = block.dtype
-        else:
-            dtype = resolve_dtype(dtype, block.dtype)
-
-        for i in range(c):
-            index.append((block_count, i))
-        column_count += c
-        block_count += 1
-    return (row_count, column_count), index
+
+#-------------------------------------------------------------------------------
 
 class ArrayProcessor:
     NAME = ''
@@ -78,6 +46,7 @@ def __init__(self, arrays: tp.Iterable[np.ndarray]):
         self.selector_bool_array = (np.arange(len(self.bi)) % 2) == 0
         self.selector_slice = slice(0, len(self.bi), 2)
 
+
 #-------------------------------------------------------------------------------
 class BlockIndexLoad(ArrayProcessor):
     NAME = 'BlockIndex: load'
@@ -223,13 +192,11 @@ class TupleIndexIterBoolArray(ArrayProcessor):
 
     def __call__(self):
        ti = self.ti
-        _ = [ti[i] for i in self.selector_bool_array if i]
-
-
+        _ = [ti[i] for i, b in enumerate(self.selector_bool_array) if b]
 
 
 #-------------------------------------------------------------------------------
-NUMBER = 50
+NUMBER = 5
 
 def seconds_to_display(seconds: float) -> str:
     seconds /= NUMBER

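Note on the TupleIndexIterBoolArray change above: the old comprehension iterated the Boolean values of the selector themselves, so each True was used as an index (ti[True] is ti[1]); the new form enumerates positions and lets the Boolean act as a mask. A minimal sketch (not part of the commit) with a plain Python list standing in for the tuple index:

    ti = [(0, 0), (0, 1), (1, 0), (1, 1)]
    selector_bool_array = [True, False, True, False]

    # old form: every True indexes ti[1]; this is not a masked selection
    old = [ti[i] for i in selector_bool_array if i]                # [(0, 1), (0, 1)]

    # new form: enumerate yields positions; the Boolean masks those positions
    new = [ti[i] for i, b in enumerate(selector_bool_array) if b]  # [(0, 0), (1, 0)]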
performance/reference/block_index.py

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
+
+from arraykit import shape_filter
+from arraykit import resolve_dtype
+
+import typing as tp
+import numpy as np
+
+#-------------------------------------------------------------------------------
+def from_blocks(
+        raw_blocks: tp.Iterable[np.ndarray],
+        ):
+    '''Simulation of legacy routine within TypeBlocks.
+    '''
+    index: tp.List[tp.Tuple[int, int]] = [] # columns position to blocks key
+    block_count = 0
+    row_count = None
+    column_count = 0
+    dtype = None
+
+    for block in raw_blocks:
+        if not block.__class__ is np.ndarray:
+            raise ErrorInitTypeBlocks(f'found non array block: {block}')
+        if block.ndim > 2:
+            raise ErrorInitTypeBlocks(f'cannot include array with {block.ndim} dimensions')
+
+        r, c = shape_filter(block)
+
+        if row_count is not None and r != row_count: #type: ignore [unreachable]
+            raise ErrorInitTypeBlocks(f'mismatched row count: {r}: {row_count}')
+        else:
+            row_count = r
+        if c == 0:
+            continue
+
+        if dtype is None:
+            dtype = block.dtype
+        else:
+            dtype = resolve_dtype(dtype, block.dtype)
+
+        for i in range(c):
+            index.append((block_count, i))
+        column_count += c
+        block_count += 1
+    return (row_count, column_count), index
+
+#-------------------------------------------------------------------------------
+
+
+def cols_to_slice(indices: tp.Sequence[int]) -> slice:
+    '''Translate an iterable of contiguous integers into a slice.
+    Integers are assumed to be ordered (ascending or descending) and contiguous.
+    '''
+    start_idx = indices[0]
+    # single column as a single slice
+    if len(indices) == 1:
+        return slice(start_idx, start_idx + 1)
+
+    stop_idx = indices[-1]
+    if stop_idx > start_idx: # ascending indices
+        return slice(start_idx, stop_idx + 1)
+
+    if stop_idx == 0:
+        return slice(start_idx, None, -1)
+    # stop is less than start, need to reduce by 1 to cover range
+    return slice(start_idx, stop_idx - 1, -1)
+
+def indices_to_contiguous_pairs(indices: tp.Iterable[tp.Tuple[int, int]]
+        ) -> tp.Iterator[tp.Tuple[int, slice]]:
+    '''Indices are pairs of (block_idx, value); convert these to pairs of (block_idx, slice)
+    when we identify contiguous indices within a block (these are block slices).
+    '''
+    # store pairs of block idx, ascending col list
+    last: tp.Optional[tp.Tuple[int, int]] = None
+
+    for block_idx, col in indices:
+        if not last:
+            last = (block_idx, col)
+            bundle = [col]
+            continue
+        if last[0] == block_idx and abs(col - last[1]) == 1:
+            # if contiguous, update last, add to bundle
+            last = (block_idx, col)
+            # do not need to store all col, only the last,
+            # however probably easier to just accumulate all
+            bundle.append(col)
+            continue
+        # either new block, or not contiguous on same block
+        yield (last[0], cols_to_slice(bundle))
+        # start a new bundle
+        bundle = [col]
+        last = (block_idx, col)
+
+    # last can be None
+    if last and bundle:
+        yield (last[0], cols_to_slice(bundle))
+
+
+class IterContiguous:
+    def __init__(self, indices):
+        self.indices = iter(indices)
+        self.last_block = -1
+        self.last_column = -1
+        self.next_block = -1
+        self.next_column = -1
+
+    @staticmethod
+    def build_slice(start, end_inclusive):
+        # this works, but we retain slices to force 2D selections; we might explore changing this
+        # if start == end_inclusive:
+        #     return start
+
+        if start <= end_inclusive:
+            return slice(start, end_inclusive + 1, None) # can be 1
+        # reverse slice
+        if end_inclusive == 0:
+            return slice(start, None, -1)
+        return slice(start, end_inclusive - 1, -1)
+
+    def getter(self) -> tp.Tuple[int, slice]:
+        slice_start = -1
+        while True:
+            if self.next_block == -2:
+                return None # terminate the loop
+            if self.next_block != -1:
+                # discontinuity found on last iteration, set new start
+                self.last_block = self.next_block
+                self.last_column = self.next_column
+                slice_start = self.last_column
+                self.next_block = -1 # clear next state
+                self.next_column = -1
+
+            try:
+                block, column = next(self.indices)
+            except StopIteration:
+                # no more pairs, but a previously set slice_start has not been emitted;
+                # return that now, and flag for end on next call
+                self.next_block = -2
+                return self.last_block, self.build_slice(slice_start, self.last_column)
+
+            if self.last_block == -1:
+                # initialization
+                self.last_block = block
+                self.last_column = column
+                slice_start = column
+                continue
+
+            if self.last_block == block and abs(column - self.last_column) == 1: # contiguous
+                self.last_column = column
+                continue
+
+            # not contiguous, need to emit a slice for the previous region;
+            # store this block, column as next, so the next call starts a new region
+            self.next_block = block
+            self.next_column = column
+            return self.last_block, self.build_slice(slice_start, self.last_column)
+
+
+    def iter(self) -> tp.Iterator[tp.Tuple[int, slice]]:
+        while True:
+            post = self.getter()
+            if post is not None:
+                yield post
+            else:
+                break
+
+#-------------------------------------------------------------------------------
+
+
+
+if __name__ == '__main__':
+    samples = (
+            [(0, 0), (0, 1), (0, 2), (1, 1), (1, 3), (2, 0), (3, 0), (3, 1), (3, 2)],
+            [(0, 0), (2, 1), (3, 5), (10, 1)],
+            [(0, 0), (2, 1), (2, 2), (2, 5), (2, 6), (10, 1)],
+            [(10, 1)],
+            [(0, 1), (0, 2), (0, 3), (0, 4)],
+            [(0, 0), (2, 3), (2, 2), (2, 1), (2, 6), (10, 1)],
+            [(2, 3), (0, 0), (2, 2), (2, 1), (2, 6), (2, 7)],
+            [(2, 3), (2, 2), (5, 2), (5, 1), (5, 0), (2, 1), (2, 0)],
+
+            )
+    for sample in samples:
+        p1 = list(indices_to_contiguous_pairs(sample))
+        print(sample)
+        print(p1)
+
+        iterc = IterContiguous(sample)
+        p2 = list(iterc.iter())
+        print(p2)

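For orientation, tracing the first sample above through indices_to_contiguous_pairs (a worked example, not output captured from the commit) shows how contiguous columns within a block collapse into slices; IterContiguous yields the same pairs for this input:

    sample = [(0, 0), (0, 1), (0, 2), (1, 1), (1, 3), (2, 0), (3, 0), (3, 1), (3, 2)]
    pairs = list(indices_to_contiguous_pairs(sample))
    # columns 0-2 of block 0 become one slice; the two non-contiguous
    # columns of block 1 become separate single-column slices
    # pairs == [(0, slice(0, 3)), (1, slice(1, 2)), (1, slice(3, 4)),
    #           (2, slice(0, 1)), (3, slice(0, 3))]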
src/__init__.pyi

Lines changed: 6 additions & 0 deletions
@@ -46,6 +46,12 @@ class BlockIndex:
     def iter_select(self,
             __key: tp.Union[slice, np.ndarray, tp.List[int]],
            ) -> tp.Iterator[tp.Tuple[int, int]]: ...
+    def iter_contiguous(self,
+            __key: tp.Union[slice, np.ndarray, tp.List[int]],
+            ascending: bool = False,
+            reduce: bool = False,
+            ) -> tp.Iterator[tp.Tuple[int, int]]: ...
+
 
 def iterable_str_to_array_1d(
         iterable: tp.Iterable[str],

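The stub above only declares the call shape of the new iter_contiguous; the behavior of ascending and reduce lives in the C implementation and is not shown in this diff. The reference module above suggests what contiguous grouping is for: a slice can select several adjacent columns of one block in a single 2D operation. A small illustrative sketch (not from the commit, plain NumPy only):

    import numpy as np

    block = np.arange(12).reshape(3, 4)   # one 3x4 block
    pair = (0, slice(0, 3))               # (block_idx, column slice), as in the reference module
    selected = block[:, pair[1]]          # a single 2D view of three contiguous columns
    assert selected.shape == (3, 3)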