4
4
5
5
6
6
def align_nd_chunks (
7
- nd_var_chunks : tuple [tuple [int , ...], ...],
7
+ nd_v_chunks : tuple [tuple [int , ...], ...],
8
8
nd_backend_chunks : tuple [tuple [int , ...], ...],
9
9
) -> tuple [tuple [int , ...], ...]:
10
- if len (nd_backend_chunks ) != len (nd_var_chunks ):
10
+ if len (nd_backend_chunks ) != len (nd_v_chunks ):
11
11
raise ValueError (
12
12
"The number of dimensions on the backend and the variable must be the same."
13
13
)
14
14
15
15
nd_aligned_chunks : list [tuple [int , ...]] = []
16
- for backend_chunks , var_chunks in zip (
17
- nd_backend_chunks , nd_var_chunks , strict = True
18
- ):
16
+ for backend_chunks , v_chunks in zip (nd_backend_chunks , nd_v_chunks , strict = True ):
19
17
# Validate that they have the same number of elements
20
- if sum (backend_chunks ) != sum (var_chunks ):
18
+ if sum (backend_chunks ) != sum (v_chunks ):
21
19
raise ValueError (
22
20
"The number of elements in the backend does not "
23
21
"match the number of elements in the variable. "
@@ -42,39 +40,39 @@ def align_nd_chunks(
42
40
nd_aligned_chunks .append (backend_chunks )
43
41
continue
44
42
45
- if len (var_chunks ) == 1 :
46
- nd_aligned_chunks .append (var_chunks )
43
+ if len (v_chunks ) == 1 :
44
+ nd_aligned_chunks .append (v_chunks )
47
45
continue
48
46
49
47
# Size of the chunk on the backend
50
48
fixed_chunk = max (backend_chunks )
51
49
52
50
# The ideal size of the chunks is the maximum of the two; this would avoid
53
51
# that we use more memory than expected
54
- max_chunk = max (fixed_chunk , * var_chunks )
52
+ max_chunk = max (fixed_chunk , * v_chunks )
55
53
56
54
# The algorithm assumes that the chunks on this array are aligned except the last one
57
55
# because it can be considered a partial one
58
56
aligned_chunks : list [int ] = []
59
57
60
58
# For simplicity of the algorithm, let's transform the Array chunks in such a way that
61
59
# we remove the partial chunks. To achieve this, we add artificial data to the borders
62
- t_var_chunks = list (var_chunks )
63
- t_var_chunks [0 ] += fixed_chunk - backend_chunks [0 ]
64
- t_var_chunks [- 1 ] += fixed_chunk - backend_chunks [- 1 ]
60
+ t_v_chunks = list (v_chunks )
61
+ t_v_chunks [0 ] += fixed_chunk - backend_chunks [0 ]
62
+ t_v_chunks [- 1 ] += fixed_chunk - backend_chunks [- 1 ]
65
63
66
64
# The unfilled_size is the amount of space that has not been filled on the last
67
65
# processed chunk; this is equivalent to the amount of data that would need to be
68
66
# added to a partial Zarr chunk to fill it up to the fixed_chunk size
69
67
unfilled_size = 0
70
68
71
- for var_chunk in t_var_chunks :
69
+ for v_chunk in t_v_chunks :
72
70
# Ideally, we should try to preserve the original Dask chunks, but this is only
73
71
# possible if the last processed chunk was aligned (unfilled_size == 0)
74
- ideal_chunk = var_chunk
72
+ ideal_chunk = v_chunk
75
73
if unfilled_size :
76
74
# If that scenario is not possible, the best option is to merge the chunks
77
- ideal_chunk = var_chunk + aligned_chunks [- 1 ]
75
+ ideal_chunk = v_chunk + aligned_chunks [- 1 ]
78
76
79
77
while ideal_chunk :
80
78
if not unfilled_size :
@@ -105,27 +103,27 @@ def align_nd_chunks(
105
103
border_size = fixed_chunk - backend_chunks [::order ][0 ]
106
104
aligned_chunks = aligned_chunks [::order ]
107
105
aligned_chunks [0 ] -= border_size
108
- t_var_chunks = t_var_chunks [::order ]
109
- t_var_chunks [0 ] -= border_size
106
+ t_v_chunks = t_v_chunks [::order ]
107
+ t_v_chunks [0 ] -= border_size
110
108
if (
111
109
len (aligned_chunks ) >= 2
112
110
and aligned_chunks [0 ] + aligned_chunks [1 ] <= max_chunk
113
- and aligned_chunks [0 ] != t_var_chunks [0 ]
111
+ and aligned_chunks [0 ] != t_v_chunks [0 ]
114
112
):
115
113
# The artificial data added to the border can introduce inefficient chunks
116
114
# on the borders, for that reason, we will check if we can merge them or not
117
115
# Example:
118
116
# backend_chunks = [6, 6, 1]
119
- # var_chunks = [6, 7]
120
- # t_var_chunks = [6, 12]
121
- # The ideal output should preserve the same var_chunks , but the previous loop
117
+ # v_chunks = [6, 7]
118
+ # t_v_chunks = [6, 12]
119
+ # The ideal output should preserve the same v_chunks , but the previous loop
122
120
# is going to produce aligned_chunks = [6, 6, 6]
123
121
# And after removing the artificial data, we will end up with aligned_chunks = [6, 6, 1]
124
122
# which is not ideal and can be merged into a single chunk
125
123
aligned_chunks [1 ] += aligned_chunks [0 ]
126
124
aligned_chunks = aligned_chunks [1 :]
127
125
128
- t_var_chunks = t_var_chunks [::order ]
126
+ t_v_chunks = t_v_chunks [::order ]
129
127
aligned_chunks = aligned_chunks [::order ]
130
128
131
129
nd_aligned_chunks .append (tuple (aligned_chunks ))
@@ -144,6 +142,11 @@ def build_grid_chunks(
144
142
region_start = region .start or 0
145
143
# Generate the zarr chunks inside the region of this dim
146
144
chunks_on_region = [chunk_size - (region_start % chunk_size )]
145
+ if chunks_on_region [0 ] >= size :
146
+ # This is useful for the scenarios where the chunk_size is bigger
147
+ # than the variable chunks, which can happen when the user specifies
148
+ # the enc_chunks manually.
149
+ return (size ,)
147
150
chunks_on_region .extend ([chunk_size ] * ((size - chunks_on_region [0 ]) // chunk_size ))
148
151
if (size - chunks_on_region [0 ]) % chunk_size != 0 :
149
152
chunks_on_region .append ((size - chunks_on_region [0 ]) % chunk_size )
@@ -155,45 +158,45 @@ def grid_rechunk(
155
158
enc_chunks : tuple [int , ...],
156
159
region : tuple [slice , ...],
157
160
) -> Variable :
158
- nd_var_chunks = v .chunks
159
- if not nd_var_chunks :
161
+ nd_v_chunks = v .chunks
162
+ if not nd_v_chunks :
160
163
return v
161
164
162
165
nd_grid_chunks = tuple (
163
166
build_grid_chunks (
164
- sum ( var_chunks ) ,
167
+ v_size ,
165
168
region = interval ,
166
169
chunk_size = chunk_size ,
167
170
)
168
- for var_chunks , chunk_size , interval in zip (
169
- nd_var_chunks , enc_chunks , region , strict = True
171
+ for v_size , chunk_size , interval in zip (
172
+ v . shape , enc_chunks , region , strict = True
170
173
)
171
174
)
172
175
173
176
nd_aligned_chunks = align_nd_chunks (
174
- nd_var_chunks = nd_var_chunks ,
177
+ nd_v_chunks = nd_v_chunks ,
175
178
nd_backend_chunks = nd_grid_chunks ,
176
179
)
177
180
v = v .chunk (dict (zip (v .dims , nd_aligned_chunks , strict = True )))
178
181
return v
179
182
180
183
181
184
def validate_grid_chunks_alignment (
182
- nd_var_chunks : tuple [tuple [int , ...], ...] | None ,
185
+ nd_v_chunks : tuple [tuple [int , ...], ...] | None ,
183
186
enc_chunks : tuple [int , ...],
184
187
backend_shape : tuple [int , ...],
185
188
region : tuple [slice , ...],
186
189
allow_partial_chunks : bool ,
187
190
name : str ,
188
191
):
189
- if nd_var_chunks is None :
192
+ if nd_v_chunks is None :
190
193
return
191
194
base_error = (
192
195
"Specified Zarr chunks encoding['chunks']={enc_chunks!r} for "
193
196
"variable named {name!r} would overlap multiple Dask chunks. "
194
- "Check the chunk at position {var_chunk_pos}, which has a size of "
195
- "{var_chunk_size} on dimension {dim_i}. It is unaligned with "
196
- "backend chunks of size {chunk_size} in region {region}. "
197
+ "Please check the Dask chunks at position {v_chunk_pos} and "
198
+ "{v_chunk_pos_next}, on axis {axis}, they are overlapped "
199
+ "on the same Zarr chunk in the region {region}. "
197
200
"Writing this array in parallel with Dask could lead to corrupted data. "
198
201
"To resolve this issue, consider one of the following options: "
199
202
"- Rechunk the array using `chunk()`. "
@@ -202,22 +205,23 @@ def validate_grid_chunks_alignment(
202
205
"- Enable automatic chunks alignment with `align_chunks=True`."
203
206
)
204
207
205
- for dim_i , chunk_size , var_chunks , interval , size in zip (
208
+ for axis , chunk_size , v_chunks , interval , size in zip (
206
209
range (len (enc_chunks )),
207
210
enc_chunks ,
208
- nd_var_chunks ,
211
+ nd_v_chunks ,
209
212
region ,
210
213
backend_shape ,
211
214
strict = True ,
212
215
):
213
- for i , chunk in enumerate (var_chunks [1 :- 1 ]):
216
+ for i , chunk in enumerate (v_chunks [1 :- 1 ]):
214
217
if chunk % chunk_size :
215
218
raise ValueError (
216
219
base_error .format (
217
- var_chunk_pos = i + 1 ,
218
- var_chunk_size = chunk ,
220
+ v_chunk_pos = i + 1 ,
221
+ v_chunk_pos_next = i + 2 ,
222
+ v_chunk_size = chunk ,
223
+ axis = axis ,
219
224
name = name ,
220
- dim_i = dim_i ,
221
225
chunk_size = chunk_size ,
222
226
region = interval ,
223
227
enc_chunks = enc_chunks ,
@@ -226,20 +230,21 @@ def validate_grid_chunks_alignment(
226
230
227
231
interval_start = interval .start or 0
228
232
229
- if len (var_chunks ) > 1 :
233
+ if len (v_chunks ) > 1 :
230
234
# The first border size is the amount of data that needs to be updated on the
231
235
# first chunk taking into account the region slice.
232
236
first_border_size = chunk_size
233
237
if allow_partial_chunks :
234
238
first_border_size = chunk_size - interval_start % chunk_size
235
239
236
- if (var_chunks [0 ] - first_border_size ) % chunk_size :
240
+ if (v_chunks [0 ] - first_border_size ) % chunk_size :
237
241
raise ValueError (
238
242
base_error .format (
239
- var_chunk_pos = 0 ,
240
- var_chunk_size = var_chunks [0 ],
243
+ v_chunk_pos = 0 ,
244
+ v_chunk_pos_next = 0 ,
245
+ v_chunk_size = v_chunks [0 ],
246
+ axis = axis ,
241
247
name = name ,
242
- dim_i = dim_i ,
243
248
chunk_size = chunk_size ,
244
249
region = interval ,
245
250
enc_chunks = enc_chunks ,
@@ -250,10 +255,11 @@ def validate_grid_chunks_alignment(
250
255
region_stop = interval .stop or size
251
256
252
257
error_on_last_chunk = base_error .format (
253
- var_chunk_pos = len (var_chunks ) - 1 ,
254
- var_chunk_size = var_chunks [- 1 ],
258
+ v_chunk_pos = len (v_chunks ) - 1 ,
259
+ v_chunk_pos_next = len (v_chunks ) - 1 ,
260
+ v_chunk_size = v_chunks [- 1 ],
261
+ axis = axis ,
255
262
name = name ,
256
- dim_i = dim_i ,
257
263
chunk_size = chunk_size ,
258
264
region = interval ,
259
265
enc_chunks = enc_chunks ,
@@ -267,7 +273,7 @@ def validate_grid_chunks_alignment(
267
273
# If the region is covering the last chunk then check
268
274
# if the remainder with the default chunk size
269
275
# is equal to the size of the last chunk
270
- if var_chunks [- 1 ] % chunk_size != size % chunk_size :
276
+ if v_chunks [- 1 ] % chunk_size != size % chunk_size :
271
277
raise ValueError (error_on_last_chunk )
272
- elif var_chunks [- 1 ] % chunk_size :
278
+ elif v_chunks [- 1 ] % chunk_size :
273
279
raise ValueError (error_on_last_chunk )
0 commit comments