Skip to content

Commit 6cd2ee6

Browse files
committed
Optimize M2P
1 parent 19531d0 commit 6cd2ee6

File tree

3 files changed

+224
-13
lines changed

3 files changed

+224
-13
lines changed

sumpy/e2p.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,9 @@ class E2PFromSingleBox(E2PBase):
123123
def default_name(self):
124124
return "e2p_from_single_box"
125125

126-
def get_kernel(self, max_ntargets_in_one_box):
126+
def get_kernel(self, max_ntargets_in_one_box, max_work_items):
127127
ncoeffs = len(self.expansion)
128128
loopy_args = self.get_loopy_args()
129-
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))
130129

131130
loopy_knl = lp.make_kernel(
132131
[
@@ -208,11 +207,16 @@ def get_kernel(self, max_ntargets_in_one_box):
208207
return loopy_knl
209208

210209
def get_optimized_kernel(self, max_ntargets_in_one_box):
211-
inner_knl, optimizations = self.get_loopy_evaluator_and_optimizations()
212-
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box)
210+
inner_knl, optimizations = self.get_cached_loopy_knl_and_optimizations()
211+
212+
ncoeffs = len(self.expansion)
213+
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))
214+
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box,
215+
max_work_items=max_work_items)
216+
213217
knl = lp.tag_inames(knl, {"itgt_box": "g.0"})
214-
knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0")
215-
knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0")
218+
knl = lp.split_iname(knl, "itgt_offset", max_work_items, inner_tag="l.0")
219+
knl = lp.split_iname(knl, "icoeff", max_work_items, inner_tag="l.0")
216220
knl = lp.add_inames_to_insn(knl, "dummy",
217221
"id:fetch_init* or id:fetch_center or id:kernel_scaling")
218222
knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling")
@@ -273,10 +277,9 @@ class E2PFromCSR(E2PBase):
273277
def default_name(self):
274278
return "e2p_from_csr"
275279

276-
def get_kernel(self, max_ntargets_in_one_box):
280+
def get_kernel(self, max_ntargets_in_one_box, max_work_items):
277281
ncoeffs = len(self.expansion)
278282
loopy_args = self.get_loopy_args()
279-
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))
280283

281284
loopy_knl = lp.make_kernel(
282285
[
@@ -378,13 +381,17 @@ def get_kernel(self, max_ntargets_in_one_box):
378381
return loopy_knl
379382

380383
def get_optimized_kernel(self, max_ntargets_in_one_box):
381-
_, optimizations = self.get_loopy_evaluator_and_optimizations()
382-
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box)
384+
_, optimizations = self.get_cached_loopy_knl_and_optimizations()
385+
ncoeffs = len(self.expansion)
386+
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))
387+
388+
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box,
389+
max_work_items=max_work_items)
383390
knl = lp.tag_inames(knl, {"itgt_box": "g.0", "dummy": "l.0"})
384391
knl = lp.unprivatize_temporaries_with_inames(knl,
385392
"itgt_offset", "result_temp")
386-
knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0")
387-
knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0")
393+
knl = lp.split_iname(knl, "itgt_offset", max_work_items, inner_tag="l.0")
394+
knl = lp.split_iname(knl, "icoeff", max_work_items, inner_tag="l.0")
388395
knl = lp.privatize_temporaries_with_inames(knl,
389396
"itgt_offset_outer", "result_temp")
390397
knl = lp.duplicate_inames(knl, "itgt_offset_outer", "id:init_result")

sumpy/expansion/loopy.py

Lines changed: 188 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import loopy as lp
2727
import numpy as np
2828
from sumpy.expansion import ExpansionBase
29-
from sumpy.kernel import Kernel
29+
from sumpy.kernel import Kernel, LaplaceKernel
3030
import sumpy.symbolic as sym
3131
from sumpy.assignment_collection import SymbolicAssignmentCollection
3232
from sumpy.tools import gather_loopy_arguments, gather_loopy_source_arguments
@@ -134,6 +134,193 @@ def make_e2p_loopy_kernel(expansion: ExpansionBase, kernels: Sequence[Kernel]) \
134134
return loopy_knl, optimizations
135135

136136

137+
def make_m2p_loopy_kernel_for_volume_taylor(
138+
expansion: ExpansionBase, kernels: Sequence[Kernel]) -> lp.TranslationUnit:
139+
140+
#raise NotImplementedError()
141+
if len(kernels) > 1:
142+
raise NotImplementedError()
143+
144+
kernel = kernels[0]
145+
if not isinstance(kernel, LaplaceKernel):
146+
raise NotImplementedError()
147+
148+
dim = expansion.dim
149+
order = expansion.order
150+
151+
if dim != 3 or order < 2:
152+
raise NotImplementedError()
153+
154+
b = pymbolic.var("b")
155+
ncoeffs = len(expansion.get_coefficient_identifiers())
156+
157+
temp = pymbolic.var("temp")
158+
inv_r = pymbolic.var("inv_r")
159+
inv_r2 = pymbolic.var("inv_r2")
160+
161+
domains = [
162+
"{[idim]: 0<=idim<dim}",
163+
]
164+
insns = [
165+
lp.Assignment(
166+
assignee="b[idim]",
167+
expression="(target[idim]-center[idim])/rscale",
168+
temp_var_type=lp.Optional(None),
169+
),
170+
lp.Assignment(
171+
assignee=inv_r,
172+
expression="rsqrt(b[0]*b[0] + b[1]*b[1] + b[2]* b[2])",
173+
temp_var_type=lp.Optional(None),
174+
),
175+
lp.Assignment(
176+
id="inv_r2",
177+
assignee=inv_r2,
178+
expression=inv_r*inv_r,
179+
temp_var_type=lp.Optional(None),
180+
),
181+
]
182+
183+
init_exprs = {}
184+
init_exprs[(0, 0, 0)] = inv_r
185+
186+
# Order 1 expressions
187+
for i in range(dim):
188+
mi = [0]*dim
189+
mi[i] = 1
190+
init_exprs[tuple(mi)] = -inv_r * b[i]
191+
192+
# Order 2 expressions
193+
for i in range(dim):
194+
for j in range(dim):
195+
mi = [0]*dim
196+
mi[i] = 1
197+
mi[j] = 1
198+
init_exprs[tuple(mi)] = inv_r*inv_r2*inv_r2*3*b[i]*b[j]
199+
200+
# Order 3 expressions
201+
init_exprs[(1, 1, 1)] = -15 * inv_r * inv_r2 * inv_r2 * inv_r2 \
202+
* b[0] * b[1] * b[2]
203+
204+
depends_on = frozenset(["inv_r2"])
205+
for i in range(2):
206+
for j in range(2):
207+
for k in range(2):
208+
insn = lp.Assignment(
209+
id=f"init_{i}_{j}_{k}",
210+
assignee=temp[i, j, k],
211+
expression=init_exprs[(i, j, k)],
212+
depends_on=depends_on,
213+
)
214+
depends_on = frozenset([f"init_{i}_{j}_{k}"])
215+
insns.append(insn)
216+
217+
wrangler = expansion.expansion_terms_wrangler
218+
result = pymbolic.var("result")
219+
coeffs = pymbolic.var("coeffs")
220+
221+
# For x1 == 0, 1
222+
x2 = pymbolic.var("x2")
223+
x1 = pymbolic.var("x1")
224+
x0 = pymbolic.var("x0")
225+
domains += [
226+
f"{{[{x1}]: 0<={x1}<=1}}",
227+
f"{{[{x0}]: 0<={x0}<=1 and {x0}<order-{x1} }}",
228+
f"{{[{x2}]: 0<={x2}<=order-{x0}-{x1} }}",
229+
]
230+
expr = (2*x2 - 1) * b[2] * temp[x0, x1, x2 - 1] + (x2 - 1)*(x2 - 1) \
231+
* temp[x0, x1, x2 - 2]
232+
expr += prim.If(prim.Comparison(x0, ">=", 1),
233+
2*x0*b[0]*temp[x0 - 1, x1, x2], 0)
234+
expr += prim.If(prim.Comparison(x1, ">=", 1),
235+
2*x1*b[1]*temp[x0, x1 - 1, x2], 0)
236+
expr *= -inv_r2
237+
insns += [
238+
lp.Assignment(
239+
id=f"temp_{x0}_{x1}_{x2}",
240+
assignee=temp[x0, x1, x2],
241+
expression=-inv_r2,
242+
depends_on=depends_on,
243+
predicates=frozenset([prim.Comparison(x2, ">=", 2)]),
244+
),
245+
lp.Assignment(
246+
id=f"update_{x0}_{x1}_{x2}",
247+
assignee=result[0],
248+
expression=(result[0] + coeffs[wrangler.get_storage_index([x0, x1, x2])]
249+
* temp[x0, x1, x2]),
250+
depends_on=frozenset([f"temp_{x0}_{x1}_{x2}"])
251+
)
252+
]
253+
depends_on = frozenset([f"update_{x0}_{x1}_{x2}"])
254+
255+
# For x1 >=2
256+
x2 = pymbolic.var("y2")
257+
x1 = pymbolic.var("y1")
258+
x0 = pymbolic.var("y0")
259+
domains += [
260+
f"{{[{x1}]: 2<={x1}<=order}}",
261+
f"{{[{x0}]: 0<={x0}<=1 and {x0}<order-{x1} }}",
262+
f"{{[{x2}]: 0<={x2}<=order-{x0}-{x1} }}",
263+
]
264+
expr = ((2*x1 - 1) * b[1] * temp[x0, (x1 - 1) % 2, x2] + (x1 - 1)*(x1 - 1)
265+
* temp[x0, (x1 - 2) % 2, x2])
266+
expr += prim.If(prim.Comparison(x0, ">=", 1),
267+
2*x0*b[0]*temp[x0 - 1, x1 % 2, x2], 0)
268+
expr += prim.If(prim.Comparison(x2, ">=", 1),
269+
2*x2*b[2]*temp[x0, x1 % 2, x2 - 1], 0)
270+
expr += prim.If(prim.Comparison(x2, ">=", 2),
271+
2*b[2]*temp[x0, x1 % 2, x2 - 1], 0)
272+
expr *= -inv_r2
273+
insns += [
274+
lp.Assignment(
275+
id=f"temp_{x0}_{x1}_{x2}",
276+
assignee=temp[x0, x1 % 2, x2],
277+
expression=-inv_r2,
278+
depends_on=depends_on,
279+
),
280+
lp.Assignment(
281+
id=f"update_{x0}_{x1}_{x2}",
282+
assignee=result[0],
283+
expression=(result[0] + coeffs[wrangler.get_storage_index([x0, x1, x2])]
284+
* temp[x0, x1 % 2, x2]),
285+
depends_on=frozenset([f"temp_{x0}_{x1}_{x2}"])
286+
)
287+
]
288+
depends_on = frozenset([f"update_{x0}_{x1}_{x2}"])
289+
290+
loopy_knl = lp.make_function(domains, insns,
291+
kernel_data=[
292+
lp.GlobalArg("result", shape=(len(kernels),), is_input=True,
293+
is_output=True),
294+
lp.GlobalArg("coeffs",
295+
shape=(ncoeffs,), is_input=True, is_output=False),
296+
lp.GlobalArg("center, target",
297+
shape=(dim,), is_input=True, is_output=False),
298+
lp.ValueArg("rscale", is_input=True),
299+
lp.ValueArg("itgt", is_input=True),
300+
lp.ValueArg("ntargets", is_input=True),
301+
lp.GlobalArg("targets",
302+
shape=(dim, "ntargets"), is_input=True, is_output=False),
303+
lp.TemporaryVariable("temp"),
304+
...],
305+
name="e2p",
306+
lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
307+
fixed_parameters={"dim": dim, "order": order},
308+
)
309+
310+
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
311+
loopy_knl = lp.tag_inames(loopy_knl, "x0*:unr")
312+
loopy_knl = lp.tag_inames(loopy_knl, "x1*:unr")
313+
loopy_knl = lp.tag_inames(loopy_knl, "y0*:unr")
314+
for kernel in kernels:
315+
loopy_knl = kernel.prepare_loopy_kernel(loopy_knl)
316+
317+
loopy_knl = lp.simplify_indices(loopy_knl)
318+
319+
optimizations = []
320+
321+
return loopy_knl, optimizations
322+
323+
137324
def make_p2e_loopy_kernel(
138325
expansion: ExpansionBase, kernels: Sequence[Kernel],
139326
strength_usage: Sequence[int], nstrengths: int) -> lp.TranslationUnit:

sumpy/expansion/multipole.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@
3030
VolumeTaylorExpansionMixin,
3131
LinearPDEConformingVolumeTaylorExpansion)
3232
from sumpy.tools import mi_set_axis, add_to_sac, mi_power, mi_factorial
33+
from sumpy.kernel import Kernel
34+
35+
import loopy as lp
36+
37+
from typing import Sequence
3338

3439
import logging
3540
logger = logging.getLogger(__name__)
@@ -103,6 +108,18 @@ def evaluate(self, kernel, coeffs, bvec, rscale, sac=None):
103108
result = sym.Add(*tuple(result))
104109
return result
105110

111+
def get_loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit:
112+
"""
113+
:returns: a :mod:`loopy` kernel that returns the evaluated
114+
target transforms of the potential given by *kernels*.
115+
"""
116+
from sumpy.expansion.loopy import (make_e2p_loopy_kernel,
117+
make_m2p_loopy_kernel_for_volume_taylor)
118+
try:
119+
return make_m2p_loopy_kernel_for_volume_taylor(self, kernels)
120+
except NotImplementedError:
121+
return make_e2p_loopy_kernel(self, kernels)
122+
106123
def translate_from(self, src_expansion, src_coeff_exprs, src_rscale,
107124
dvec, tgt_rscale, sac=None, _fast_version=True):
108125
if not isinstance(src_expansion, type(self)):

0 commit comments

Comments
 (0)