From bbe472116928da13ec7fe205c14369f2ecbabeae Mon Sep 17 00:00:00 2001 From: learning-chip Date: Wed, 4 Mar 2026 10:23:02 +0000 Subject: [PATCH 01/53] update ptoas to https://github.com/huawei-csl/PTOAS/releases/tag/20260304 --- .github/workflows/ci.yml | 2 +- docker/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4bec59b2..b56bf119 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: env: RELEASE_REPO: huawei-csl/PTOAS - RELEASE_TAG: 20260303 + RELEASE_TAG: 20260304 CLI_DIR: /installers/ptoas-cli PTOISA_COMMIT: 672ee54cb8905bb9f9abbe80ec26ed2054b7a0cc diff --git a/docker/README.md b/docker/README.md index 55bd125c..edbce52a 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,7 +1,7 @@ Usage: ```bash -RELEASE_TAG=20260303 +RELEASE_TAG=20260304 sudo docker build \ --build-arg RELEASE_TAG=$RELEASE_TAG \ . -t pto_dsl:$RELEASE_TAG From 03525e5bca8c428d55263bccb386705195ad39e3 Mon Sep 17 00:00:00 2001 From: Mirko De Vita <61700769+MirkoDeVita98@users.noreply.github.com> Date: Wed, 4 Mar 2026 15:49:46 +0100 Subject: [PATCH 02/53] Geglu Dynamic Multicore (#54) * added dynamic multicore geglu example * added bench geglu and readme for validate examples --------- Co-authored-by: mirkodevita --- .../aot/geglu_dynamic_multicore/.gitignore | 3 + .../aot/geglu_dynamic_multicore/README.md | 7 + .../geglu_dynamic_multicore/bench_geglu.py | 128 +++++++++++++ .../aot/geglu_dynamic_multicore/caller.cpp | 26 +++ .../aot/geglu_dynamic_multicore/compile.sh | 22 +++ .../geglu_dynamic_multicore/geglu_builder.py | 172 ++++++++++++++++++ .../aot/geglu_dynamic_multicore/run_geglu.py | 121 ++++++++++++ 7 files changed, 479 insertions(+) create mode 100644 examples/aot/geglu_dynamic_multicore/.gitignore create mode 100644 examples/aot/geglu_dynamic_multicore/README.md create mode 100644 examples/aot/geglu_dynamic_multicore/bench_geglu.py create mode 100644 
def bench_geglu(
    geglu_func, a, b, c, kernel_name="geglu_func", warmup_iters=5, benchmark_iters=50
):
    """Time the custom GEGLU kernel against a PyTorch baseline and print both.

    Each measurement interleaves the operation with a write to a large
    scratch buffer, then subtracts the cost of the scratch writes alone, so
    the reported time approximates the operation with a cold cache.

    Args:
        geglu_func: Callable invoked as ``geglu_func(a, b, c, batch, n_cols)``.
        a: 2-D gating input; its shape supplies ``batch`` and ``n_cols``.
        b: Second input, passed to both the kernel and the baseline.
        c: Output buffer handed to the kernel.
        kernel_name: Label used for the custom kernel in the printed report.
        warmup_iters: Untimed calls issued before each measurement.
        benchmark_iters: Timed calls per measurement; must be at least 1.

    Raises:
        ValueError: If ``benchmark_iters`` is smaller than 1.
    """
    if benchmark_iters < 1:
        # The per-iteration average below divides by this value.
        raise ValueError(f"benchmark_iters must be >= 1, got {benchmark_iters}")

    batch, n_cols = a.shape
    # reads a and b, writes c
    io_bytes = a.numel() * a.element_size() * 3
    # Overwrite a large buffer between launches to reduce L2 cache reuse.
    cache = torch.empty((256 * 1024 * 1024,), dtype=torch.int8, device=a.device)

    def time_op(fn):
        """Return the average per-call time of ``fn`` in milliseconds."""
        for _ in range(warmup_iters):
            fn()
        torch.npu.synchronize()

        mixed_start = torch.npu.Event(enable_timing=True)
        mixed_end = torch.npu.Event(enable_timing=True)
        cache_start = torch.npu.Event(enable_timing=True)
        cache_end = torch.npu.Event(enable_timing=True)

        # Pass 1: scratch write + operation.
        mixed_start.record()
        for _ in range(benchmark_iters):
            cache.zero_()
            fn()
        mixed_end.record()
        torch.npu.synchronize()

        # Pass 2: scratch write only, to be subtracted from pass 1.
        cache_start.record()
        for _ in range(benchmark_iters):
            cache.zero_()
        cache_end.record()
        torch.npu.synchronize()

        mixed_total_ms = mixed_start.elapsed_time(mixed_end)
        cache_total_ms = cache_start.elapsed_time(cache_end)
        # Timer noise can make the difference negative; clamp it to zero.
        kernel_total_ms = max(mixed_total_ms - cache_total_ms, 0.0)
        return kernel_total_ms / benchmark_iters

    def bandwidth_gbs(elapsed_ms):
        """Convert a per-call time into effective GB/s for ``io_bytes``."""
        if elapsed_ms <= 0.0:
            # The clamp above can yield exactly 0.0; report an unbounded
            # bandwidth instead of raising ZeroDivisionError.
            return float("inf")
        return (io_bytes / (elapsed_ms / 1e3)) / 1e9

    custom_ms = time_op(lambda: geglu_func(a, b, c, batch, n_cols))
    torch_ms = time_op(lambda: torch.mul(F.gelu(a, approximate="tanh"), b))

    custom_bw_gbs = bandwidth_gbs(custom_ms)
    torch_bw_gbs = bandwidth_gbs(torch_ms)

    print(
        f"{kernel_name}: {custom_ms:.3f} ms, "
        f"effective bandwidth: {custom_bw_gbs:.3f} GB/s "
        f"(IO={io_bytes / 1e6:.2f} MB)"
    )
    print(
        f"torch gelu*b: {torch_ms:.3f} ms, "
        f"effective bandwidth: {torch_bw_gbs:.3f} GB/s "
        f"(IO={io_bytes / 1e6:.2f} MB)"
    )
device=device, dtype=dtype) + c = torch.empty(batch, n_cols, device=device, dtype=dtype) + + geglu_func(a, b, c, batch, n_cols) + torch.npu.synchronize() + + a_f32 = a.float() + ref = (0.5 * a_f32 * (1.0 + torch.tanh(a_f32))).to(dtype) * b + torch.testing.assert_close(c, ref, rtol=1e-2, atol=1e-2) + + bench_geglu(geglu_func, a, b, c, kernel_name=f"geglu ({lib_path})") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--lib", default="./geglu_lib.so") + parser.add_argument("--block-dim", type=int, default=24) + parser.add_argument("--batch", type=int, default=1024) + parser.add_argument("--n-cols", type=int, default=8192) + args = parser.parse_args() + run_bench(args.lib, block_dim=args.block_dim, batch=args.batch, n_cols=args.n_cols) diff --git a/examples/aot/geglu_dynamic_multicore/caller.cpp b/examples/aot/geglu_dynamic_multicore/caller.cpp new file mode 100644 index 00000000..85351fd4 --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore/caller.cpp @@ -0,0 +1,26 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "geglu.cpp" +#endif +#include KERNEL_CPP + +#ifndef NUM_CORES +#define NUM_CORES 24 +#endif + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *a, + uint8_t *b, + uint8_t *c, + uint32_t batch, + uint32_t n_cols) +{ + uint32_t launch_blocks = blockDim > 0 ? 
blockDim : NUM_CORES; + _kernel<<>>( + reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(c), + static_cast(batch), + static_cast(n_cols)); +} diff --git a/examples/aot/geglu_dynamic_multicore/compile.sh b/examples/aot/geglu_dynamic_multicore/compile.sh new file mode 100755 index 00000000..9da3faa7 --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore/compile.sh @@ -0,0 +1,22 @@ +set -e + +rm -f geglu.pto geglu.cpp geglu_lib.so + +python ./geglu_builder.py > ./geglu.pto +ptoas --enable-insert-sync ./geglu.pto -o ./geglu.cpp + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"geglu.cpp\"" \ + ./caller.cpp \ + -o ./geglu_lib.so diff --git a/examples/aot/geglu_dynamic_multicore/geglu_builder.py b/examples/aot/geglu_dynamic_multicore/geglu_builder.py new file mode 100644 index 00000000..092f05de --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore/geglu_builder.py @@ -0,0 +1,172 @@ +from ptodsl import to_ir_module +import ptodsl.language as pto + +const = pto.const + +# 32 KB of UB / sizeof(fp16) = 16384 elements per tile +ELEMENTS_PER_TILE = 32 * 1024 // 2 + + +def meta_data(): + dtype = pto.float16 + ptr_type = pto.PtrType(dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, ELEMENTS_PER_TILE], + valid_shape=[1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, 
+ "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + } + + +def build_geglu(fn_name="geglu_fp16"): + """ + Build a dynamic-batch GEGLU kernel in PTO DSL. + + Computes c = gelu_approx(a) * b, where: + gelu_approx(a) = 0.5 * a * (1 + tanh(a)) + tanh(a) = (exp(2a) - 1) / (exp(2a) + 1) + + Constants (1.0, 2.0) are derived from the input tile itself using + the identity exp(a - a) = exp(0) = 1.0, which avoids the need for + scalar-tile broadcast operations not available in PTO DSL. + + UB tile budget (fp16, 5 tiles × 32 KB = 160 KB < 192 KB): + tb_a : input row a + tb_b : input row b + tb_ones : constant 1.0 (recomputed each row via exp(a-a)) + tb_tmp1 : intermediate / final output + tb_tmp2 : intermediate + + Kernel args: + a_ptr : fp16[batch * n_cols] -- gating input + b_ptr : fp16[batch * n_cols] -- linear input + c_ptr : fp16[batch * n_cols] -- output + batch : int32 -- number of rows + n_cols : int32 -- elements per row; must be <= 16384 + """ + + @to_ir_module(meta_data=meta_data) + def _kernel( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile = const(ELEMENTS_PER_TILE) + + batch = pto.index_cast(batch_i32) + n_cols = pto.index_cast(n_cols_i32) + + with pto.vector_section(): + # Guard: n_cols must be in (0, ELEMENTS_PER_TILE]. + + with pto.if_context(n_cols > c0): + with pto.if_context(c_tile >= n_cols): + bid = pto.index_cast(pto.get_block_idx()) + num_cores = pto.index_cast(pto.get_block_num()) + + # Distribute rows across cores (row-level parallelism). 
+ rows_per_core = pto.ceil_div(batch, num_cores) + row_start = bid * rows_per_core + row_end = pto.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_a = pto.as_tensor( + tensor_type, ptr=a_ptr, shape=[total_elems], strides=[c1] + ) + tv_b = pto.as_tensor( + tensor_type, ptr=b_ptr, shape=[total_elems], strides=[c1] + ) + tv_c = pto.as_tensor( + tensor_type, ptr=c_ptr, shape=[total_elems], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + # Allocate 5 UB tiles (160 KB total, well under 192 KB UB). + tb_a = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_b = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols) + + for row_i in pto.for_range(c0, num_rows, c1): + gm_offset = (row_start + row_i) * n_cols + + sv_a = pto.slice_view( + subtensor_type, + source=tv_a, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_b = pto.slice_view( + subtensor_type, + source=tv_b, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_c = pto.slice_view( + subtensor_type, + source=tv_c, + offsets=[gm_offset], + sizes=[n_cols], + ) + + pto.load(sv_a, tb_a) + pto.load(sv_b, tb_b) + + # Derive constants from data (no scalar-tile broadcast needed): + # a - a = 0 => exp(0) = 1.0 + pto.sub(tb_a, tb_a, tb_tmp2) # tmp2 = 0.0 + pto.exp(tb_tmp2, tb_ones) # ones = 1.0 + + # tanh(a) = (exp(2a) - 1) / (exp(2a) + 1) + pto.add(tb_a, tb_a, tb_tmp1) # tmp1 = 2a + pto.exp(tb_tmp1, tb_tmp1) # tmp1 = exp(2a) + pto.sub(tb_tmp1, tb_ones, tb_tmp2) # tmp2 = exp(2a) - 1 + pto.add(tb_tmp1, tb_ones, tb_tmp1) # tmp1 = exp(2a) + 1 + pto.div(tb_tmp2, tb_tmp1, tb_tmp2) # tmp2 = tanh(a) + + # gelu_approx(a) = a * (1 + tanh(a)) / 2 + pto.add(tb_ones, tb_tmp2, tb_tmp1) # tmp1 = 1 + tanh(a) + pto.mul(tb_a, tb_tmp1, tb_tmp1) # tmp1 = a * (1 + tanh(a)) + pto.add(tb_ones, tb_ones, tb_tmp2) # tmp2 = 2.0 + 
def geglu_ref(a, b):
    """Compute the reference result for the simplified GEGLU kernel.

    The result is ``gelu_approx(a) * b`` with
    ``gelu_approx(a) = 0.5 * a * (1 + tanh(a))``.

    This is a reduced tanh-based GELU: the polynomial inner argument of the
    full tanh approximation is deliberately left out so that the reference
    matches the tile-tile arithmetic performed by the PTO kernel.

    The activation is evaluated in float32 and cast back to the dtype of
    ``a`` before the multiplication with ``b``.
    """
    gate = a.to(torch.float32)
    one_plus_tanh = torch.tanh(gate) + 1.0
    activated = (gate * 0.5) * one_plus_tanh
    return torch.mul(activated.to(a.dtype), b)
<80731350+learning-chip@users.noreply.github.com> Date: Wed, 4 Mar 2026 22:35:34 +0100 Subject: [PATCH 03/53] fast hadamard example, in both manual-sync and auto-sync versions (#52) * wip: fast hadamard builder * more closely match cpp ref * caller and execution script * default to manual-sync * extra barrier * disable `pto.barrier("STORE_VEC")` for now due to https://github.com/zhangstevenunity/PTOAS/pull/187 * do not early-exit * print full mismatch msg * allocate double buffer * extra sync between VEC and MTE3 * try refactor to bulk load-store just like manual cpp ref * add one missing sync * fix subset view indexing * add missing sync again * Revert "add missing sync again" This reverts commit ba25329a915888da27b547a01fcb2bed9351f751. * Revert "fix subset view indexing" This reverts commit 9e3316ff4c94215134a3c15e034bced109cf42dd. * add C++ fast hadamard reference * change compile option fixes run-time error, why? * fallback to legacy `--cce-soc-core-type=VecCore` * commit hadamard_manual_sync.cpp for record * fix early-exit logic in Python builder * update generated cpp and run log * fix n_half * Revert "fix n_half" This reverts commit 49cd8720c02e76448313b86158b875acaafd2495. 
* syntax sugar * fallback to loading one sample at a time, unit test now all pass * test batch 29 to span over two rounds * remove --test-both option * remove cpp references for a clean PR --- examples/aot/fast_hadamard/.gitignore | 7 + examples/aot/fast_hadamard/README.md | 7 + examples/aot/fast_hadamard/caller.cpp | 28 ++ examples/aot/fast_hadamard/compile.sh | 53 ++++ .../aot/fast_hadamard/hadamard_builder.py | 272 ++++++++++++++++++ examples/aot/fast_hadamard/run.log | 149 ++++++++++ examples/aot/fast_hadamard/run_hadamard.py | 122 ++++++++ ptodsl/language.py | 9 + 8 files changed, 647 insertions(+) create mode 100644 examples/aot/fast_hadamard/.gitignore create mode 100644 examples/aot/fast_hadamard/README.md create mode 100644 examples/aot/fast_hadamard/caller.cpp create mode 100644 examples/aot/fast_hadamard/compile.sh create mode 100644 examples/aot/fast_hadamard/hadamard_builder.py create mode 100644 examples/aot/fast_hadamard/run.log create mode 100644 examples/aot/fast_hadamard/run_hadamard.py diff --git a/examples/aot/fast_hadamard/.gitignore b/examples/aot/fast_hadamard/.gitignore new file mode 100644 index 00000000..4b573710 --- /dev/null +++ b/examples/aot/fast_hadamard/.gitignore @@ -0,0 +1,7 @@ +hadamard_no_sync.pto +hadamard_manual_sync.pto +hadamard_auto_sync.cpp +hadamard_manual_sync.cpp +hadamard_auto_sync.pto +hadamard_auto_sync_lib.so +hadamard_manual_sync_lib.so diff --git a/examples/aot/fast_hadamard/README.md b/examples/aot/fast_hadamard/README.md new file mode 100644 index 00000000..c980523e --- /dev/null +++ b/examples/aot/fast_hadamard/README.md @@ -0,0 +1,7 @@ +Usage: + +```bash +bash ./compile.sh # generate PTO/CPP and build both auto/manual sync libs +python ./run_hadamard.py # test manual-sync lib (default) +python ./run_hadamard.py --lib ./hadamard_auto_sync_lib.so # test auto-sync lib +``` diff --git a/examples/aot/fast_hadamard/caller.cpp b/examples/aot/fast_hadamard/caller.cpp new file mode 100644 index 00000000..d207b76d --- 
/dev/null +++ b/examples/aot/fast_hadamard/caller.cpp @@ -0,0 +1,28 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "hadamard_auto_sync.cpp" +#endif +#include KERNEL_CPP + +#ifndef KERNEL_FN +#define KERNEL_FN _kernel +#endif + +#ifndef NUM_CORES +#define NUM_CORES 24 +#endif + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint32_t batch, + uint32_t n, + uint32_t log2_n) +{ + uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES; + KERNEL_FN<<>>( + reinterpret_cast(x), + static_cast(batch), + static_cast(n), + static_cast(log2_n)); +} diff --git a/examples/aot/fast_hadamard/compile.sh b/examples/aot/fast_hadamard/compile.sh new file mode 100644 index 00000000..02b94cfd --- /dev/null +++ b/examples/aot/fast_hadamard/compile.sh @@ -0,0 +1,53 @@ +set -e + +rm -f \ + hadamard_auto_sync.pto hadamard_manual_sync.pto \ + hadamard_auto_sync.cpp hadamard_manual_sync.cpp \ + hadamard_auto_sync_lib.so hadamard_manual_sync_lib.so + +# Auto-sync path: rely on ptoas synchronization insertion. +python ./hadamard_builder.py > ./hadamard_auto_sync.pto +ptoas --enable-insert-sync ./hadamard_auto_sync.pto -o ./hadamard_auto_sync.cpp + +# Manual-sync path: explicit record/wait events from builder. 
+python ./hadamard_builder.py --manual-sync > ./hadamard_manual_sync.pto +ptoas ./hadamard_manual_sync.pto -o ./hadamard_manual_sync.cpp + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --cce-soc-version=Ascend910B2 \ + --cce-soc-core-type=VecCore \ + -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"hadamard_auto_sync.cpp\"" \ + ./caller.cpp \ + -o ./hadamard_auto_sync_lib.so + +# TODO: use `--npu-arch=dav-2201` instead of legacy `--cce-soc-version=Ascend910B2 --cce-soc-core-type=VecCore` +# need to change kernel vid calculation accordingly + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --cce-soc-version=Ascend910B2 \ + --cce-soc-core-type=VecCore \ + -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"hadamard_manual_sync.cpp\"" \ + ./caller.cpp \ + -o ./hadamard_manual_sync_lib.so diff --git a/examples/aot/fast_hadamard/hadamard_builder.py b/examples/aot/fast_hadamard/hadamard_builder.py new file mode 100644 index 00000000..d51e75ab --- /dev/null +++ b/examples/aot/fast_hadamard/hadamard_builder.py @@ -0,0 +1,272 @@ +from ptodsl import to_ir_module +import ptodsl.language as pto + +const = pto.const + +ELEMENTS_PER_TILE = 32 * 1024 // 2 # 32KB UB / sizeof(fp16) +HALF_ELEMENTS_PER_TILE = 
def meta_data():
    """Return the named PTO types referenced by the kernel's annotations.

    The mapping provides the fp16 pointer type, the int32 index type, a
    rank-1 tensor type, and full-width / half-width variants of both the
    sub-tensor view type and the VEC tile buffer type.
    """
    elem = pto.float16
    pointer = pto.PtrType(elem)
    index = pto.int32

    flat_tensor = pto.TensorType(rank=1, dtype=elem)
    view_full = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=elem)
    view_half = pto.SubTensorType(shape=[1, HALF_ELEMENTS_PER_TILE], dtype=elem)

    shared_cfg = pto.TileBufConfig()

    def vec_tile(width):
        # One-row tile whose valid column count is left dynamic (-1).
        return pto.TileBufType(
            shape=[1, width],
            valid_shape=[1, -1],
            dtype=elem,
            memory_space="VEC",
            config=shared_cfg,
        )

    buf_full = vec_tile(ELEMENTS_PER_TILE)
    buf_half = vec_tile(HALF_ELEMENTS_PER_TILE)

    return {
        "ptr_type": pointer,
        "index_dtype": index,
        "tensor_type": flat_tensor,
        "subtensor_full": view_full,
        "subtensor_half": view_half,
        "tile_full": buf_full,
        "tile_half": buf_half,
    }
+ valid_n = n > c0 + within_tile = c_tile >= n + with pto.if_context(valid_n): + with pto.if_context(within_tile): + with pto.if_context(sample_offset < batch): + samples_end = sample_offset + samples_per_core + samples_to_process = pto.select( + samples_end > batch, + batch - sample_offset, + samples_per_core, + ) + + with pto.if_context(samples_to_process > c0): + total_elements = batch * n + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elements], strides=[c1] + ) + + # Two independent tile sets (ping/pong) so event_id 0/1 map to + # disjoint UB buffers, matching the manual C++ reference. + tb_row_0 = pto.alloc_tile(tile_full, valid_col=n) + tb_even_0 = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd_0 = pto.alloc_tile(tile_half, valid_col=n // c2) + + tb_row_1 = pto.alloc_tile(tile_full, valid_col=n) + tb_even_1 = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd_1 = pto.alloc_tile(tile_half, valid_col=n // c2) + + n_half = n // c2 + + # Keep one sample per chunk. Multi-sample chunks interact + # poorly with static tile subset sizing in current PTO Python + # bindings and can corrupt rows for larger batches. 
+ samples_per_load = c1 + num_chunks = pto.ceil_div(samples_to_process, samples_per_load) + + if manual_sync: + pto.record_event("VEC", "LOAD", event_id=0) + pto.record_event("VEC", "LOAD", event_id=1) + pto.record_event("STORE_VEC", "VEC", event_id=0) + pto.record_event("STORE_VEC", "VEC", event_id=1) + + for chunk_i in pto.for_range(c0, num_chunks, c1): + sample_done = chunk_i * samples_per_load + chunk_left = samples_to_process - sample_done + cur_samples = pto.select( + chunk_left < samples_per_load, chunk_left, samples_per_load + ) + + with pto.if_context(cur_samples > c0): + gm_offset = (sample_offset + sample_done) * n + use_ev0 = (chunk_i % c2) == c0 + + with pto.if_context(use_ev0, has_else=True) as branch: + for s in pto.for_range(c0, cur_samples, c1): + row_offset = gm_offset + s * n + sv_row = pto.slice_view( + subtensor_full, + source=tv_x, + offsets=[row_offset], + sizes=[n], + ) + # Alias row halves inside UB row tile (no GM round-trip + # per Hadamard iteration). + tb_first_0 = pto.subset( + tb_row_0, [c0, c0], [1, HALF_ELEMENTS_PER_TILE] + ) + tb_second_0 = pto.subset( + tb_row_0, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE] + ) + + if manual_sync: + pto.wait_event("VEC", "LOAD", event_id=0) + pto.wait_event("STORE_VEC", "VEC", event_id=0) + pto.load(sv_row, tb_row_0) + if manual_sync: + pto.record_wait_pair("LOAD", "VEC", event_id=0) + + for _ in pto.for_range(c0, log2_n, c1): + pto.gather( + tb_row_0, tb_even_0, mask_pattern="P0101" + ) + pto.gather( + tb_row_0, tb_odd_0, mask_pattern="P1010" + ) + if manual_sync: + pto.barrier("VEC") + pto.add(tb_even_0, tb_odd_0, tb_first_0) + pto.sub(tb_even_0, tb_odd_0, tb_second_0) + if manual_sync: + pto.barrier("VEC") + + if manual_sync: + pto.record_wait_pair( + "VEC", "STORE_VEC", event_id=0 + ) + pto.store(tb_row_0, sv_row) + if manual_sync: + pto.record_event( + "STORE_VEC", "VEC", event_id=0 + ) + pto.record_event("VEC", "LOAD", event_id=0) + + with branch.else_context(): + for s in pto.for_range(c0, 
cur_samples, c1): + row_offset = gm_offset + s * n + sv_row = pto.slice_view( + subtensor_full, + source=tv_x, + offsets=[row_offset], + sizes=[n], + ) + # Alias row halves inside UB row tile (no GM + # round-trip per Hadamard iteration). + tb_first_1 = pto.subset( + tb_row_1, [c0, c0], [1, HALF_ELEMENTS_PER_TILE] + ) + tb_second_1 = pto.subset( + tb_row_1, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE] + ) + + if manual_sync: + pto.wait_event("VEC", "LOAD", event_id=1) + pto.wait_event("STORE_VEC", "VEC", event_id=1) + pto.load(sv_row, tb_row_1) + if manual_sync: + pto.record_wait_pair( + "LOAD", "VEC", event_id=1 + ) + + for _ in pto.for_range(c0, log2_n, c1): + pto.gather( + tb_row_1, tb_even_1, mask_pattern="P0101" + ) + pto.gather( + tb_row_1, tb_odd_1, mask_pattern="P1010" + ) + if manual_sync: + pto.barrier("VEC") + pto.add(tb_even_1, tb_odd_1, tb_first_1) + pto.sub(tb_even_1, tb_odd_1, tb_second_1) + if manual_sync: + pto.barrier("VEC") + + if manual_sync: + pto.record_wait_pair( + "VEC", "STORE_VEC", event_id=1 + ) + pto.store(tb_row_1, sv_row) + if manual_sync: + pto.record_event( + "STORE_VEC", "VEC", event_id=1 + ) + pto.record_event("VEC", "LOAD", event_id=1) + + if manual_sync: + pto.wait_event("VEC", "LOAD", event_id=0) + pto.wait_event("VEC", "LOAD", event_id=1) + pto.wait_event("STORE_VEC", "VEC", event_id=0) + pto.wait_event("STORE_VEC", "VEC", event_id=1) + + # Function name is controlled by the Python function symbol used with + # to_ir_module; keep fn_name arg for compatibility with caller scripts. 
+ _ = fn_name + return _kernel + + +if __name__ == "__main__": + # Default: autosync variant, compile with: + # ptoas --enable-insert-sync hadamard.pto -o hadamard.cpp + # + # Manual sync variant: + # python hadamard_builder.py --manual-sync > hadamard.pto + # ptoas hadamard.pto -o hadamard.cpp + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--manual-sync", + action="store_true", + help="Emit explicit record/wait events instead of relying on --enable-insert-sync.", + ) + parser.add_argument( + "--fn-name", + default="fast_hadamard_fp16", + help="Generated kernel function name.", + ) + args = parser.parse_args() + print(build_fast_hadamard(fn_name=args.fn_name, manual_sync=args.manual_sync)) diff --git a/examples/aot/fast_hadamard/run.log b/examples/aot/fast_hadamard/run.log new file mode 100644 index 00000000..4d4dfc48 --- /dev/null +++ b/examples/aot/fast_hadamard/run.log @@ -0,0 +1,149 @@ +[match] batch=1, n=128, lib=./hadamard_manual_sync_lib.so +[match] batch=1, n=256, lib=./hadamard_manual_sync_lib.so +[match] batch=1, n=512, lib=./hadamard_manual_sync_lib.so +[match] batch=1, n=1024, lib=./hadamard_manual_sync_lib.so +[match] batch=1, n=2048, lib=./hadamard_manual_sync_lib.so +[match] batch=1, n=4096, lib=./hadamard_manual_sync_lib.so +[match] batch=1, n=8192, lib=./hadamard_manual_sync_lib.so +[match] batch=1, n=16384, lib=./hadamard_manual_sync_lib.so +[match] batch=7, n=128, lib=./hadamard_manual_sync_lib.so +[match] batch=7, n=256, lib=./hadamard_manual_sync_lib.so +[match] batch=7, n=512, lib=./hadamard_manual_sync_lib.so +[match] batch=7, n=1024, lib=./hadamard_manual_sync_lib.so +[match] batch=7, n=2048, lib=./hadamard_manual_sync_lib.so +[match] batch=7, n=4096, lib=./hadamard_manual_sync_lib.so +[match] batch=7, n=8192, lib=./hadamard_manual_sync_lib.so +[match] batch=7, n=16384, lib=./hadamard_manual_sync_lib.so +[match] batch=22, n=128, lib=./hadamard_manual_sync_lib.so +[match] batch=22, n=256, 
lib=./hadamard_manual_sync_lib.so +[match] batch=22, n=512, lib=./hadamard_manual_sync_lib.so +[match] batch=22, n=1024, lib=./hadamard_manual_sync_lib.so +[match] batch=22, n=2048, lib=./hadamard_manual_sync_lib.so +[match] batch=22, n=4096, lib=./hadamard_manual_sync_lib.so +[match] batch=22, n=8192, lib=./hadamard_manual_sync_lib.so +[match] batch=22, n=16384, lib=./hadamard_manual_sync_lib.so +[mismatch] batch=65, n=128, lib=./hadamard_manual_sync_lib.so + detail: +Tensor-likes are not close! + +Mismatched elements: 1792 / 8320 (21.5%) +Greatest absolute difference: 41.03125 at index (19, 116) (up to 1e-05 allowed) +Greatest relative difference: 113.0625 at index (57, 93) (up to 0.001 allowed) +[mismatch] batch=65, n=256, lib=./hadamard_manual_sync_lib.so + detail: +Tensor-likes are not close! + +Mismatched elements: 7680 / 16640 (46.2%) +Greatest absolute difference: 70.3125 at index (34, 156) (up to 1e-05 allowed) +Greatest relative difference: inf at index (0, 67) (up to 0.001 allowed) +[mismatch] batch=65, n=512, lib=./hadamard_manual_sync_lib.so + detail: +Tensor-likes are not close! + +Mismatched elements: 14591 / 33280 (43.8%) +Greatest absolute difference: 88.875 at index (10, 58) (up to 1e-05 allowed) +Greatest relative difference: inf at index (22, 218) (up to 0.001 allowed) +[mismatch] batch=65, n=1024, lib=./hadamard_manual_sync_lib.so + detail: +Tensor-likes are not close! + +Mismatched elements: 24576 / 66560 (36.9%) +Greatest absolute difference: 127.75 at index (3, 403) (up to 1e-05 allowed) +Greatest relative difference: inf at index (4, 1001) (up to 0.001 allowed) +[mismatch] batch=65, n=2048, lib=./hadamard_manual_sync_lib.so + detail: +Tensor-likes are not close! 
+ +Mismatched elements: 62464 / 133120 (46.9%) +Greatest absolute difference: 193.25 at index (52, 214) (up to 1e-05 allowed) +Greatest relative difference: inf at index (0, 1983) (up to 0.001 allowed) +[mismatch] batch=65, n=4096, lib=./hadamard_manual_sync_lib.so + detail: +Tensor-likes are not close! + +Mismatched elements: 94464 / 266240 (35.5%) +Greatest absolute difference: 283.25 at index (18, 1588) (up to 1e-05 allowed) +Greatest relative difference: inf at index (18, 1700) (up to 0.001 allowed) +[mismatch] batch=65, n=8192, lib=./hadamard_manual_sync_lib.so + detail: +Tensor-likes are not close! + +Mismatched elements: 104959 / 532480 (19.7%) +Greatest absolute difference: 400.5 at index (60, 2799) (up to 1e-05 allowed) +Greatest relative difference: inf at index (3, 8170) (up to 0.001 allowed) +[match] batch=65, n=16384, lib=./hadamard_manual_sync_lib.so +detailed summary for ./hadamard_manual_sync_lib.so: + batch=1, n=128, status=match + batch=1, n=256, status=match + batch=1, n=512, status=match + batch=1, n=1024, status=match + batch=1, n=2048, status=match + batch=1, n=4096, status=match + batch=1, n=8192, status=match + batch=1, n=16384, status=match + batch=7, n=128, status=match + batch=7, n=256, status=match + batch=7, n=512, status=match + batch=7, n=1024, status=match + batch=7, n=2048, status=match + batch=7, n=4096, status=match + batch=7, n=8192, status=match + batch=7, n=16384, status=match + batch=22, n=128, status=match + batch=22, n=256, status=match + batch=22, n=512, status=match + batch=22, n=1024, status=match + batch=22, n=2048, status=match + batch=22, n=4096, status=match + batch=22, n=8192, status=match + batch=22, n=16384, status=match + batch=65, n=128, status=mismatch + detail: +Tensor-likes are not close! 
+ +Mismatched elements: 1792 / 8320 (21.5%) +Greatest absolute difference: 41.03125 at index (19, 116) (up to 1e-05 allowed) +Greatest relative difference: 113.0625 at index (57, 93) (up to 0.001 allowed) + batch=65, n=256, status=mismatch + detail: +Tensor-likes are not close! + +Mismatched elements: 7680 / 16640 (46.2%) +Greatest absolute difference: 70.3125 at index (34, 156) (up to 1e-05 allowed) +Greatest relative difference: inf at index (0, 67) (up to 0.001 allowed) + batch=65, n=512, status=mismatch + detail: +Tensor-likes are not close! + +Mismatched elements: 14591 / 33280 (43.8%) +Greatest absolute difference: 88.875 at index (10, 58) (up to 1e-05 allowed) +Greatest relative difference: inf at index (22, 218) (up to 0.001 allowed) + batch=65, n=1024, status=mismatch + detail: +Tensor-likes are not close! + +Mismatched elements: 24576 / 66560 (36.9%) +Greatest absolute difference: 127.75 at index (3, 403) (up to 1e-05 allowed) +Greatest relative difference: inf at index (4, 1001) (up to 0.001 allowed) + batch=65, n=2048, status=mismatch + detail: +Tensor-likes are not close! + +Mismatched elements: 62464 / 133120 (46.9%) +Greatest absolute difference: 193.25 at index (52, 214) (up to 1e-05 allowed) +Greatest relative difference: inf at index (0, 1983) (up to 0.001 allowed) + batch=65, n=4096, status=mismatch + detail: +Tensor-likes are not close! + +Mismatched elements: 94464 / 266240 (35.5%) +Greatest absolute difference: 283.25 at index (18, 1588) (up to 1e-05 allowed) +Greatest relative difference: inf at index (18, 1700) (up to 0.001 allowed) + batch=65, n=8192, status=mismatch + detail: +Tensor-likes are not close! 
+ +Mismatched elements: 104959 / 532480 (19.7%) +Greatest absolute difference: 400.5 at index (60, 2799) (up to 1e-05 allowed) +Greatest relative difference: inf at index (3, 8170) (up to 0.001 allowed) + batch=65, n=16384, status=match diff --git a/examples/aot/fast_hadamard/run_hadamard.py b/examples/aot/fast_hadamard/run_hadamard.py new file mode 100644 index 00000000..dbc67243 --- /dev/null +++ b/examples/aot/fast_hadamard/run_hadamard.py @@ -0,0 +1,122 @@ +import argparse +import ctypes +import math + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path, block_dim=24): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x (in-place) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n + ctypes.c_uint32, # log2_n + ] + lib.call_kernel.restype = None + + def hadamard_func(x, batch, n, log2_n, block_dim=block_dim, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(x), + batch, + n, + log2_n, + ) + + return hadamard_func + + +def hadamard_ref_inplace(x): + """Reference FHT matching TGATHER(P0101/P1010) + TADD/TSUB layout.""" + x = x.clone() + n = x.shape[-1] + n_half = n // 2 + log2_n = int(math.log2(n)) + for _ in range(log2_n): + even = x[..., 0::2].clone() + odd = x[..., 1::2].clone() + x[..., :n_half] = even + odd + x[..., n_half:] = even - odd + return x + + +def _is_power_of_two(v): + return v > 0 and (v & (v - 1)) == 0 + + +def test_hadamard(lib_path, block_dim=24): + device = get_test_device() + torch.npu.set_device(device) + + hadamard = load_lib(lib_path=lib_path, block_dim=block_dim) + + torch.manual_seed(0) + dtype = torch.float16 + batch_list = [1, 7, 29, 65] + n_list = [128, 256, 512, 1024, 2048, 4096, 8192, 
16384] + + results = [] + for batch in batch_list: + for n in n_list: + if not _is_power_of_two(n): + continue + log2_n = int(math.log2(n)) + x = torch.randn(batch, n, device=device, dtype=dtype) + y_ref = hadamard_ref_inplace(x) + + hadamard(x, batch, n, log2_n) + torch.npu.synchronize() + + is_match = True + detail = "" + try: + torch.testing.assert_close(x, y_ref) + except AssertionError as err: + is_match = False + detail = str(err).strip() if str(err) else "assert_close failed" + + status = "match" if is_match else "mismatch" + print(f"[{status}] batch={batch}, n={n}, lib={lib_path}") + if detail: + print(" detail:") + print(detail) + results.append((batch, n, status, detail)) + + print(f"detailed summary for {lib_path}:") + for batch, n, status, detail in results: + msg = f" batch={batch}, n={n}, status={status}" + print(msg) + if detail: + print(" detail:") + print(detail) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + default="./hadamard_manual_sync_lib.so", + help="Path to the shared library generated by compile.sh.", + ) + parser.add_argument( + "--block-dim", + type=int, + default=24, + help="Kernel blockDim (default: 24).", + ) + args = parser.parse_args() + test_hadamard(args.lib, block_dim=args.block_dim) diff --git a/ptodsl/language.py b/ptodsl/language.py index d465ddd4..c05ef54b 100644 --- a/ptodsl/language.py +++ b/ptodsl/language.py @@ -227,6 +227,11 @@ def alloc_tile(tile_type, *, valid_row=None, valid_col=None): return pto.AllocTileOp(tile_type, **kwargs).result +def subset(source, offsets, sizes): + offset_vals = [_unwrap(v) for v in offsets] + return pto.subset(source, offset_vals, sizes) + + def load(source, dest): pto.TLoadOp(None, source, dest) @@ -402,3 +407,7 @@ def record_wait_pair(record_op, wait_op, event_id=0): ev = _resolve_event_id(event_id) pto.record_event(rec, w, ev) pto.wait_event(rec, w, ev) + + +def barrier(sync_op): + 
pto.barrier(_resolve_sync_op(sync_op)) From 79cd084b615cac6de270ebee47cd8f695041dd04 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Wed, 4 Mar 2026 23:47:51 +0100 Subject: [PATCH 04/53] Various code clean-ups for hadamard example (#59) * split-out auto vs manual sync branches, so that default auto-sync builder looks clean * remove old run log * reuse code across if and else branches * fix bracket * remove temporary `num_blocks` var * simplify lib path * move kernel side dim check to launch time, to reduce indentation * flatten module builder, do not use closure --- examples/aot/fast_hadamard/README.md | 4 +- examples/aot/fast_hadamard/caller.cpp | 2 +- examples/aot/fast_hadamard/compile.sh | 2 +- .../aot/fast_hadamard/hadamard_builder.py | 424 +++++++++--------- examples/aot/fast_hadamard/run.log | 149 ------ examples/aot/fast_hadamard/run_hadamard.py | 16 +- 6 files changed, 227 insertions(+), 370 deletions(-) delete mode 100644 examples/aot/fast_hadamard/run.log diff --git a/examples/aot/fast_hadamard/README.md b/examples/aot/fast_hadamard/README.md index c980523e..2d036518 100644 --- a/examples/aot/fast_hadamard/README.md +++ b/examples/aot/fast_hadamard/README.md @@ -2,6 +2,6 @@ Usage: ```bash bash ./compile.sh # generate PTO/CPP and build both auto/manual sync libs -python ./run_hadamard.py # test manual-sync lib (default) -python ./run_hadamard.py --lib ./hadamard_auto_sync_lib.so # test auto-sync lib +python ./run_hadamard.py # test auto-sync lib (default) +python ./run_hadamard.py --manual-sync # test manual-sync lib ``` diff --git a/examples/aot/fast_hadamard/caller.cpp b/examples/aot/fast_hadamard/caller.cpp index d207b76d..1ddaff6a 100644 --- a/examples/aot/fast_hadamard/caller.cpp +++ b/examples/aot/fast_hadamard/caller.cpp @@ -4,7 +4,7 @@ #include KERNEL_CPP #ifndef KERNEL_FN -#define KERNEL_FN _kernel +#define KERNEL_FN fast_hadamard_autosync #endif #ifndef NUM_CORES diff --git 
a/examples/aot/fast_hadamard/compile.sh b/examples/aot/fast_hadamard/compile.sh index 02b94cfd..3f7bad2a 100644 --- a/examples/aot/fast_hadamard/compile.sh +++ b/examples/aot/fast_hadamard/compile.sh @@ -27,7 +27,6 @@ bisheng \ --cce-soc-core-type=VecCore \ -DMEMORY_BASE \ -std=gnu++17 \ - -DKERNEL_CPP="\"hadamard_auto_sync.cpp\"" \ ./caller.cpp \ -o ./hadamard_auto_sync_lib.so @@ -49,5 +48,6 @@ bisheng \ -DMEMORY_BASE \ -std=gnu++17 \ -DKERNEL_CPP="\"hadamard_manual_sync.cpp\"" \ + -DKERNEL_FN=fast_hadamard_manualsync \ ./caller.cpp \ -o ./hadamard_manual_sync_lib.so diff --git a/examples/aot/fast_hadamard/hadamard_builder.py b/examples/aot/fast_hadamard/hadamard_builder.py index d51e75ab..7353edf0 100644 --- a/examples/aot/fast_hadamard/hadamard_builder.py +++ b/examples/aot/fast_hadamard/hadamard_builder.py @@ -43,230 +43,228 @@ def meta_data(): } -def build_fast_hadamard(fn_name="fast_hadamard_fp16", manual_sync=False): - """ - Build a dynamic-batch fast-hadamard kernel in PTO DSL. - - Args: - fn_name: generated kernel symbol name. - manual_sync: - - False: rely on `ptoas --enable-insert-sync`. - - True: emit explicit record/wait events with event_id 0/1. - """ - - @to_ir_module(meta_data=meta_data) - def _kernel( - x_ptr: "ptr_type", - batch_i32: "index_dtype", - n_i32: "index_dtype", - log2_n_i32: "index_dtype", - ) -> None: - c0 = const(0) - c1 = const(1) - c2 = const(2) - c_tile = const(ELEMENTS_PER_TILE) - - batch = pto.index_cast(batch_i32) - n = pto.index_cast(n_i32) - log2_n = pto.index_cast(log2_n_i32) - - with pto.vector_section(): - # Match reference early scalar setup/return order. - bid = pto.index_cast(pto.get_block_idx()) - num_blocks = pto.index_cast(pto.get_block_num()) - - # Match reference kernel partitioning: block-level split only. - num_cores = num_blocks - samples_per_core = pto.ceil_div(batch, num_cores) - sample_offset = bid * samples_per_core - - # Early reject for invalid n. 
- valid_n = n > c0 - within_tile = c_tile >= n - with pto.if_context(valid_n): - with pto.if_context(within_tile): - with pto.if_context(sample_offset < batch): - samples_end = sample_offset + samples_per_core - samples_to_process = pto.select( - samples_end > batch, - batch - sample_offset, - samples_per_core, +@to_ir_module(meta_data=meta_data) +def fast_hadamard_autosync( + x_ptr: "ptr_type", + batch_i32: "index_dtype", + n_i32: "index_dtype", + log2_n_i32: "index_dtype", +) -> None: + c0 = const(0) + c1 = const(1) + c2 = const(2) + + batch = pto.index_cast(batch_i32) + n = pto.index_cast(n_i32) + log2_n = pto.index_cast(log2_n_i32) + + with pto.vector_section(): + # Match reference early scalar setup/return order. + bid = pto.index_cast(pto.get_block_idx()) + num_cores = pto.index_cast(pto.get_block_num()) + samples_per_core = pto.ceil_div(batch, num_cores) + sample_offset = bid * samples_per_core + + with pto.if_context(sample_offset < batch): + samples_end = sample_offset + samples_per_core + samples_to_process = pto.select( + samples_end > batch, + batch - sample_offset, + samples_per_core, + ) + + with pto.if_context(samples_to_process > c0): + total_elements = batch * n + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elements], strides=[c1] + ) + + # Two independent tile sets (ping/pong) so event_id 0/1 map to + # disjoint UB buffers, matching the manual C++ reference. + tb_row_0 = pto.alloc_tile(tile_full, valid_col=n) + tb_even_0 = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd_0 = pto.alloc_tile(tile_half, valid_col=n // c2) + + tb_row_1 = pto.alloc_tile(tile_full, valid_col=n) + tb_even_1 = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd_1 = pto.alloc_tile(tile_half, valid_col=n // c2) + + n_half = n // c2 + + # Keep one sample per chunk. Multi-sample chunks interact + # poorly with static tile subset sizing in current PTO Python + # bindings and can corrupt rows for larger batches. 
+ samples_per_load = c1 + num_chunks = pto.ceil_div(samples_to_process, samples_per_load) + + def process_rows(tb_row, tb_even, tb_odd, gm_offset, cur_samples): + for s in pto.for_range(c0, cur_samples, c1): + row_offset = gm_offset + s * n + sv_row = pto.slice_view( + subtensor_full, source=tv_x, offsets=[row_offset], sizes=[n] ) - - with pto.if_context(samples_to_process > c0): - total_elements = batch * n - tv_x = pto.as_tensor( - tensor_type, ptr=x_ptr, shape=[total_elements], strides=[c1] + # Alias row halves inside UB row tile (no GM round-trip + # per Hadamard iteration). + tb_first = pto.subset(tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE]) + tb_second = pto.subset(tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE]) + + pto.load(sv_row, tb_row) + for _ in pto.for_range(c0, log2_n, c1): + pto.gather(tb_row, tb_even, mask_pattern="P0101") + pto.gather(tb_row, tb_odd, mask_pattern="P1010") + pto.add(tb_even, tb_odd, tb_first) + pto.sub(tb_even, tb_odd, tb_second) + pto.store(tb_row, sv_row) + + for chunk_i in pto.for_range(c0, num_chunks, c1): + sample_done = chunk_i * samples_per_load + chunk_left = samples_to_process - sample_done + cur_samples = pto.select( + chunk_left < samples_per_load, chunk_left, samples_per_load + ) + + with pto.if_context(cur_samples > c0): + gm_offset = (sample_offset + sample_done) * n + use_ev0 = (chunk_i % c2) == c0 + + with pto.if_context(use_ev0, has_else=True) as branch: + process_rows( + tb_row_0, tb_even_0, tb_odd_0, gm_offset, cur_samples + ) + with branch.else_context(): + process_rows( + tb_row_1, tb_even_1, tb_odd_1, gm_offset, cur_samples ) - # Two independent tile sets (ping/pong) so event_id 0/1 map to - # disjoint UB buffers, matching the manual C++ reference. 
- tb_row_0 = pto.alloc_tile(tile_full, valid_col=n) - tb_even_0 = pto.alloc_tile(tile_half, valid_col=n // c2) - tb_odd_0 = pto.alloc_tile(tile_half, valid_col=n // c2) - - tb_row_1 = pto.alloc_tile(tile_full, valid_col=n) - tb_even_1 = pto.alloc_tile(tile_half, valid_col=n // c2) - tb_odd_1 = pto.alloc_tile(tile_half, valid_col=n // c2) - - n_half = n // c2 - - # Keep one sample per chunk. Multi-sample chunks interact - # poorly with static tile subset sizing in current PTO Python - # bindings and can corrupt rows for larger batches. - samples_per_load = c1 - num_chunks = pto.ceil_div(samples_to_process, samples_per_load) - - if manual_sync: - pto.record_event("VEC", "LOAD", event_id=0) - pto.record_event("VEC", "LOAD", event_id=1) - pto.record_event("STORE_VEC", "VEC", event_id=0) - pto.record_event("STORE_VEC", "VEC", event_id=1) - - for chunk_i in pto.for_range(c0, num_chunks, c1): - sample_done = chunk_i * samples_per_load - chunk_left = samples_to_process - sample_done - cur_samples = pto.select( - chunk_left < samples_per_load, chunk_left, samples_per_load - ) - - with pto.if_context(cur_samples > c0): - gm_offset = (sample_offset + sample_done) * n - use_ev0 = (chunk_i % c2) == c0 - - with pto.if_context(use_ev0, has_else=True) as branch: - for s in pto.for_range(c0, cur_samples, c1): - row_offset = gm_offset + s * n - sv_row = pto.slice_view( - subtensor_full, - source=tv_x, - offsets=[row_offset], - sizes=[n], - ) - # Alias row halves inside UB row tile (no GM round-trip - # per Hadamard iteration). 
- tb_first_0 = pto.subset( - tb_row_0, [c0, c0], [1, HALF_ELEMENTS_PER_TILE] - ) - tb_second_0 = pto.subset( - tb_row_0, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE] - ) - - if manual_sync: - pto.wait_event("VEC", "LOAD", event_id=0) - pto.wait_event("STORE_VEC", "VEC", event_id=0) - pto.load(sv_row, tb_row_0) - if manual_sync: - pto.record_wait_pair("LOAD", "VEC", event_id=0) - - for _ in pto.for_range(c0, log2_n, c1): - pto.gather( - tb_row_0, tb_even_0, mask_pattern="P0101" - ) - pto.gather( - tb_row_0, tb_odd_0, mask_pattern="P1010" - ) - if manual_sync: - pto.barrier("VEC") - pto.add(tb_even_0, tb_odd_0, tb_first_0) - pto.sub(tb_even_0, tb_odd_0, tb_second_0) - if manual_sync: - pto.barrier("VEC") - - if manual_sync: - pto.record_wait_pair( - "VEC", "STORE_VEC", event_id=0 - ) - pto.store(tb_row_0, sv_row) - if manual_sync: - pto.record_event( - "STORE_VEC", "VEC", event_id=0 - ) - pto.record_event("VEC", "LOAD", event_id=0) - - with branch.else_context(): - for s in pto.for_range(c0, cur_samples, c1): - row_offset = gm_offset + s * n - sv_row = pto.slice_view( - subtensor_full, - source=tv_x, - offsets=[row_offset], - sizes=[n], - ) - # Alias row halves inside UB row tile (no GM - # round-trip per Hadamard iteration). 
- tb_first_1 = pto.subset( - tb_row_1, [c0, c0], [1, HALF_ELEMENTS_PER_TILE] - ) - tb_second_1 = pto.subset( - tb_row_1, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE] - ) - - if manual_sync: - pto.wait_event("VEC", "LOAD", event_id=1) - pto.wait_event("STORE_VEC", "VEC", event_id=1) - pto.load(sv_row, tb_row_1) - if manual_sync: - pto.record_wait_pair( - "LOAD", "VEC", event_id=1 - ) - - for _ in pto.for_range(c0, log2_n, c1): - pto.gather( - tb_row_1, tb_even_1, mask_pattern="P0101" - ) - pto.gather( - tb_row_1, tb_odd_1, mask_pattern="P1010" - ) - if manual_sync: - pto.barrier("VEC") - pto.add(tb_even_1, tb_odd_1, tb_first_1) - pto.sub(tb_even_1, tb_odd_1, tb_second_1) - if manual_sync: - pto.barrier("VEC") - - if manual_sync: - pto.record_wait_pair( - "VEC", "STORE_VEC", event_id=1 - ) - pto.store(tb_row_1, sv_row) - if manual_sync: - pto.record_event( - "STORE_VEC", "VEC", event_id=1 - ) - pto.record_event("VEC", "LOAD", event_id=1) - - if manual_sync: - pto.wait_event("VEC", "LOAD", event_id=0) - pto.wait_event("VEC", "LOAD", event_id=1) - pto.wait_event("STORE_VEC", "VEC", event_id=0) - pto.wait_event("STORE_VEC", "VEC", event_id=1) - - # Function name is controlled by the Python function symbol used with - # to_ir_module; keep fn_name arg for compatibility with caller scripts. - _ = fn_name - return _kernel + +@to_ir_module(meta_data=meta_data) +def fast_hadamard_manualsync( + x_ptr: "ptr_type", + batch_i32: "index_dtype", + n_i32: "index_dtype", + log2_n_i32: "index_dtype", +) -> None: + c0 = const(0) + c1 = const(1) + c2 = const(2) + + batch = pto.index_cast(batch_i32) + n = pto.index_cast(n_i32) + log2_n = pto.index_cast(log2_n_i32) + + with pto.vector_section(): + # Match reference early scalar setup/return order. 
+ bid = pto.index_cast(pto.get_block_idx()) + num_cores = pto.index_cast(pto.get_block_num()) + samples_per_core = pto.ceil_div(batch, num_cores) + sample_offset = bid * samples_per_core + + with pto.if_context(sample_offset < batch): + samples_end = sample_offset + samples_per_core + samples_to_process = pto.select( + samples_end > batch, + batch - sample_offset, + samples_per_core, + ) + + with pto.if_context(samples_to_process > c0): + total_elements = batch * n + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elements], strides=[c1] + ) + + # Two independent tile sets (ping/pong) so event_id 0/1 map to + # disjoint UB buffers, matching the manual C++ reference. + tb_row_0 = pto.alloc_tile(tile_full, valid_col=n) + tb_even_0 = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd_0 = pto.alloc_tile(tile_half, valid_col=n // c2) + + tb_row_1 = pto.alloc_tile(tile_full, valid_col=n) + tb_even_1 = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd_1 = pto.alloc_tile(tile_half, valid_col=n // c2) + + n_half = n // c2 + + # Keep one sample per chunk. Multi-sample chunks interact + # poorly with static tile subset sizing in current PTO Python + # bindings and can corrupt rows for larger batches. + samples_per_load = c1 + num_chunks = pto.ceil_div(samples_to_process, samples_per_load) + + def process_rows( + tb_row, tb_even, tb_odd, event_id, gm_offset, cur_samples + ): + for s in pto.for_range(c0, cur_samples, c1): + row_offset = gm_offset + s * n + sv_row = pto.slice_view( + subtensor_full, source=tv_x, offsets=[row_offset], sizes=[n] + ) + # Alias row halves inside UB row tile (no GM round-trip + # per Hadamard iteration). 
+ tb_first = pto.subset( + tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE] + ) + tb_second = pto.subset( + tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE] + ) + + pto.wait_event("VEC", "LOAD", event_id=event_id) + pto.wait_event("STORE_VEC", "VEC", event_id=event_id) + pto.load(sv_row, tb_row) + pto.record_wait_pair("LOAD", "VEC", event_id=event_id) + + for _ in pto.for_range(c0, log2_n, c1): + pto.gather(tb_row, tb_even, mask_pattern="P0101") + pto.gather(tb_row, tb_odd, mask_pattern="P1010") + pto.barrier("VEC") + pto.add(tb_even, tb_odd, tb_first) + pto.sub(tb_even, tb_odd, tb_second) + pto.barrier("VEC") + + pto.record_wait_pair( + "VEC", "STORE_VEC", event_id=event_id + ) + pto.store(tb_row, sv_row) + pto.record_event("STORE_VEC", "VEC", event_id=event_id) + pto.record_event("VEC", "LOAD", event_id=event_id) + + for event_id in (0, 1): + pto.record_event("VEC", "LOAD", event_id=event_id) + pto.record_event("STORE_VEC", "VEC", event_id=event_id) + + for chunk_i in pto.for_range(c0, num_chunks, c1): + sample_done = chunk_i * samples_per_load + chunk_left = samples_to_process - sample_done + cur_samples = pto.select( + chunk_left < samples_per_load, chunk_left, samples_per_load + ) + + with pto.if_context(cur_samples > c0): + gm_offset = (sample_offset + sample_done) * n + use_ev0 = (chunk_i % c2) == c0 + + with pto.if_context(use_ev0, has_else=True) as branch: + process_rows(tb_row_0, tb_even_0, tb_odd_0, 0, gm_offset, cur_samples) + with branch.else_context(): + process_rows(tb_row_1, tb_even_1, tb_odd_1, 1, gm_offset, cur_samples) + + for event_id in (0, 1): + pto.wait_event("VEC", "LOAD", event_id=event_id) + pto.wait_event("STORE_VEC", "VEC", event_id=event_id) + if __name__ == "__main__": - # Default: autosync variant, compile with: - # ptoas --enable-insert-sync hadamard.pto -o hadamard.cpp - # - # Manual sync variant: - # python hadamard_builder.py --manual-sync > hadamard.pto - # ptoas hadamard.pto -o hadamard.cpp import argparse - parser = 
argparse.ArgumentParser() parser.add_argument( "--manual-sync", action="store_true", help="Emit explicit record/wait events instead of relying on --enable-insert-sync.", ) - parser.add_argument( - "--fn-name", - default="fast_hadamard_fp16", - help="Generated kernel function name.", - ) args = parser.parse_args() - print(build_fast_hadamard(fn_name=args.fn_name, manual_sync=args.manual_sync)) + if args.manual_sync: + module = fast_hadamard_manualsync + else: + module = fast_hadamard_autosync + print(module) diff --git a/examples/aot/fast_hadamard/run.log b/examples/aot/fast_hadamard/run.log deleted file mode 100644 index 4d4dfc48..00000000 --- a/examples/aot/fast_hadamard/run.log +++ /dev/null @@ -1,149 +0,0 @@ -[match] batch=1, n=128, lib=./hadamard_manual_sync_lib.so -[match] batch=1, n=256, lib=./hadamard_manual_sync_lib.so -[match] batch=1, n=512, lib=./hadamard_manual_sync_lib.so -[match] batch=1, n=1024, lib=./hadamard_manual_sync_lib.so -[match] batch=1, n=2048, lib=./hadamard_manual_sync_lib.so -[match] batch=1, n=4096, lib=./hadamard_manual_sync_lib.so -[match] batch=1, n=8192, lib=./hadamard_manual_sync_lib.so -[match] batch=1, n=16384, lib=./hadamard_manual_sync_lib.so -[match] batch=7, n=128, lib=./hadamard_manual_sync_lib.so -[match] batch=7, n=256, lib=./hadamard_manual_sync_lib.so -[match] batch=7, n=512, lib=./hadamard_manual_sync_lib.so -[match] batch=7, n=1024, lib=./hadamard_manual_sync_lib.so -[match] batch=7, n=2048, lib=./hadamard_manual_sync_lib.so -[match] batch=7, n=4096, lib=./hadamard_manual_sync_lib.so -[match] batch=7, n=8192, lib=./hadamard_manual_sync_lib.so -[match] batch=7, n=16384, lib=./hadamard_manual_sync_lib.so -[match] batch=22, n=128, lib=./hadamard_manual_sync_lib.so -[match] batch=22, n=256, lib=./hadamard_manual_sync_lib.so -[match] batch=22, n=512, lib=./hadamard_manual_sync_lib.so -[match] batch=22, n=1024, lib=./hadamard_manual_sync_lib.so -[match] batch=22, n=2048, lib=./hadamard_manual_sync_lib.so -[match] batch=22, 
n=4096, lib=./hadamard_manual_sync_lib.so -[match] batch=22, n=8192, lib=./hadamard_manual_sync_lib.so -[match] batch=22, n=16384, lib=./hadamard_manual_sync_lib.so -[mismatch] batch=65, n=128, lib=./hadamard_manual_sync_lib.so - detail: -Tensor-likes are not close! - -Mismatched elements: 1792 / 8320 (21.5%) -Greatest absolute difference: 41.03125 at index (19, 116) (up to 1e-05 allowed) -Greatest relative difference: 113.0625 at index (57, 93) (up to 0.001 allowed) -[mismatch] batch=65, n=256, lib=./hadamard_manual_sync_lib.so - detail: -Tensor-likes are not close! - -Mismatched elements: 7680 / 16640 (46.2%) -Greatest absolute difference: 70.3125 at index (34, 156) (up to 1e-05 allowed) -Greatest relative difference: inf at index (0, 67) (up to 0.001 allowed) -[mismatch] batch=65, n=512, lib=./hadamard_manual_sync_lib.so - detail: -Tensor-likes are not close! - -Mismatched elements: 14591 / 33280 (43.8%) -Greatest absolute difference: 88.875 at index (10, 58) (up to 1e-05 allowed) -Greatest relative difference: inf at index (22, 218) (up to 0.001 allowed) -[mismatch] batch=65, n=1024, lib=./hadamard_manual_sync_lib.so - detail: -Tensor-likes are not close! - -Mismatched elements: 24576 / 66560 (36.9%) -Greatest absolute difference: 127.75 at index (3, 403) (up to 1e-05 allowed) -Greatest relative difference: inf at index (4, 1001) (up to 0.001 allowed) -[mismatch] batch=65, n=2048, lib=./hadamard_manual_sync_lib.so - detail: -Tensor-likes are not close! - -Mismatched elements: 62464 / 133120 (46.9%) -Greatest absolute difference: 193.25 at index (52, 214) (up to 1e-05 allowed) -Greatest relative difference: inf at index (0, 1983) (up to 0.001 allowed) -[mismatch] batch=65, n=4096, lib=./hadamard_manual_sync_lib.so - detail: -Tensor-likes are not close! 
- -Mismatched elements: 94464 / 266240 (35.5%) -Greatest absolute difference: 283.25 at index (18, 1588) (up to 1e-05 allowed) -Greatest relative difference: inf at index (18, 1700) (up to 0.001 allowed) -[mismatch] batch=65, n=8192, lib=./hadamard_manual_sync_lib.so - detail: -Tensor-likes are not close! - -Mismatched elements: 104959 / 532480 (19.7%) -Greatest absolute difference: 400.5 at index (60, 2799) (up to 1e-05 allowed) -Greatest relative difference: inf at index (3, 8170) (up to 0.001 allowed) -[match] batch=65, n=16384, lib=./hadamard_manual_sync_lib.so -detailed summary for ./hadamard_manual_sync_lib.so: - batch=1, n=128, status=match - batch=1, n=256, status=match - batch=1, n=512, status=match - batch=1, n=1024, status=match - batch=1, n=2048, status=match - batch=1, n=4096, status=match - batch=1, n=8192, status=match - batch=1, n=16384, status=match - batch=7, n=128, status=match - batch=7, n=256, status=match - batch=7, n=512, status=match - batch=7, n=1024, status=match - batch=7, n=2048, status=match - batch=7, n=4096, status=match - batch=7, n=8192, status=match - batch=7, n=16384, status=match - batch=22, n=128, status=match - batch=22, n=256, status=match - batch=22, n=512, status=match - batch=22, n=1024, status=match - batch=22, n=2048, status=match - batch=22, n=4096, status=match - batch=22, n=8192, status=match - batch=22, n=16384, status=match - batch=65, n=128, status=mismatch - detail: -Tensor-likes are not close! - -Mismatched elements: 1792 / 8320 (21.5%) -Greatest absolute difference: 41.03125 at index (19, 116) (up to 1e-05 allowed) -Greatest relative difference: 113.0625 at index (57, 93) (up to 0.001 allowed) - batch=65, n=256, status=mismatch - detail: -Tensor-likes are not close! 
- -Mismatched elements: 7680 / 16640 (46.2%) -Greatest absolute difference: 70.3125 at index (34, 156) (up to 1e-05 allowed) -Greatest relative difference: inf at index (0, 67) (up to 0.001 allowed) - batch=65, n=512, status=mismatch - detail: -Tensor-likes are not close! - -Mismatched elements: 14591 / 33280 (43.8%) -Greatest absolute difference: 88.875 at index (10, 58) (up to 1e-05 allowed) -Greatest relative difference: inf at index (22, 218) (up to 0.001 allowed) - batch=65, n=1024, status=mismatch - detail: -Tensor-likes are not close! - -Mismatched elements: 24576 / 66560 (36.9%) -Greatest absolute difference: 127.75 at index (3, 403) (up to 1e-05 allowed) -Greatest relative difference: inf at index (4, 1001) (up to 0.001 allowed) - batch=65, n=2048, status=mismatch - detail: -Tensor-likes are not close! - -Mismatched elements: 62464 / 133120 (46.9%) -Greatest absolute difference: 193.25 at index (52, 214) (up to 1e-05 allowed) -Greatest relative difference: inf at index (0, 1983) (up to 0.001 allowed) - batch=65, n=4096, status=mismatch - detail: -Tensor-likes are not close! - -Mismatched elements: 94464 / 266240 (35.5%) -Greatest absolute difference: 283.25 at index (18, 1588) (up to 1e-05 allowed) -Greatest relative difference: inf at index (18, 1700) (up to 0.001 allowed) - batch=65, n=8192, status=mismatch - detail: -Tensor-likes are not close! 
- -Mismatched elements: 104959 / 532480 (19.7%) -Greatest absolute difference: 400.5 at index (60, 2799) (up to 1e-05 allowed) -Greatest relative difference: inf at index (3, 8170) (up to 0.001 allowed) - batch=65, n=16384, status=match diff --git a/examples/aot/fast_hadamard/run_hadamard.py b/examples/aot/fast_hadamard/run_hadamard.py index dbc67243..65195b88 100644 --- a/examples/aot/fast_hadamard/run_hadamard.py +++ b/examples/aot/fast_hadamard/run_hadamard.py @@ -7,6 +7,8 @@ from ptodsl.test_util import get_test_device +ELEMENTS_PER_TILE = 32 * 1024 // 2 # 32KB UB / sizeof(fp16) + def torch_to_ctypes(tensor): return ctypes.c_void_p(tensor.data_ptr()) @@ -27,6 +29,7 @@ def load_lib(lib_path, block_dim=24): def hadamard_func(x, batch, n, log2_n, block_dim=block_dim, stream_ptr=None): if stream_ptr is None: stream_ptr = torch.npu.current_stream()._as_parameter_ + assert n <= ELEMENTS_PER_TILE, f"n must be <= {ELEMENTS_PER_TILE}, got {n}" lib.call_kernel( block_dim, stream_ptr, @@ -108,9 +111,9 @@ def test_hadamard(lib_path, block_dim=24): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--lib", - default="./hadamard_manual_sync_lib.so", - help="Path to the shared library generated by compile.sh.", + "--manual-sync", + action="store_true", + help="Use manual-sync library instead of the default auto-sync library.", ) parser.add_argument( "--block-dim", @@ -119,4 +122,9 @@ def test_hadamard(lib_path, block_dim=24): help="Kernel blockDim (default: 24).", ) args = parser.parse_args() - test_hadamard(args.lib, block_dim=args.block_dim) + lib_path = ( + "./hadamard_manual_sync_lib.so" + if args.manual_sync + else "./hadamard_auto_sync_lib.so" + ) + test_hadamard(lib_path, block_dim=args.block_dim) From 0475779cbdc215d03c569813ec625bb6c1f465e0 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:10:38 +0100 Subject: [PATCH 05/53] Performance measurement and tuning for fast 
hadamard python example (#62) * clean-up * perf benchmark * switch to `--npu-arch=dav-2201` and adjust builder accordingly. * plot bandwidth (script taken from cpp ref) --- examples/aot/fast_hadamard/.gitignore | 2 + examples/aot/fast_hadamard/README.md | 1 + examples/aot/fast_hadamard/compile.sh | 11 +-- .../aot/fast_hadamard/hadamard_builder.py | 26 +++-- examples/aot/fast_hadamard/plot_perf.py | 72 ++++++++++++++ examples/aot/fast_hadamard/run_hadamard.py | 94 +++++++++++++++++-- 6 files changed, 181 insertions(+), 25 deletions(-) create mode 100644 examples/aot/fast_hadamard/plot_perf.py diff --git a/examples/aot/fast_hadamard/.gitignore b/examples/aot/fast_hadamard/.gitignore index 4b573710..663e5a84 100644 --- a/examples/aot/fast_hadamard/.gitignore +++ b/examples/aot/fast_hadamard/.gitignore @@ -5,3 +5,5 @@ hadamard_manual_sync.cpp hadamard_auto_sync.pto hadamard_auto_sync_lib.so hadamard_manual_sync_lib.so + +perf_data* diff --git a/examples/aot/fast_hadamard/README.md b/examples/aot/fast_hadamard/README.md index 2d036518..6b19cee9 100644 --- a/examples/aot/fast_hadamard/README.md +++ b/examples/aot/fast_hadamard/README.md @@ -4,4 +4,5 @@ Usage: bash ./compile.sh # generate PTO/CPP and build both auto/manual sync libs python ./run_hadamard.py # test auto-sync lib (default) python ./run_hadamard.py --manual-sync # test manual-sync lib +python ./plot_perf.py # optionally visualization ``` diff --git a/examples/aot/fast_hadamard/compile.sh b/examples/aot/fast_hadamard/compile.sh index 3f7bad2a..a95f6148 100644 --- a/examples/aot/fast_hadamard/compile.sh +++ b/examples/aot/fast_hadamard/compile.sh @@ -23,16 +23,11 @@ bisheng \ -mllvm -cce-aicore-record-overflow=true \ -mllvm -cce-aicore-addr-transform \ -mllvm -cce-aicore-dcci-insert-for-scalar=false \ - --cce-soc-version=Ascend910B2 \ - --cce-soc-core-type=VecCore \ - -DMEMORY_BASE \ + --npu-arch=dav-2201 -DMEMORY_BASE \ -std=gnu++17 \ ./caller.cpp \ -o ./hadamard_auto_sync_lib.so -# TODO: use 
`--npu-arch=dav-2201` instead of legacy `--cce-soc-version=Ascend910B2 --cce-soc-core-type=VecCore` -# need to change kernel vid calculation accordingly - bisheng \ -I${ASCEND_TOOLKIT_HOME}/include \ -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ @@ -43,9 +38,7 @@ bisheng \ -mllvm -cce-aicore-record-overflow=true \ -mllvm -cce-aicore-addr-transform \ -mllvm -cce-aicore-dcci-insert-for-scalar=false \ - --cce-soc-version=Ascend910B2 \ - --cce-soc-core-type=VecCore \ - -DMEMORY_BASE \ + --npu-arch=dav-2201 -DMEMORY_BASE \ -std=gnu++17 \ -DKERNEL_CPP="\"hadamard_manual_sync.cpp\"" \ -DKERNEL_FN=fast_hadamard_manualsync \ diff --git a/examples/aot/fast_hadamard/hadamard_builder.py b/examples/aot/fast_hadamard/hadamard_builder.py index 7353edf0..4ea10e12 100644 --- a/examples/aot/fast_hadamard/hadamard_builder.py +++ b/examples/aot/fast_hadamard/hadamard_builder.py @@ -58,12 +58,17 @@ def fast_hadamard_autosync( n = pto.index_cast(n_i32) log2_n = pto.index_cast(log2_n_i32) + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores + with pto.vector_section(): - # Match reference early scalar setup/return order. 
- bid = pto.index_cast(pto.get_block_idx()) - num_cores = pto.index_cast(pto.get_block_num()) samples_per_core = pto.ceil_div(batch, num_cores) - sample_offset = bid * samples_per_core + sample_offset = vid * samples_per_core with pto.if_context(sample_offset < batch): samples_end = sample_offset + samples_per_core @@ -152,12 +157,17 @@ def fast_hadamard_manualsync( n = pto.index_cast(n_i32) log2_n = pto.index_cast(log2_n_i32) + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores + with pto.vector_section(): - # Match reference early scalar setup/return order. - bid = pto.index_cast(pto.get_block_idx()) - num_cores = pto.index_cast(pto.get_block_num()) samples_per_core = pto.ceil_div(batch, num_cores) - sample_offset = bid * samples_per_core + sample_offset = vid * samples_per_core with pto.if_context(sample_offset < batch): samples_end = sample_offset + samples_per_core diff --git a/examples/aot/fast_hadamard/plot_perf.py b/examples/aot/fast_hadamard/plot_perf.py new file mode 100644 index 00000000..6a894fd7 --- /dev/null +++ b/examples/aot/fast_hadamard/plot_perf.py @@ -0,0 +1,72 @@ +import os +import csv +try: + import matplotlib.pyplot as plt +except ImportError: + plt = None + + +def plot_bandwidth(input_dir="./perf_data/", output_path="bw_vs_shape.png"): + """Generate bandwidth plot from benchmark CSVs.""" + if plt is None: + print("Warning: matplotlib is not installed; skipping plot generation.") + return + + BENCH_BATCHES = [1, 5, 8, 10, 16, 20, 32, 40, 64, 128, 256, 512, 1024] + BENCH_BLOCK_DIMS = [20, 24] + + fig, axes = plt.subplots(1, len(BENCH_BLOCK_DIMS), figsize=(14, 6), sharey=True) + if len(BENCH_BLOCK_DIMS) == 1: + axes = [axes] + + for ax, block_dim in zip(axes, BENCH_BLOCK_DIMS): + csv_path = os.path.join(input_dir, 
f"fht_pto_bd{block_dim}.csv") + if not os.path.exists(csv_path): + ax.set_title(f"BLOCK_DIM={block_dim} (no data)") + continue + + # Parse CSV: hidden_dim -> {batch: bw} + data = {} + with open(csv_path, encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + batch = int(row["batch"]) + n = int(row["N"]) + bw = float(row["bandwidth_gbs"]) + data.setdefault(n, {})[batch] = bw + + for idx, hidden_dim in enumerate(sorted(data.keys())): + batches = sorted(data[hidden_dim].keys()) + bws = [data[hidden_dim][b] for b in batches] + + if idx < 10: + marker = "o" + else: + last_markers = ["s", "^", "D"] + marker = last_markers[idx - 10] + + ax.plot( + batches, + bws, + marker=marker, + markersize=4, + label=f"hidden_dim={hidden_dim}", + ) + + ax.set_xscale("log", base=2) + ax.set_xticks(BENCH_BATCHES) + ax.set_xticklabels([str(b) for b in BENCH_BATCHES], rotation=45, fontsize=7) + ax.set_xlabel("batch") + ax.set_title(f"BLOCK_DIM={block_dim}") + ax.grid(True, alpha=0.3) + ax.legend(fontsize=7, ncol=2) + + axes[0].set_ylabel("Bandwidth (GB/s)") + fig.suptitle("Fast Hadamard PTO-DSL: Bandwidth vs Shape") + fig.tight_layout() + fig.savefig(input_dir + output_path, dpi=150) + print(f"\nPlot saved to {input_dir+output_path}") + + +if __name__ == "__main__": + plot_bandwidth() diff --git a/examples/aot/fast_hadamard/run_hadamard.py b/examples/aot/fast_hadamard/run_hadamard.py index 65195b88..60ee8aee 100644 --- a/examples/aot/fast_hadamard/run_hadamard.py +++ b/examples/aot/fast_hadamard/run_hadamard.py @@ -1,5 +1,7 @@ +import os import argparse import ctypes +import csv import math import torch @@ -60,12 +62,7 @@ def _is_power_of_two(v): return v > 0 and (v & (v - 1)) == 0 -def test_hadamard(lib_path, block_dim=24): - device = get_test_device() - torch.npu.set_device(device) - - hadamard = load_lib(lib_path=lib_path, block_dim=block_dim) - +def test_hadamard(hadamard_func, block_dim=24): torch.manual_seed(0) dtype = torch.float16 batch_list = [1, 7, 29, 65] @@ 
-80,7 +77,7 @@ def test_hadamard(lib_path, block_dim=24): x = torch.randn(batch, n, device=device, dtype=dtype) y_ref = hadamard_ref_inplace(x) - hadamard(x, batch, n, log2_n) + hadamard_func(x, batch, n, log2_n) torch.npu.synchronize() is_match = True @@ -108,6 +105,80 @@ def test_hadamard(lib_path, block_dim=24): return results +def benchmark(hadamard_func, warmup=2, repeats=20, output_dir="./perf_data/"): + """Benchmark across (batch, N, block_dim) configs. + + Uses separate input tensors per run to avoid L2 cache reuse, + and a single timing-event pair averaged over all runs. + """ + TEST_HIDDEN_DIMS = [128, 256, 512, 1024, 2048, 4096, 8192, 16384] + BENCH_BATCHES = [1, 5, 8, 10, 16, 20, 32, 40, 64, 128, 256, 512, 1024] + BENCH_BLOCK_DIMS = [20, 24] + + os.makedirs(output_dir, exist_ok=True) + + for block_dim in BENCH_BLOCK_DIMS: + print(f"\n{'=' * 60}") + print(f"BENCHMARK (BLOCK_DIM={block_dim})") + print(f"{'=' * 60}") + header = ( + f"{'batch':>6s} {'N':>6s}" + f" {'duration_us':>12s} {'bandwidth_gbs':>14s}" + ) + print(header) + print("-" * len(header)) + + records = [] + + for batch in BENCH_BATCHES: + for n in TEST_HIDDEN_DIMS: + log2_n = int(math.log2(n)) + allocated = warmup + repeats + + # Separate GM tensors to avoid L2 cache reuse + x_list = [ + torch.randn(batch, n, device="npu", dtype=torch.float16) + for _ in range(allocated) + ] + + # Warmup + for i in range(warmup): + hadamard_func(x_list[i], batch, n, log2_n, block_dim=block_dim) + torch.npu.synchronize() + + # Timed runs — single event pair, average over repeats + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + + start.record() + for i in range(repeats): + hadamard_func( + x_list[warmup + i], + batch, + n, + log2_n, + block_dim=block_dim, + ) + end.record() + torch.npu.synchronize() + + duration_ms = start.elapsed_time(end) / repeats + dur_us = duration_ms * 1e3 + + # Bandwidth: read + write = 2 * batch * n * sizeof(half) + data_bytes = 2 * batch * n 
* 2 + bw_gbs = (data_bytes / 1e9) / (dur_us / 1e6) if dur_us > 0 else 0.0 + + print(f"{batch:>6d} {n:>6d}" f" {dur_us:>12.2f} {bw_gbs:>14.2f}") + records.append(f"{batch},{n},{dur_us:.4f},{bw_gbs:.4f}") + + csv_path = os.path.join(output_dir, f"fht_pto_bd{block_dim}.csv") + with open(csv_path, "w", encoding="utf-8") as f: + f.write("batch,N,duration_us,bandwidth_gbs\n") + f.write("\n".join(records) + "\n") + print(f"\nSaved to {csv_path}") + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -122,9 +193,16 @@ def test_hadamard(lib_path, block_dim=24): help="Kernel blockDim (default: 24).", ) args = parser.parse_args() + lib_path = ( "./hadamard_manual_sync_lib.so" if args.manual_sync else "./hadamard_auto_sync_lib.so" ) - test_hadamard(lib_path, block_dim=args.block_dim) + + device = get_test_device() + torch.npu.set_device(device) + hadamard_func = load_lib(lib_path=lib_path, block_dim=args.block_dim) + + test_hadamard(hadamard_func) + benchmark(hadamard_func) From bf95136ae5018ee81b95356008ce45dbd76e82c5 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:59:07 +0100 Subject: [PATCH 06/53] fix vector index counts for other vector examples (#64) * fix vector index counts for dynamic add * fix geglu vector index count --- examples/aot/add_dynamic_multicore/add_builder.py | 8 +++----- .../aot/add_dynamic_multicore/add_double_builder.py | 8 +++----- examples/aot/geglu_dynamic_multicore/geglu_builder.py | 11 ++++++++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/aot/add_dynamic_multicore/add_builder.py b/examples/aot/add_dynamic_multicore/add_builder.py index 72804ce4..df613306 100644 --- a/examples/aot/add_dynamic_multicore/add_builder.py +++ b/examples/aot/add_dynamic_multicore/add_builder.py @@ -43,18 +43,16 @@ def vec_add_1d_dynamic( cid = pto.get_block_idx() sub_bid = pto.get_subblock_idx() sub_bnum = pto.get_subblock_num() - cidmul 
= cid * sub_bnum - vid = cidmul + sub_bid num_blocks = pto.get_block_num() # Convert i64/i32 values to index for arithmetic ops. - vid_idx = pto.index_cast(vid) - num_cores = pto.index_cast(num_blocks) + vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores total_elements = pto.index_cast(argN) num_tiles_global = pto.ceil_div(total_elements, c_tile) num_tiles_per_core = pto.ceil_div(num_tiles_global, num_cores) - tile_offset_this_core = vid_idx * num_tiles_per_core + tile_offset_this_core = vid * num_tiles_per_core with pto.vector_section(): tv0 = pto.as_tensor(tensor_type, ptr=arg0, shape=[total_elements], strides=[c1]) diff --git a/examples/aot/add_dynamic_multicore/add_double_builder.py b/examples/aot/add_dynamic_multicore/add_double_builder.py index 436afeb1..073ceb15 100644 --- a/examples/aot/add_dynamic_multicore/add_double_builder.py +++ b/examples/aot/add_dynamic_multicore/add_double_builder.py @@ -44,18 +44,16 @@ def vec_add_1d_dynamic( cid = pto.get_block_idx() sub_bid = pto.get_subblock_idx() sub_bnum = pto.get_subblock_num() - cidmul = cid * sub_bnum - vid = cidmul + sub_bid num_blocks = pto.get_block_num() # Convert i64/i32 values to index for arithmetic ops. 
- vid_idx = pto.index_cast(vid) - num_cores = pto.index_cast(num_blocks) + vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores total_elements = pto.index_cast(argN) num_tiles_global = pto.ceil_div(total_elements, c_tile) num_tiles_per_core = pto.ceil_div(num_tiles_global, num_cores) - tile_offset_this_core = vid_idx * num_tiles_per_core + tile_offset_this_core = vid * num_tiles_per_core with pto.vector_section(): tv0 = pto.as_tensor(tensor_type, ptr=arg0, shape=[total_elements], strides=[c1]) diff --git a/examples/aot/geglu_dynamic_multicore/geglu_builder.py b/examples/aot/geglu_dynamic_multicore/geglu_builder.py index 092f05de..4151d19c 100644 --- a/examples/aot/geglu_dynamic_multicore/geglu_builder.py +++ b/examples/aot/geglu_dynamic_multicore/geglu_builder.py @@ -80,12 +80,17 @@ def _kernel( with pto.if_context(n_cols > c0): with pto.if_context(c_tile >= n_cols): - bid = pto.index_cast(pto.get_block_idx()) - num_cores = pto.index_cast(pto.get_block_num()) + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores # Distribute rows across cores (row-level parallelism). 
rows_per_core = pto.ceil_div(batch, num_cores) - row_start = bid * rows_per_core + row_start = vid * rows_per_core row_end = pto.min_u(row_start + rows_per_core, batch) num_rows = row_end - row_start From 9dec6399e08095e1fc90ab677d643dfbb8406562 Mon Sep 17 00:00:00 2001 From: Mirko De Vita <61700769+MirkoDeVita98@users.noreply.github.com> Date: Fri, 6 Mar 2026 09:34:32 +0100 Subject: [PATCH 07/53] Rowsum dynamic fp32 test (#63) * added rowsum dynamic multicore tests for fp32 * added more rowsum tests and removed unnecessary checks in builder * use more general batch sizes --------- Co-authored-by: mirkodevita Co-authored-by: learning-chip --- ptodsl/language.py | 4 + tests/npu/rowsum_dynamic_multicore/caller.py | 31 +++++ tests/npu/rowsum_dynamic_multicore/compile.sh | 29 ++++ tests/npu/rowsum_dynamic_multicore/gen_ir.py | 14 ++ .../rowsum_builder.py | 129 ++++++++++++++++++ .../rowsum_dynamic_multicore/test_rowsum.py | 74 ++++++++++ 6 files changed, 281 insertions(+) create mode 100644 tests/npu/rowsum_dynamic_multicore/caller.py create mode 100755 tests/npu/rowsum_dynamic_multicore/compile.sh create mode 100644 tests/npu/rowsum_dynamic_multicore/gen_ir.py create mode 100644 tests/npu/rowsum_dynamic_multicore/rowsum_builder.py create mode 100644 tests/npu/rowsum_dynamic_multicore/test_rowsum.py diff --git a/ptodsl/language.py b/ptodsl/language.py index c05ef54b..43f6365d 100644 --- a/ptodsl/language.py +++ b/ptodsl/language.py @@ -411,3 +411,7 @@ def record_wait_pair(record_op, wait_op, event_id=0): def barrier(sync_op): pto.barrier(_resolve_sync_op(sync_op)) + + +def row_sum(src, tmp, dst): + pto.TRowSumOp(src = src, tmp = tmp, dst = dst) \ No newline at end of file diff --git a/tests/npu/rowsum_dynamic_multicore/caller.py b/tests/npu/rowsum_dynamic_multicore/caller.py new file mode 100644 index 00000000..d62967ea --- /dev/null +++ b/tests/npu/rowsum_dynamic_multicore/caller.py @@ -0,0 +1,31 @@ +"""Generate caller.cpp for the dynamic multicore rowsum kernel 
(fp32). + +Usage: python caller.py +""" + +_BLOCK_DIM = 24 + + +def generate_caller(): + return f"""\ +#include "rowsum.cpp" + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint32_t batch, + uint32_t n_cols) +{{ + _kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(y), + static_cast(batch), + static_cast(n_cols)); +}} +""" + + +if __name__ == "__main__": + print(generate_caller()) diff --git a/tests/npu/rowsum_dynamic_multicore/compile.sh b/tests/npu/rowsum_dynamic_multicore/compile.sh new file mode 100755 index 00000000..4b20495c --- /dev/null +++ b/tests/npu/rowsum_dynamic_multicore/compile.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +TMP=$(mktemp -d) +trap "rm -rf $TMP" EXIT + +python "$SCRIPT_DIR/gen_ir.py" > "$TMP/rowsum.pto" +ptoas --enable-insert-sync "$TMP/rowsum.pto" -o "$TMP/rowsum.cpp" + +python "$SCRIPT_DIR/caller.py" > "$TMP/caller.cpp" + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + "$TMP/caller.cpp" \ + -o "$SCRIPT_DIR/rowsum_lib.so" + +echo "Built rowsum_lib.so successfully." diff --git a/tests/npu/rowsum_dynamic_multicore/gen_ir.py b/tests/npu/rowsum_dynamic_multicore/gen_ir.py new file mode 100644 index 00000000..5e903352 --- /dev/null +++ b/tests/npu/rowsum_dynamic_multicore/gen_ir.py @@ -0,0 +1,14 @@ +"""Print MLIR IR for the dynamic multicore rowsum kernel (fp32). 
+ +Usage: python gen_ir.py +""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from rowsum_builder import build_rowsum + +if __name__ == "__main__": + print(build_rowsum(dtype="fp32")) diff --git a/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py b/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py new file mode 100644 index 00000000..631bcb94 --- /dev/null +++ b/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py @@ -0,0 +1,129 @@ +from ptodsl import to_ir_module +import ptodsl.language as pto + +const = pto.const + +# 32 KB of UB +_TILE_SIZE_BYTES = 32 * 1024 +_DTYPE_BYTES = {"fp16": 2, "fp32": 4} + + +def meta_data(dtype="fp32"): + pto_dtype = {"fp16": pto.float16, "fp32": pto.float32}[dtype] + elements_per_tile = _TILE_SIZE_BYTES // _DTYPE_BYTES[dtype] + ptr_type = pto.PtrType(pto_dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=pto_dtype) + subtensor_in = pto.SubTensorType(shape=[1, elements_per_tile], dtype=pto_dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, elements_per_tile], + valid_shape=[1, -1], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "pto_dtype": pto_dtype, + "elements_per_tile": elements_per_tile, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_in": subtensor_in, + "tile_type": tile_type, + } + + +def build_rowsum(fn_name="rowsum_fp32", dtype="fp32"): + """ + Computes per-row sum across columns using PTO TROWSUM (pto.row_sum wrapper). 
+ + Args: + x_ptr : dtype[batch * n_cols] input matrix flattened row-major + y_ptr : dtype[batch] output vector, one sum per row + batch : int32 + n_cols: int32 (<= elements_per_tile) + + Semantics: + y[row] = sum_{j=0..n_cols-1} x[row, j] + """ + _meta_data = lambda: meta_data(dtype=dtype) + + @to_ir_module(meta_data=_meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + + batch = pto.index_cast(batch_i32) + n_cols = pto.index_cast(n_cols_i32) + + with pto.vector_section(): + bid = pto.index_cast(pto.get_block_idx()) + num_cores = pto.index_cast(pto.get_block_num()) + + rows_per_core = pto.ceil_div(batch, num_cores) + row_start = bid * rows_per_core + row_end = pto.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] + ) + tv_y = pto.as_tensor( + tensor_type, ptr=y_ptr, shape=[batch], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_sum = pto.alloc_tile( + tile_type, valid_col=c1 + ) # scalar output + tb_tmp = pto.alloc_tile( + tile_type, valid_col=n_cols + ) # scratch + + for r in pto.for_range(c0, num_rows, c1): + gm_offset = (row_start + r) * n_cols + + sv_x = pto.slice_view( + subtensor_in, + source=tv_x, + offsets=[gm_offset], + sizes=[n_cols], + ) + + # y is a vector of length batch; write one element per row + sv_y = pto.slice_view( + subtensor_in, + source=tv_y, + offsets=[row_start + r], + sizes=[c1], + ) + + pto.load(sv_x, tb_x) + pto.row_sum(tb_x, tb_tmp, tb_sum) + + # Store the 1-element tile to y[row] + pto.store(tb_sum, sv_y) + + _ = fn_name + return _kernel + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--fn-name", default="rowsum_fp32") + parser.add_argument("--dtype", 
choices=["fp16", "fp32"], default="fp32") + args = parser.parse_args() + print(build_rowsum(fn_name=args.fn_name, dtype=args.dtype)) diff --git a/tests/npu/rowsum_dynamic_multicore/test_rowsum.py b/tests/npu/rowsum_dynamic_multicore/test_rowsum.py new file mode 100644 index 00000000..e8f3c2fc --- /dev/null +++ b/tests/npu/rowsum_dynamic_multicore/test_rowsum.py @@ -0,0 +1,74 @@ +import ctypes +import os +import subprocess + +import pytest +import torch + +from ptodsl.test_util import get_test_device + +torch.manual_seed(0) + +_DIR = os.path.dirname(os.path.abspath(__file__)) +_DEVICE = get_test_device() +_LIB_PATH = os.path.join(_DIR, "rowsum_lib.so") +_BLOCK_DIM = 24 + +_BATCH_LIST = [1, 7, 29, 32, 65, 200] +_N_COLS_LIST = [128, 256, 512, 1024, 2048, 4096, 8192] + +_SHAPE_PARAMS = [ + pytest.param(batch, n_cols, id=f"batch{batch}-cols{n_cols}") + for batch in _BATCH_LIST + for n_cols in _N_COLS_LIST +] + + +@pytest.fixture(scope="session") +def compiled_rowsum(): + subprocess.check_call(["bash", os.path.join(_DIR, "compile.sh")], cwd=_DIR) + yield + os.remove(_LIB_PATH) + + +def test_build_rowsum(compiled_rowsum): + assert os.path.exists(_LIB_PATH) + + +@pytest.mark.require_npu +@pytest.mark.parametrize("batch, n_cols", _SHAPE_PARAMS) +def test_rowsum_precision(compiled_rowsum, batch, n_cols): + import torch_npu # noqa: F401 + + lib = ctypes.CDLL(_LIB_PATH) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_uint32, + ctypes.c_uint32, + ] + lib.call_kernel.restype = None + + torch.npu.set_device(_DEVICE) + x = torch.randn(batch, n_cols, device=_DEVICE, dtype=torch.float32) + y = torch.empty(batch, device=_DEVICE, dtype=torch.float32) + + y_ref = x.float().sum(dim=-1) + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + ctypes.c_uint32(_BLOCK_DIM), + stream_ptr, + ctypes.c_void_p(x.data_ptr()), + ctypes.c_void_p(y.data_ptr()), + ctypes.c_uint32(batch), + 
ctypes.c_uint32(n_cols), + ) + torch.npu.synchronize() + + torch.testing.assert_close(y, y_ref, atol=1e-4, rtol=0) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) From 14d6be1b2172149fdd8c9cb20c96e7a7eb06a23c Mon Sep 17 00:00:00 2001 From: Filip Skogh <43207511+fiskrt@users.noreply.github.com> Date: Fri, 6 Mar 2026 11:45:51 +0100 Subject: [PATCH 08/53] feat: add double buffered matmul matching `torch.matmul` speed (#55) * WIP: dsl * feat: simplify * feat: add finer grained set/wait flags for more pipeline overlap * fix: func signature compatability with caller * feat: fp16xfp16->fp16 matmul with fp32 acc tiles * feat: A/C are now 3d tensors * fix: typo * feat: pre-load B * WIP: double buffer * feat: cleanup * wip: not working * feat: add cpp double buf code for debug * fix: typos * feat: remove extra sync in bench * feat: add double buf cpp ref * feat: convert dsl to fp16 * feat: add modulus and support multiple event_ids * feat: double buffer matmul pto-dsl * fix: remove cpp * feat: simplify * feat: add manual sync option to decorator * Revert "feat: add manual sync option to decorator" This reverts commit 8ccbf68cb8869ab59e4d4cae91148bce4b4d817d. 
* fix: duplicate and typing * fix: type check * feat: move to new directory * fix: remove sync param * feat: add run script * refactor: cleanup cpp ref * feat: add caller and readme * feat: verify and benchmark, add barrier * feat: add optional dev depencdency matplotlib * fix: only plot when specified * fix: only use the signed remainder ops --- .../matmul_dynbatch_multicore_2buf/README.md | 4 + .../matmul_dynbatch_multicore_2buf/caller.cpp | 13 ++ .../matmul_dynbatch_multicore_2buf/compile.sh | 12 ++ .../matmul_dsl.py | 175 +++++++++++++++ .../matmul_ref.cpp | 133 ++++++++++++ .../run_matmul.py | 199 ++++++++++++++++++ ptodsl/bench.py | 5 +- ptodsl/language.py | 22 +- ptodsl/pyproject.toml | 3 + 9 files changed, 558 insertions(+), 8 deletions(-) create mode 100644 examples/aot/matmul_dynbatch_multicore_2buf/README.md create mode 100644 examples/aot/matmul_dynbatch_multicore_2buf/caller.cpp create mode 100755 examples/aot/matmul_dynbatch_multicore_2buf/compile.sh create mode 100644 examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py create mode 100644 examples/aot/matmul_dynbatch_multicore_2buf/matmul_ref.cpp create mode 100644 examples/aot/matmul_dynbatch_multicore_2buf/run_matmul.py diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/README.md b/examples/aot/matmul_dynbatch_multicore_2buf/README.md new file mode 100644 index 00000000..17fc4de9 --- /dev/null +++ b/examples/aot/matmul_dynbatch_multicore_2buf/README.md @@ -0,0 +1,4 @@ +```bash +bash ./compile.sh +python ./run_matmul.py +``` diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/caller.cpp b/examples/aot/matmul_dynbatch_multicore_2buf/caller.cpp new file mode 100644 index 00000000..fff32469 --- /dev/null +++ b/examples/aot/matmul_dynbatch_multicore_2buf/caller.cpp @@ -0,0 +1,13 @@ +#include "mul.cpp" + +extern "C" void call_kernel( + uint32_t blockDim, void* stream, + uint8_t* c, uint8_t* a, uint8_t* b, uint32_t batch_size) +{ + RunTMATMULSplitK<<>>( + reinterpret_cast(c), + 
reinterpret_cast(a), + reinterpret_cast(b), + nullptr, false, batch_size + ); +} diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/compile.sh b/examples/aot/matmul_dynbatch_multicore_2buf/compile.sh new file mode 100755 index 00000000..0cd49d16 --- /dev/null +++ b/examples/aot/matmul_dynbatch_multicore_2buf/compile.sh @@ -0,0 +1,12 @@ +rm mul.cpp matmul_kernel.so + +python ./matmul_dsl.py | ptoas > mul.cpp + +bisheng -fPIC -shared -xcce -O2 -std=c++17 \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -I${ASCEND_TOOLKIT_HOME}/include \ + --cce-soc-version=Ascend910B2 \ + --cce-soc-core-type=CubeCore \ + -I/mounted_home/pto-isa/include \ + ./caller.cpp \ + -o ./matmul_kernel.so diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py b/examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py new file mode 100644 index 00000000..26436b66 --- /dev/null +++ b/examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py @@ -0,0 +1,175 @@ +from mlir.ir import IntegerType + +from ptodsl import to_ir_module +import ptodsl.language as pto + + +def build(M=128, K=128, N=128): + def meta_data(): + dtype = pto.float16 + dtype_acc_tile = pto.float32 + ptr_type = pto.PtrType(dtype) + i32 = pto.int32 + i1 = IntegerType.get_signless(1) + + tensor_type = pto.TensorType(rank=2, dtype=dtype) + tensor_type3d = pto.TensorType(rank=3, dtype=dtype) + + tile_view_a = pto.SubTensorType(shape=[M, K], dtype=dtype) + tile_view_b = pto.SubTensorType(shape=[K, N], dtype=dtype) + tile_view_c = pto.SubTensorType(shape=[M, N], dtype=dtype) + tile_buf_aMat = pto.TileBufType(shape=[M, K], dtype=dtype, memory_space="MAT") + tile_buf_bMat = pto.TileBufType(shape=[K, N], dtype=dtype, memory_space="MAT") + tile_buf_aTile = pto.TileBufType(shape=[M, K], dtype=dtype, memory_space="LEFT") + tile_buf_bTile = pto.TileBufType(shape=[K, N], dtype=dtype, memory_space="RIGHT") + tile_buf_cTile = pto.TileBufType(shape=[M, N], dtype=dtype_acc_tile, memory_space="ACC") + # TODO: Get rid of this? 
+ return locals() + + const = pto.const + + + # Until we have set_dyn_flag with event_id as SSA values + # event_id can be dynamic SSA value + # https://github.com/zhangstevenunity/PTOAS/pull/176 + def record_event(src, dst, event_id): + pto.cond( + event_id == const(0), + lambda: pto.record_event(src, dst, event_id=0), + lambda: pto.record_event(src, dst, event_id=1) + ) + + def wait_event(src, dst, event_id): + pto.cond( + event_id == const(0), + lambda: pto.wait_event(src, dst, event_id=0), + lambda: pto.wait_event(src, dst, event_id=1) + ) + + @to_ir_module(meta_data=meta_data) + def RunTMATMULSplitK( + out_ptr: "ptr_type", + a_ptr: "ptr_type", + b_ptr: "ptr_type", + bias_ptr: "ptr_type", + isBias: "i1", + batch_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + cM = const(M) + cK = const(K) + cN = const(N) + batch = pto.index_cast(batch_i32) + + num_blocks = pto.index_cast(pto.get_block_num()) + # TODO round robin + batches_per_core = pto.ceil_div(batch, num_blocks) + bid = pto.index_cast(pto.get_block_idx()) + b_start = bid * batches_per_core + b_end_unclamped = b_start + batches_per_core + b_end = pto.min_u(b_end_unclamped, batch) + + # TODO: if no batched assigned to this core, early return + + tvA = pto.as_tensor(tensor_type3d, ptr=a_ptr, shape=[batch, cM, cK], strides=[cK*cM, cK, c1]) + tvC = pto.as_tensor(tensor_type3d, ptr=out_ptr, shape=[batch, cM, cN], strides=[cM*cN, cN, c1]) + tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) + + # TODO: pre-fetch more than two tiles into L1 + NUM_BUFFERS = 2 + aMatTiles = [pto.alloc_tile(tile_buf_aMat) for _ in range(NUM_BUFFERS)] + bMatTile = pto.alloc_tile(tile_buf_bMat) + # Ping and pong buffers in L0A/C + aTiles = [pto.alloc_tile(tile_buf_aTile) for _ in range(NUM_BUFFERS)] + cTiles = [pto.alloc_tile(tile_buf_cTile) for _ in range(NUM_BUFFERS)] + bTile = pto.alloc_tile(tile_buf_bTile) + + # Put B in L0B + svB = 
pto.slice_view(tile_view_b, source=tvB, offsets=[c0, c0], sizes=[cK, cN]) + pto.load(svB, bMatTile) + pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) + pto.mov(bMatTile, bTile) + # TODO: wait here so we can use full l1 memory later for A. + + + # load in the first tile from GM->L1 + svA = pto.slice_view(tile_view_a, source=tvA, offsets=[b_start, c0, c0], sizes=[c1, cM, cK]) + curr = c1 - (b_start % c2) + pto.cond( + curr == c1, + lambda: pto.load(svA, aMatTiles[0]), + lambda: pto.load(svA, aMatTiles[1]), + ) + record_event("LOAD", "MOV_M2L", event_id=curr) + + # TODO: fix wait events if batch size is 1/2 + # signal to LOAD that L1 can be overwritten + pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1]) + # signal to MOV that L0 can be overwritten + pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) + # signal to MATMUL that it can overwrite L0C + pto.record_event("STORE_ACC", "MATMUL", event_id=[0, 1]) + + for b_idx in pto.for_range(b_start, b_end, c1): + curr = b_idx % c2 + svA = pto.slice_view(tile_view_a, source=tvA, offsets=[b_idx+c1, c0, c0], sizes=[c1, cM, cK]) + svC = pto.slice_view(tile_view_c, source=tvC, offsets=[b_idx, c0, c0], sizes=[c1, cM, cN]) + + ########## Load tile A for iteration i+1 from GM -> L1 + wait_event("MOV_M2L", "LOAD", event_id=curr) + with pto.if_context(b_idx + c1 < b_end): + pto.cond( + curr == c1, + lambda: pto.load(svA, aMatTiles[0]), + lambda: pto.load(svA, aMatTiles[1]) + ) + record_event("LOAD", "MOV_M2L", event_id=curr) + + + ########## Move A1 and A2 into L0A + wait_event("LOAD", "MOV_M2L", event_id=c1 - curr) + wait_event("MATMUL", "MOV_M2L", event_id=curr) + pto.cond( + curr == c0, + lambda: pto.mov(aMatTiles[0], aTiles[0]), + lambda: pto.mov(aMatTiles[1], aTiles[1]) + ) + with pto.if_context(b_idx + c2 < b_end): + record_event("MOV_M2L", "LOAD", event_id=curr) + record_event("MOV_M2L", "MATMUL", event_id=curr) + + + ########## Perform matmul + wait_event("MOV_M2L", "MATMUL", event_id=curr) + wait_event("STORE_ACC", 
"MATMUL", event_id=curr) + pto.cond( + curr == c0, + lambda: pto.matmul(aTiles[0], bTile, cTiles[0]), + lambda: pto.matmul(aTiles[1], bTile, cTiles[1]), + ) + record_event("MATMUL", "STORE_ACC", event_id=curr) + with pto.if_context(b_idx + c2 < b_end): + record_event("MATMUL", "MOV_M2L", event_id=curr) + + + ######### Store + wait_event("MATMUL", "STORE_ACC", event_id=curr) + pto.cond( + curr == c0, + lambda: pto.store(cTiles[0], svC), + lambda: pto.store(cTiles[1], svC), + ) + with pto.if_context(b_idx + c2 < b_end): + record_event("STORE_ACC", "MATMUL", event_id=curr) + + pto.barrier('LOAD') + + + return RunTMATMULSplitK + + +if __name__ == "__main__": + print(build()) diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/matmul_ref.cpp b/examples/aot/matmul_dynbatch_multicore_2buf/matmul_ref.cpp new file mode 100644 index 00000000..e0848f5d --- /dev/null +++ b/examples/aot/matmul_dynbatch_multicore_2buf/matmul_ref.cpp @@ -0,0 +1,133 @@ +#include "pto/pto-inst.hpp" +using namespace pto; +__global__ AICORE void RunTMATMULSplitK(__gm__ half *v1, __gm__ half *v2, __gm__ half *v3, __gm__ half *v4, bool v5, + int32_t v6) { + unsigned v7 = 16384; + unsigned v8 = 128; + unsigned v9 = 1; + unsigned v10 = 0; + int32_t v11 = 0; + int32_t v12 = 1; + int32_t v13 = 2; + int32_t v14 = 128; + int32_t v15 = 16384; + int64_t v16 = 32768; + int64_t v17 = 65536; + int64_t v18 = 0; + using T = float; + +#if defined(__DAV_CUBE__) + int64_t v19 = get_block_num(); + int32_t v20 = (int32_t)((int64_t)v19); + int32_t v21 = v6 / v20; + int32_t v22 = v6 % v20 != v11 && v6 < v11 == v20 < v11 ? 
v21 + v12 : v21; + int64_t v23 = get_block_idx(); + int32_t v24 = (int32_t)((uint32_t)((int32_t)(int64_t)v23) * (uint32_t)v22); + int32_t v25 = (int32_t)((uint32_t)v24 + (uint32_t)v22); + Tile A1_l1; + TASSIGN(A1_l1, v16); + Tile A2_l1; + TASSIGN(A2_l1, v17); + + Tile A1_l0; + TASSIGN(A1_l0, v18); + Tile A2_l0; + TASSIGN(A2_l0, v16); + + Tile C1_l0; + TASSIGN(C1_l0, v18); + Tile C2_l0; + TASSIGN(C2_l0, v17); + + Tile v28; + TASSIGN(v28, v18); + Tile B_l0; + TASSIGN(B_l0, v18); + pto::Shape<1, 1, 1, 128, 128> v34 = pto::Shape<1, 1, 1, 128, 128>(); + pto::Stride<16384, 16384, 16384, 128, 1> v35 = pto::Stride<16384, 16384, 16384, 128, 1>(); + + using GMType = + GlobalTensor, pto::Stride<16384, 16384, 16384, 128, 1>, pto::Layout::ND>; + GMType v36 = GMType(v3 + (v10 + v10 * (unsigned)v14 + v10 * (unsigned)v12), v34, v35); + TLOAD(v28, v36); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(B_l0, v28); + + pto::Shape<1, 1, 1, 128, 128> v39 = pto::Shape<1, 1, 1, 128, 128>(); + pto::Stride<16384, 16384, 16384, 128, 1> v40 = pto::Stride<16384, 16384, 16384, 128, 1>(); + pto::Shape<1, 1, 1, 128, 128> v49 = pto::Shape<1, 1, 1, 128, 128>(); + pto::Stride<16384, 16384, 16384, 128, 1> v50 = pto::Stride<16384, 16384, 16384, 128, 1>(); + pto::Shape<1, 1, 1, 128, 128> v43 = pto::Shape<1, 1, 1, 128, 128>(); + pto::Stride<16384, 16384, 16384, 128, 1> v44 = pto::Stride<16384, 16384, 16384, 128, 1>(); + pto::Shape<1, 1, 1, 128, 128> v46 = pto::Shape<1, 1, 1, 128, 128>(); + pto::Stride<16384, 16384, 16384, 128, 1> v47 = pto::Stride<16384, 16384, 16384, 128, 1>(); + + int end = ((size_t)((uint32_t)v25 < (uint32_t)v6 ? 
v25 : v6)); + // i-2 + int curr = v24 & 1; + set_flag(PIPE_MTE1, PIPE_MTE2, curr); // set(1) + set_flag(PIPE_M, PIPE_MTE1, curr); // set(3) + set_flag(PIPE_FIX, PIPE_M, curr); // set(4) + + // i-1 + // must load the first tile from GM->l1 here since the loop always loads for + // next iteration + GMType A_gm_first = GMType(v2 + v24 * v15, v39, v40); + // this is iteration i-1, in this case -1 + curr = 1 - curr; // since v24 can start at odd/even i must load the right tile + TLOAD(curr == 1 ? A1_l1 : A2_l1, A_gm_first); + set_flag(PIPE_MTE2, PIPE_MTE1, curr); // set(2) tell MTE1 that MTE2 finished. + + set_flag(PIPE_MTE1, PIPE_MTE2, curr); // set(1) + set_flag(PIPE_M, PIPE_MTE1, curr); // set(3) + set_flag(PIPE_FIX, PIPE_M, curr); // set(4) + + for (size_t i = v24; i < end; i += 1) { + curr = i & 1; + // Global memory for A tiles + GMType v45 = GMType(v2 + (i + 1) * v15, v43, v44); + // GM tile C_1 and C_2 + GMType v48 = GMType(v1 + i * v15, v46, v47); + + // Start loading the tile used in matmul at iteration i+1 + wait_flag(PIPE_MTE1, PIPE_MTE2, curr); // (1, i-2) wait until the MOV at i-2 has completed + if (i + 1 < end) { + TLOAD(curr == 1 ? A1_l1 : A2_l1, v45); + set_flag(PIPE_MTE2, PIPE_MTE1, curr); // set(2, i+1) notify the mov below in iteration i+1 that the load completed + } + + // mov + wait_flag(PIPE_MTE2, PIPE_MTE1, 1 - curr); // (2, i-1) last iteration loaded the tile into l1, so + // for us to move to l0 we wait for last it + wait_flag(PIPE_M, PIPE_MTE1, curr); // (3, i-2) make sure the matmul from + // i-2 finished so we can overwrite l0A + TMOV(curr == 0 ? A1_l0 : A2_l0, curr == 0 ? A1_l1 : A2_l1); + if (i + 2 < end) { + set_flag(PIPE_MTE1, PIPE_MTE2, curr); // set(1, i+2) notify load at iteration i+2 that it's ready + } + set_flag(PIPE_MTE1, PIPE_M, curr); // set(5, i) simply notify matmul at it. i it is ready. + + // matmul + wait_flag(PIPE_FIX, PIPE_M, curr); // (4, i-2) wait until the STORE at it. 
+ // i-2 has written back from L0C + wait_flag(PIPE_MTE1, PIPE_M, curr); // (5, i) need the tile that is moved into L0A at iteration i + TMATMUL(curr == 0 ? C1_l0 : C2_l0, curr == 0 ? A1_l0 : A2_l0, B_l0); + set_flag(PIPE_M, PIPE_FIX, curr); // set(6, i) notify store in this + // iteration i, that matmul is done + if (i + 2 < end) { + set_flag(PIPE_M, PIPE_MTE1, curr); // set(3, i+2) notify mov in iteration i+2, that matmul is done + } + + // store + wait_flag(PIPE_M, PIPE_FIX, curr); // (6, i) wait for matmul in it. i to be done + TSTORE(v48, curr == 0 ? C1_l0 : C2_l0); + if (i + 2 < end) { + set_flag(PIPE_FIX, PIPE_M, curr); // set(4, i+2) notify matmul in i+2 that store is complete + } + } + +#endif // __DAV_CUBE__ + + return; +} diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/run_matmul.py b/examples/aot/matmul_dynbatch_multicore_2buf/run_matmul.py new file mode 100644 index 00000000..0b5be8a9 --- /dev/null +++ b/examples/aot/matmul_dynbatch_multicore_2buf/run_matmul.py @@ -0,0 +1,199 @@ +from typing import Callable, List, Literal, Union +import ctypes +import time +import argparse + +from ptodsl.test_util import get_test_device +from ptodsl import do_bench + +import torch +import torch_npu + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def _dtype_nbytes(dtype: torch.dtype) -> int: + return torch.empty((), dtype=dtype).element_size() + + +def matmul_flops(batch_size: int, m: int, k: int, n: int) -> int: + return 2 * batch_size * m * k * n + + +def matmul_io_bytes(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor) -> int: + # Simple traffic model: read A + read B + write C. 
+ elt = _dtype_nbytes(a.dtype) + return (a.numel() + b.numel() + c.numel()) * elt + + +def benchmark( + fn, + *, + flops: int | None = None, + io_bytes: int | None = None, +) -> dict: + avg_s = do_bench(fn, unit='s', flush_cache=True) + stats = {"avg_ms": avg_s * 1e3} + if flops is not None: + stats["tflops"] = (flops / avg_s) / 1e12 + if io_bytes is not None: + stats["gbps"] = (io_bytes / avg_s) / 1e9 + return stats + + +def print_benchmark(stats: dict) -> None: + parts = [f"{stats['name']}: {stats['avg_ms']:.3f} ms"] + if "tflops" in stats: + parts.append(f"{stats['tflops']:.2f} TFLOP/s") + if "gbps" in stats: + parts.append(f"{stats['gbps']:.2f} GB/s (A+B+C)") + print(" | ".join(parts)) + + +def load_lib(lib_path): + lib = ctypes.CDLL(lib_path) + + def matmul_func( + c, a, b, batch_size, + block_dim, + stream_ptr=None + ): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(c), + torch_to_ctypes(a), + torch_to_ctypes(b), + ctypes.c_uint32(batch_size), + ) + + return matmul_func + + +def plot_benchmark(): + import matplotlib.pyplot as plt + device = get_test_device() + torch.set_default_device(device) + torch.npu.set_device(device) + dtype = torch.float16 + torch.manual_seed(0) + + matmul_func = load_lib("./matmul_kernel.so") + + pto_results, torch_results, pto2_results, pto3_results = [], [], [], [] + m, k, n = 128, 128, 128 + batches = list(range(24*2, 8000, 24*2)) + blk = [24, 1, 6] + for i in batches: + bs = i + a = torch.rand((bs, m, k), device=device, dtype=dtype) + b = torch.rand((k, n), device=device, dtype=dtype) + c = torch.zeros((bs, m, n), device=device, dtype=dtype) + + # correctness check + matmul_func(c, a, b, batch_size=bs, block_dim=24) + torch.npu.synchronize() + c_ref = torch.matmul(a, b) + diff = (c - c_ref).abs().max() + #assert diff <= 1e-5, diff + if diff < 1e-5: + print('.', end='') + else: + print(f'failed at shape: {a.shape} with {diff}') + + 
flops = matmul_flops(bs, m, k, n) + io_bytes = matmul_io_bytes(a, b, c) + + # run a benchmark for warmup (else first iterations are off) + benchmark(lambda: torch.matmul(a, b, out=c)) + + torch_b = benchmark( lambda: torch.matmul(a, b, out=c), + flops=flops, io_bytes=io_bytes)['gbps'] + pto2 = benchmark( lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[1]), + flops=flops, io_bytes=io_bytes)['gbps'] + pto3 = benchmark( lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[2]), + flops=flops, io_bytes=io_bytes)['gbps'] + pto = benchmark( lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[0]), + flops=flops, io_bytes=io_bytes)['gbps'] + pto_results.append(pto) + pto2_results.append(pto2) + pto3_results.append(pto3) + torch_results.append(torch_b) + print() + rel_diff = [our/their for our, their in zip(pto_results, torch_results)] + + fig, ax1 = plt.subplots(figsize=(8,5)) + + ax1.plot(batches, pto_results, '-', label=f'pto-dsl ({blk[0]} cores)') + ax1.plot(batches, pto2_results, '-', label=f'pto-dsl ({blk[1]} cores)') + ax1.plot(batches, pto3_results, '-', label=f'pto-dsl ({blk[2]} cores)') + ax1.plot(batches, torch_results, '-', label='torch.matmul (24 cores)') + ax1.set_xlabel('Batch size') + ax1.set_ylabel('Bandwidth (Read A+B write C) (GB/s)') + ax1.grid(True, linestyle='--', alpha=0.6) + + ax2 = ax1.twinx() + ax2.plot(batches, rel_diff, '-', color='purple', label='pto-dsl / torch') + ax2.set_ylabel('Relative Performance (pto-dsl / torch)') + ax2.set_ylim(0.95*min(rel_diff),1.05*max(rel_diff)) + ax2.axhline(y=1, linestyle='--', linewidth=1.0) + + dt_str = {torch.float16: 'fp16', torch.float32: 'fp32'}[dtype] + plt.title( + f"""pto-dsl kernel vs torch.matmul\n + @<{b.shape[0]}, {b.shape[1]}, {dt_str}>=""" + ) + + lines1, labels1 = ax1.get_legend_handles_labels() + lines2, labels2 = ax2.get_legend_handles_labels() + ax1.legend(lines1 + lines2, labels1 + labels2, loc='best') + plt.tight_layout() + plt.savefig('dsl.png') + + +def 
correctness_verify(): + device = get_test_device() + torch.set_default_device(device) + torch.npu.set_device(device) + dtype = torch.float16 + torch.manual_seed(0) + + matmul_func = load_lib("./matmul_kernel.so") + + m, k, n = 128, 128, 128 + for blk in [1, 24]: + for bs in range(1000, 1100): + a = torch.rand((bs, m, k), device=device, dtype=dtype) + b = torch.rand((k, n), device=device, dtype=dtype) + c = torch.zeros((bs, m, n), device=device, dtype=dtype) + + matmul_func(c, a, b, batch_size=bs, block_dim=blk) + torch.npu.synchronize() + c_ref = torch.matmul(a, b) + + + diff = (c - c_ref).abs().max() + #assert diff <= 1e-5, diff + if diff < 1e-5: + print('.', end='', flush=True) + else: + print(f'#cores={blk} failed at shape: {list(a.shape)} with error:{diff}') + print() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--benchmark", + dest="benchmark", + action="store_true", + help="Enable benchmarking" + ) + args = parser.parse_args() + correctness_verify() + if args.benchmark: + plot_benchmark() diff --git a/ptodsl/bench.py b/ptodsl/bench.py index b9efa909..d413cccd 100644 --- a/ptodsl/bench.py +++ b/ptodsl/bench.py @@ -36,15 +36,16 @@ def do_bench( fn() torch_npu.npu.synchronize() + # It's not easy to time a kernel in a way that satisfies the following two at the same time: + # 1) Ignores cache flushing, and 2) Ignoring kernel launch overhead. Here we ignore cache flushing. 
for i in range(benchmark_iters): if flush_cache: cache.zero_() - torch_npu.npu.synchronize() start_events[i].record() fn() end_events[i].record() - torch_npu.npu.synchronize() + torch_npu.npu.synchronize() f = {"s": 1e-3, "ms": 1e0, "us": 1e3, "ns": 1e6}[unit] times = [f * s.elapsed_time(e) for s, e in zip(start_events, end_events)] if aggregation == "mean": diff --git a/ptodsl/language.py b/ptodsl/language.py index 43f6365d..99bd6b23 100644 --- a/ptodsl/language.py +++ b/ptodsl/language.py @@ -1,4 +1,5 @@ from contextlib import contextmanager +from typing import Sequence from mlir.dialects import arith, pto, scf from mlir.ir import F16Type, F32Type, IndexType, InsertionPoint, IntegerType @@ -66,7 +67,7 @@ def __le__(self, other): def __ge__(self, other): return Value._cmp(self, other, arith.CmpIPredicate.sge) - + def __eq__(self, other): return Value._cmp(self, other, arith.CmpIPredicate.eq) @@ -393,15 +394,24 @@ def _resolve_event_id(event_id): return event_id -def record_event(record_op, wait_op, event_id=0): - pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) +def record_event(record_op, wait_op, event_id: int|Sequence[int]=0): + if not isinstance(event_id, int): + for eid in event_id: + pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(eid)) + else: + pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) + -def wait_event(record_op, wait_op, event_id=0): - pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) +def wait_event(record_op, wait_op, event_id: int|Sequence[int]=0): + if not isinstance(event_id, int): + for eid in event_id: + pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(eid)) + else: + pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) -def record_wait_pair(record_op, 
wait_op, event_id=0): +def record_wait_pair(record_op, wait_op, event_id: int|Sequence[int]=0): rec = _resolve_sync_op(record_op) w = _resolve_sync_op(wait_op) ev = _resolve_event_id(event_id) diff --git a/ptodsl/pyproject.toml b/ptodsl/pyproject.toml index 2b57c249..a788ec3c 100644 --- a/ptodsl/pyproject.toml +++ b/ptodsl/pyproject.toml @@ -11,6 +11,9 @@ authors = [ { name = "pto-dsl contributors" } ] +[project.optional-dependencies] +dev = ["matplotlib"] + [tool.setuptools] packages = ["ptodsl"] From c9d17b0d65e1d9758e39dc6f4f1dcf22bba5062d Mon Sep 17 00:00:00 2001 From: Mirko De Vita <61700769+MirkoDeVita98@users.noreply.github.com> Date: Sat, 7 Mar 2026 20:12:51 +0100 Subject: [PATCH 09/53] elementwise unary fp16 fp32 dynamic multicore tests (#67) * added elementwise unary tests or fp16 and fp32 * reduce test case numbers * register require_npu marker --------- Co-authored-by: mirkodevita Co-authored-by: learning-chip --- ptodsl/language.py | 8 + pytest.ini | 3 + .../.gitignore | 4 + .../caller.py | 35 +++++ .../clean.sh | 3 + .../compile.sh | 33 ++++ .../gen_ir.py | 35 +++++ .../test_unary_builder.py | 141 ++++++++++++++++++ .../unary_builder.py | 116 ++++++++++++++ 9 files changed, 378 insertions(+) create mode 100644 pytest.ini create mode 100644 tests/npu/elementwise_unary_dynamic_multicore/.gitignore create mode 100644 tests/npu/elementwise_unary_dynamic_multicore/caller.py create mode 100644 tests/npu/elementwise_unary_dynamic_multicore/clean.sh create mode 100644 tests/npu/elementwise_unary_dynamic_multicore/compile.sh create mode 100644 tests/npu/elementwise_unary_dynamic_multicore/gen_ir.py create mode 100644 tests/npu/elementwise_unary_dynamic_multicore/test_unary_builder.py create mode 100644 tests/npu/elementwise_unary_dynamic_multicore/unary_builder.py diff --git a/ptodsl/language.py b/ptodsl/language.py index 99bd6b23..61db0611 100644 --- a/ptodsl/language.py +++ b/ptodsl/language.py @@ -289,6 +289,14 @@ def sqrt(inp, out): pto.TSqrtOp(inp, out) 
+def rsqrt(inp, out): + pto.TRsqrtOp(inp, out) + + +def reciprocal(inp, out): + pto.TRecipOp(inp, out) + + def store(source, dest): pto.TStoreOp(None, source, dest) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..ea508c64 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + require_npu: marks tests as requiring NPU hardware (deselect with '-m "not require_npu"') diff --git a/tests/npu/elementwise_unary_dynamic_multicore/.gitignore b/tests/npu/elementwise_unary_dynamic_multicore/.gitignore new file mode 100644 index 00000000..d44846d3 --- /dev/null +++ b/tests/npu/elementwise_unary_dynamic_multicore/.gitignore @@ -0,0 +1,4 @@ +*.pto +*.cpp +*_lib.so +caller.cpp diff --git a/tests/npu/elementwise_unary_dynamic_multicore/caller.py b/tests/npu/elementwise_unary_dynamic_multicore/caller.py new file mode 100644 index 00000000..ab6cf518 --- /dev/null +++ b/tests/npu/elementwise_unary_dynamic_multicore/caller.py @@ -0,0 +1,35 @@ +"""Generate caller.cpp for a given unary op name.""" + +import sys + +_DTYPE_TO_CTYPE = { + "float32": "float", + "float16": "half", + "int32": "int32_t", + "int16": "int16_t", +} + +_BLOCK_DIM = 24 + + +def generate_caller(op_name, dtype="float32"): + ctype = _DTYPE_TO_CTYPE[dtype] + return f"""\ +#include "{op_name}_{dtype}.cpp" + +extern "C" void call_kernel( + void *stream, uint8_t *x, uint8_t *y, int32_t batch, int32_t n_cols) +{{ + _kernel<<<{_BLOCK_DIM}, nullptr, stream>>>( + ({ctype} *)x, ({ctype} *)y, batch, n_cols); +}} +""" + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python caller.py [dtype]", file=sys.stderr) + sys.exit(1) + op_name = sys.argv[1] + dtype = sys.argv[2] if len(sys.argv) > 2 else "float32" + print(generate_caller(op_name, dtype)) diff --git a/tests/npu/elementwise_unary_dynamic_multicore/clean.sh b/tests/npu/elementwise_unary_dynamic_multicore/clean.sh new file mode 100644 index 00000000..5ab34240 --- /dev/null +++ 
b/tests/npu/elementwise_unary_dynamic_multicore/clean.sh @@ -0,0 +1,3 @@ +#!/bin/bash +rm -f *.pto *.cpp *_lib.so caller.cpp +echo "Cleaned generated files." diff --git a/tests/npu/elementwise_unary_dynamic_multicore/compile.sh b/tests/npu/elementwise_unary_dynamic_multicore/compile.sh new file mode 100644 index 00000000..84ae81cc --- /dev/null +++ b/tests/npu/elementwise_unary_dynamic_multicore/compile.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +OP=${1:?Usage: compile.sh [dtype]} +DTYPE=${2:-float32} + +TMP=$(mktemp -d) +trap "rm -rf $TMP" EXIT + +python "$SCRIPT_DIR/gen_ir.py" "$OP" "$DTYPE" > "$TMP/${OP}_${DTYPE}.pto" +ptoas --enable-insert-sync "$TMP/${OP}_${DTYPE}.pto" -o "$TMP/${OP}_${DTYPE}.cpp" + +python "$SCRIPT_DIR/caller.py" "$OP" "$DTYPE" > "$TMP/caller.cpp" + +PTO_LIB_PATH=/sources/pto-isa +bisheng \ + -I${PTO_LIB_PATH}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + "$TMP/caller.cpp" \ + -o "$SCRIPT_DIR/${OP}_${DTYPE}_lib.so" + +echo "Built ${OP}_${DTYPE}_lib.so successfully." diff --git a/tests/npu/elementwise_unary_dynamic_multicore/gen_ir.py b/tests/npu/elementwise_unary_dynamic_multicore/gen_ir.py new file mode 100644 index 00000000..90dfd7ab --- /dev/null +++ b/tests/npu/elementwise_unary_dynamic_multicore/gen_ir.py @@ -0,0 +1,35 @@ +"""Print MLIR IR for a unary op at a given dtype. 
+ +Usage: python gen_ir.py [dtype] +""" + +import sys +import os + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +import ptodsl.language as pto +from unary_builder import build_unary_kernel + +_OPS = { + "rsqrt": pto.rsqrt, + "sqrt": pto.sqrt, + "exp": pto.exp, + "log": pto.log, + "relu": pto.relu, + "abs": pto.abs, + "reciprocal": pto.reciprocal, +} + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python gen_ir.py [dtype]", file=sys.stderr) + sys.exit(1) + op_name = sys.argv[1] + dtype = sys.argv[2] if len(sys.argv) > 2 else "float32" + + if op_name not in _OPS: + print(f"Unknown op: {op_name}. Available: {list(_OPS)}", file=sys.stderr) + sys.exit(1) + + print(build_unary_kernel(op_name, _OPS[op_name], dtype=dtype)) diff --git a/tests/npu/elementwise_unary_dynamic_multicore/test_unary_builder.py b/tests/npu/elementwise_unary_dynamic_multicore/test_unary_builder.py new file mode 100644 index 00000000..65411aff --- /dev/null +++ b/tests/npu/elementwise_unary_dynamic_multicore/test_unary_builder.py @@ -0,0 +1,141 @@ +import os +import ctypes +import subprocess + +import pytest +import torch +from ptodsl.test_util import get_test_device + +torch.manual_seed(0) + +_DIR = os.path.dirname(os.path.abspath(__file__)) +_DEVICE = get_test_device() + +UNARY_OPS = [ + ("rsqrt", lambda x: x.rsqrt()), + ("sqrt", lambda x: x.sqrt()), + ("exp", lambda x: x.exp()), + ("log", lambda x: x.log()), + ("relu", lambda x: x.relu()), + ("abs", lambda x: x.abs()), + ("reciprocal", lambda x: x.reciprocal()), +] + +DTYPES = ["float32", "float16"] + +TORCH_DTYPES = { + "float32": torch.float32, + "float16": torch.float16, +} + +_SHAPE_LIST = [ + (1, 128), + (7, 1024), + (29, 512), + (32, 2048), + (65, 4096), + (200, 8192), +] + +_SHAPE_PARAMS = [ + pytest.param(batch, n_cols, id=f"batch{batch}-cols{n_cols}") + for batch, n_cols in _SHAPE_LIST +] + +_PARAMS = [ + pytest.param((op_name, ref_fn, dtype), id=f"{op_name}-{dtype}") + for op_name, ref_fn in 
UNARY_OPS + for dtype in DTYPES +] + + +@pytest.fixture(scope="session", params=_PARAMS) +def compiled_lib(request): + op_name, ref_fn, dtype = request.param + subprocess.check_call( + ["bash", os.path.join(_DIR, "compile.sh"), op_name, dtype], + cwd=_DIR, + ) + yield { + "op_name": op_name, + "ref_fn": ref_fn, + "dtype": dtype, + "lib_path": _lib_path(op_name, dtype), + } + os.remove(_lib_path(op_name, dtype)) + + +def _make_input(shape, device, dtype, op_name): + """Return a suitable input tensor for the given op. + + rsqrt: inputs in (1.0, 2.0] — keeps outputs near 1.0 + so float16 absolute error stays within 2e-3. + sqrt/log: inputs in (0.1, 1.1]. + exp: inputs in (-0.5, 0.5] to avoid float16 overflow. + relu/abs: inputs in (-1.0, 1.0] to exercise both signs. + """ + if op_name in {"rsqrt", "reciprocal"}: + return torch.rand(shape, device=device, dtype=dtype) + 1.0 + elif op_name in {"sqrt", "log"}: + return torch.rand(shape, device=device, dtype=dtype) + 0.1 + elif op_name == "exp": + return torch.rand(shape, device=device, dtype=dtype) - 0.5 + else: + return torch.rand(shape, device=device, dtype=dtype) * 2.0 - 1.0 + + +def _lib_to_func_unary(lib): + lib.call_kernel.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int32, + ctypes.c_int32, + ] + lib.call_kernel.restype = None + + def fn(x, y): + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + stream_ptr, + ctypes.c_void_p(x.data_ptr()), + ctypes.c_void_p(y.data_ptr()), + ctypes.c_int32(x.size(0)), + ctypes.c_int32(x.size(1)), + ) + + return fn + + +def _lib_path(op_name, dtype): + return os.path.join(_DIR, f"{op_name}_{dtype}_lib.so") + + +def test_build_unary_kernels(compiled_lib): + assert os.path.exists(_lib_path(compiled_lib["op_name"], compiled_lib["dtype"])) + + +@pytest.mark.require_npu +@pytest.mark.parametrize("batch, n_cols", _SHAPE_PARAMS) +def test_unary_precision(compiled_lib, batch, n_cols): + import torch_npu # noqa: F401 + + 
torch.npu.set_device(_DEVICE) + op_name = compiled_lib["op_name"] + ref_fn = compiled_lib["ref_fn"] + torch_dtype = TORCH_DTYPES[compiled_lib["dtype"]] + + lib = ctypes.CDLL(compiled_lib["lib_path"]) + kernel = _lib_to_func_unary(lib) + + x = _make_input((batch, n_cols), _DEVICE, torch_dtype, op_name) + y = torch.empty(batch, n_cols, device=_DEVICE, dtype=torch_dtype) + kernel(x, y) + torch.npu.synchronize() + y_ref = ref_fn(x) + torch.npu.synchronize() + torch.testing.assert_close(y, y_ref, atol=2e-3, rtol=1e-3) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) diff --git a/tests/npu/elementwise_unary_dynamic_multicore/unary_builder.py b/tests/npu/elementwise_unary_dynamic_multicore/unary_builder.py new file mode 100644 index 00000000..ac397ad5 --- /dev/null +++ b/tests/npu/elementwise_unary_dynamic_multicore/unary_builder.py @@ -0,0 +1,116 @@ +from ptodsl import to_ir_module +import ptodsl.language as pto + +const = pto.const + +# 32 KB of UB +_TILE_SIZE_BYTES = 32 * 1024 +_DTYPE_BYTES = {"float32": 4, "float16": 2} + + +def meta_data(dtype="float32"): + pto_dtype = {"float32": pto.float32, "float16": pto.float16}[dtype] + elements_per_tile = _TILE_SIZE_BYTES // _DTYPE_BYTES[dtype] + ptr_type = pto.PtrType(pto_dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=pto_dtype) + subtensor_type = pto.SubTensorType(shape=[1, elements_per_tile], dtype=pto_dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, elements_per_tile], + valid_shape=[1, -1], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "pto_dtype": pto_dtype, + "elements_per_tile": elements_per_tile, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + } + + +def build_unary_kernel(op_name, op_fn, dtype="float32"): + """ + Dynamic multicore unary elementwise kernel. 
+ + Args: + x_ptr : dtype[batch * n_cols] input matrix, row-major + y_ptr : dtype[batch * n_cols] output matrix + batch_i32 : int32 number of rows + n_cols_i32 : int32 elements per row; must be <= elements_per_tile + + Semantics: + y[r, c] = op(x[r, c]) + """ + _meta_data = lambda: meta_data(dtype=dtype) + + @to_ir_module(meta_data=_meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile = const(elements_per_tile) + + batch = pto.index_cast(batch_i32) + n_cols = pto.index_cast(n_cols_i32) + + with pto.vector_section(): + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = pto.index_cast(cid * sub_bnum + sub_bid) + num_cores = pto.index_cast(num_blocks * sub_bnum) + + rows_per_core = pto.ceil_div(batch, num_cores) + row_start = vid * rows_per_core + row_end = pto.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] + ) + tv_y = pto.as_tensor( + tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_y = pto.alloc_tile(tile_type, valid_col=n_cols) + + for row_i in pto.for_range(c0, num_rows, c1): + gm_offset = (row_start + row_i) * n_cols + + sv_x = pto.slice_view( + subtensor_type, + source=tv_x, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_y = pto.slice_view( + subtensor_type, + source=tv_y, + offsets=[gm_offset], + sizes=[n_cols], + ) + + pto.load(sv_x, tb_x) + op_fn(tb_x, tb_y) + pto.store(tb_y, sv_y) + + _ = op_name + return _kernel From 1d888fcd47fe509d214c3479a213ae6aa4bf405e Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Sat, 7 Mar 2026 23:10:09 +0100 
Subject: [PATCH 10/53] Refactor ptodsl package module/namespace layout (#69) * refactor ptodsl module namespaces * fix lazy-import * module clean-up * comment on the need to lazy-eval * pto.subset -> tile.subset * further split pto.py file * multi-level dir layout * avoid overwriting sys.modules * lazy-import * pto.for_range -> pto.range --- .../aot/add_dynamic_multicore/add_builder.py | 22 +- .../add_double_builder.py | 24 +- .../aot/add_static_multicore/add_builder.py | 15 +- .../aot/fast_hadamard/hadamard_builder.py | 78 ++-- .../geglu_dynamic_multicore/geglu_builder.py | 44 +- .../matmul_builder.py | 34 +- .../matmul_dsl.py | 28 +- .../matmul_builder.py | 28 +- .../matmul_builder.py | 22 +- .../relu_dynamic_multicore/relu_builder.py | 24 +- examples/jit/add_dynamic_multicore/run_add.py | 22 +- .../jit/add_static_multicore/run_add_1d.py | 10 +- .../jit/add_static_multicore/run_add_2d.py | 14 +- .../run_batch_matmul.py | 34 +- ptodsl/__init__.py | 390 +--------------- ptodsl/api/__init__.py | 3 + ptodsl/api/control_flow.py | 52 +++ ptodsl/api/pto.py | 58 +++ ptodsl/api/pto_general.py | 84 ++++ ptodsl/api/scalar.py | 164 +++++++ ptodsl/api/synchronization.py | 70 +++ ptodsl/api/tile.py | 107 +++++ ptodsl/api/type_def.py | 105 +++++ ptodsl/bench.py | 54 +-- ptodsl/compiler/__init__.py | 4 + ptodsl/compiler/ir.py | 107 +++++ ptodsl/compiler/jit.py | 296 ++++++++++++ ptodsl/language.py | 435 ------------------ ptodsl/pto.py | 6 + ptodsl/scalar.py | 6 + ptodsl/test_util.py | 32 +- ptodsl/tile.py | 6 + ptodsl/utils/__init__.py | 4 + ptodsl/utils/bench.py | 54 +++ ptodsl/utils/test_util.py | 19 + tests/frontend/test_add_dynamic_ir.py | 23 +- tests/frontend/test_add_ir.py | 71 +-- tests/frontend/test_matmul_dynamic_ir.py | 33 +- .../elementwise_dynamic_multicore/builder.py | 38 +- .../elementwise_dynamic_multicore/gen_ir.py | 12 +- .../gen_ir.py | 16 +- .../unary_builder.py | 20 +- tests/npu/gather_dynamic_multicore/builder.py | 24 +- 
tests/npu/gather_static_singlecore/builder.py | 10 +- .../rowsum_builder.py | 24 +- 45 files changed, 1500 insertions(+), 1226 deletions(-) create mode 100644 ptodsl/api/__init__.py create mode 100644 ptodsl/api/control_flow.py create mode 100644 ptodsl/api/pto.py create mode 100644 ptodsl/api/pto_general.py create mode 100644 ptodsl/api/scalar.py create mode 100644 ptodsl/api/synchronization.py create mode 100644 ptodsl/api/tile.py create mode 100644 ptodsl/api/type_def.py create mode 100644 ptodsl/compiler/__init__.py create mode 100644 ptodsl/compiler/ir.py create mode 100644 ptodsl/compiler/jit.py delete mode 100644 ptodsl/language.py create mode 100644 ptodsl/pto.py create mode 100644 ptodsl/scalar.py create mode 100644 ptodsl/tile.py create mode 100644 ptodsl/utils/__init__.py create mode 100644 ptodsl/utils/bench.py create mode 100644 ptodsl/utils/test_util.py diff --git a/examples/aot/add_dynamic_multicore/add_builder.py b/examples/aot/add_dynamic_multicore/add_builder.py index df613306..c6e0d9e5 100644 --- a/examples/aot/add_dynamic_multicore/add_builder.py +++ b/examples/aot/add_dynamic_multicore/add_builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const def meta_data(): @@ -46,12 +46,12 @@ def vec_add_1d_dynamic( num_blocks = pto.get_block_num() # Convert i64/i32 values to index for arithmetic ops. 
- vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index - num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores - total_elements = pto.index_cast(argN) + vid = s.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = s.index_cast(num_blocks * sub_bnum) # number of vector cores + total_elements = s.index_cast(argN) - num_tiles_global = pto.ceil_div(total_elements, c_tile) - num_tiles_per_core = pto.ceil_div(num_tiles_global, num_cores) + num_tiles_global = s.ceil_div(total_elements, c_tile) + num_tiles_per_core = s.ceil_div(num_tiles_global, num_cores) tile_offset_this_core = vid * num_tiles_per_core with pto.vector_section(): @@ -69,13 +69,13 @@ def vec_add_1d_dynamic( need_truncate = tiles_end_this_core > num_tiles_global remaining_tiles = num_tiles_global - tile_offset_this_core - tiles_to_process = pto.select( + tiles_to_process = s.select( need_truncate, remaining_tiles, num_tiles_per_core ) elements_to_process = tiles_to_process * c_tile with pto.if_context(elements_to_process > c0): - for i in pto.for_range(c0, tiles_to_process, c1): + for i in pto.range(c0, tiles_to_process, c1): tile_offset_global = i + tile_offset_this_core offset_global = tile_offset_global * c_tile @@ -91,7 +91,7 @@ def vec_add_1d_dynamic( pto.load(sv0, tb0) pto.load(sv1, tb1) - pto.add(tb0, tb1, tb2) + tile.add(tb0, tb1, tb2) pto.store(tb2, sv2) diff --git a/examples/aot/add_dynamic_multicore/add_double_builder.py b/examples/aot/add_dynamic_multicore/add_double_builder.py index 073ceb15..a2da0b60 100644 --- a/examples/aot/add_dynamic_multicore/add_double_builder.py +++ b/examples/aot/add_dynamic_multicore/add_double_builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const def meta_data(): @@ -47,12 +47,12 @@ def vec_add_1d_dynamic( num_blocks = pto.get_block_num() # Convert i64/i32 values to 
index for arithmetic ops. - vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index - num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores - total_elements = pto.index_cast(argN) + vid = s.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = s.index_cast(num_blocks * sub_bnum) # number of vector cores + total_elements = s.index_cast(argN) - num_tiles_global = pto.ceil_div(total_elements, c_tile) - num_tiles_per_core = pto.ceil_div(num_tiles_global, num_cores) + num_tiles_global = s.ceil_div(total_elements, c_tile) + num_tiles_per_core = s.ceil_div(num_tiles_global, num_cores) tile_offset_this_core = vid * num_tiles_per_core with pto.vector_section(): @@ -74,13 +74,13 @@ def vec_add_1d_dynamic( need_truncate = tiles_end_this_core > num_tiles_global remaining_tiles = num_tiles_global - tile_offset_this_core - tiles_to_process = pto.select( + tiles_to_process = s.select( need_truncate, remaining_tiles, num_tiles_per_core ) elements_to_process = tiles_to_process * c_tile with pto.if_context(elements_to_process > c0): - for i in pto.for_range(c0, tiles_to_process, c1): + for i in pto.range(c0, tiles_to_process, c1): tile_offset_global = i + tile_offset_this_core offset_global = tile_offset_global * c_tile @@ -96,12 +96,12 @@ def vec_add_1d_dynamic( with pto.if_context((i % c2) == c0, has_else=True) as branch: pto.load(sv0, tb0_ping) pto.load(sv1, tb1_ping) - pto.add(tb0_ping, tb1_ping, tb2_ping) + tile.add(tb0_ping, tb1_ping, tb2_ping) pto.store(tb2_ping, sv2) with branch.else_context(): pto.load(sv0, tb0_pong) pto.load(sv1, tb1_pong) - pto.add(tb0_pong, tb1_pong, tb2_pong) + tile.add(tb0_pong, tb1_pong, tb2_pong) pto.store(tb2_pong, sv2) diff --git a/examples/aot/add_static_multicore/add_builder.py b/examples/aot/add_static_multicore/add_builder.py index 0525f4c7..93865a47 100644 --- a/examples/aot/add_static_multicore/add_builder.py +++ b/examples/aot/add_static_multicore/add_builder.py @@ -1,6 +1,7 @@ -from ptodsl 
import to_ir_module -import ptodsl.language as pto -const = pto.const +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const def meta_data(): @@ -42,14 +43,14 @@ def vec_add_kernel_2d_dynamic( cidmul = cid * sub_bnum vid = cidmul + sub_bid - v_row_idx = pto.index_cast(arg_vrow_i32) - v_col_idx = pto.index_cast(arg_vcol_i32) + v_row_idx = s.index_cast(arg_vrow_i32) + v_col_idx = s.index_cast(arg_vcol_i32) tv0 = pto.as_tensor(tensor_type, ptr=arg0, shape=[c1280, c32], strides=[c32, c1]) tv1 = pto.as_tensor(tensor_type, ptr=arg1, shape=[c1280, c32], strides=[c32, c1]) tv2 = pto.as_tensor(tensor_type, ptr=arg2, shape=[c1280, c32], strides=[c32, c1]) - vid_idx = pto.index_cast(vid) + vid_idx = s.index_cast(vid) offset_row = vid_idx * c32 # every core loads 32 rows of data sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32]) sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32]) @@ -62,7 +63,7 @@ def vec_add_kernel_2d_dynamic( pto.load(sv0, tb0) pto.load(sv1, tb1) - pto.add(tb0, tb1, tb2) + tile.add(tb0, tb1, tb2) pto.store(tb2, sv2) # `default `return None` maps to `func.ReturnOp([])` diff --git a/examples/aot/fast_hadamard/hadamard_builder.py b/examples/aot/fast_hadamard/hadamard_builder.py index 4ea10e12..032ff728 100644 --- a/examples/aot/fast_hadamard/hadamard_builder.py +++ b/examples/aot/fast_hadamard/hadamard_builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const ELEMENTS_PER_TILE = 32 * 1024 // 2 # 32KB UB / sizeof(fp16) HALF_ELEMENTS_PER_TILE = ELEMENTS_PER_TILE // 2 @@ -54,25 +54,25 @@ def fast_hadamard_autosync( c1 = const(1) c2 = const(2) - batch = pto.index_cast(batch_i32) - n = pto.index_cast(n_i32) - log2_n = pto.index_cast(log2_n_i32) + batch = s.index_cast(batch_i32) + n = s.index_cast(n_i32) 
+ log2_n = s.index_cast(log2_n_i32) cid = pto.get_block_idx() sub_bid = pto.get_subblock_idx() sub_bnum = pto.get_subblock_num() num_blocks = pto.get_block_num() - vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index - num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores + vid = s.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = s.index_cast(num_blocks * sub_bnum) # number of vector cores with pto.vector_section(): - samples_per_core = pto.ceil_div(batch, num_cores) + samples_per_core = s.ceil_div(batch, num_cores) sample_offset = vid * samples_per_core with pto.if_context(sample_offset < batch): samples_end = sample_offset + samples_per_core - samples_to_process = pto.select( + samples_to_process = s.select( samples_end > batch, batch - sample_offset, samples_per_core, @@ -100,31 +100,31 @@ def fast_hadamard_autosync( # poorly with static tile subset sizing in current PTO Python # bindings and can corrupt rows for larger batches. samples_per_load = c1 - num_chunks = pto.ceil_div(samples_to_process, samples_per_load) + num_chunks = s.ceil_div(samples_to_process, samples_per_load) def process_rows(tb_row, tb_even, tb_odd, gm_offset, cur_samples): - for s in pto.for_range(c0, cur_samples, c1): + for s in pto.range(c0, cur_samples, c1): row_offset = gm_offset + s * n sv_row = pto.slice_view( subtensor_full, source=tv_x, offsets=[row_offset], sizes=[n] ) # Alias row halves inside UB row tile (no GM round-trip # per Hadamard iteration). 
- tb_first = pto.subset(tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE]) - tb_second = pto.subset(tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE]) + tb_first = tile.subset(tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE]) + tb_second = tile.subset(tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE]) pto.load(sv_row, tb_row) - for _ in pto.for_range(c0, log2_n, c1): - pto.gather(tb_row, tb_even, mask_pattern="P0101") - pto.gather(tb_row, tb_odd, mask_pattern="P1010") - pto.add(tb_even, tb_odd, tb_first) - pto.sub(tb_even, tb_odd, tb_second) + for _ in pto.range(c0, log2_n, c1): + tile.gather(tb_row, tb_even, mask_pattern="P0101") + tile.gather(tb_row, tb_odd, mask_pattern="P1010") + tile.add(tb_even, tb_odd, tb_first) + tile.sub(tb_even, tb_odd, tb_second) pto.store(tb_row, sv_row) - for chunk_i in pto.for_range(c0, num_chunks, c1): + for chunk_i in pto.range(c0, num_chunks, c1): sample_done = chunk_i * samples_per_load chunk_left = samples_to_process - sample_done - cur_samples = pto.select( + cur_samples = s.select( chunk_left < samples_per_load, chunk_left, samples_per_load ) @@ -153,25 +153,25 @@ def fast_hadamard_manualsync( c1 = const(1) c2 = const(2) - batch = pto.index_cast(batch_i32) - n = pto.index_cast(n_i32) - log2_n = pto.index_cast(log2_n_i32) + batch = s.index_cast(batch_i32) + n = s.index_cast(n_i32) + log2_n = s.index_cast(log2_n_i32) cid = pto.get_block_idx() sub_bid = pto.get_subblock_idx() sub_bnum = pto.get_subblock_num() num_blocks = pto.get_block_num() - vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index - num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores + vid = s.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = s.index_cast(num_blocks * sub_bnum) # number of vector cores with pto.vector_section(): - samples_per_core = pto.ceil_div(batch, num_cores) + samples_per_core = s.ceil_div(batch, num_cores) sample_offset = vid * samples_per_core with pto.if_context(sample_offset < batch): 
samples_end = sample_offset + samples_per_core - samples_to_process = pto.select( + samples_to_process = s.select( samples_end > batch, batch - sample_offset, samples_per_core, @@ -199,22 +199,22 @@ def fast_hadamard_manualsync( # poorly with static tile subset sizing in current PTO Python # bindings and can corrupt rows for larger batches. samples_per_load = c1 - num_chunks = pto.ceil_div(samples_to_process, samples_per_load) + num_chunks = s.ceil_div(samples_to_process, samples_per_load) def process_rows( tb_row, tb_even, tb_odd, event_id, gm_offset, cur_samples ): - for s in pto.for_range(c0, cur_samples, c1): + for s in pto.range(c0, cur_samples, c1): row_offset = gm_offset + s * n sv_row = pto.slice_view( subtensor_full, source=tv_x, offsets=[row_offset], sizes=[n] ) # Alias row halves inside UB row tile (no GM round-trip # per Hadamard iteration). - tb_first = pto.subset( + tb_first = tile.subset( tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE] ) - tb_second = pto.subset( + tb_second = tile.subset( tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE] ) @@ -223,12 +223,12 @@ def process_rows( pto.load(sv_row, tb_row) pto.record_wait_pair("LOAD", "VEC", event_id=event_id) - for _ in pto.for_range(c0, log2_n, c1): - pto.gather(tb_row, tb_even, mask_pattern="P0101") - pto.gather(tb_row, tb_odd, mask_pattern="P1010") + for _ in pto.range(c0, log2_n, c1): + tile.gather(tb_row, tb_even, mask_pattern="P0101") + tile.gather(tb_row, tb_odd, mask_pattern="P1010") pto.barrier("VEC") - pto.add(tb_even, tb_odd, tb_first) - pto.sub(tb_even, tb_odd, tb_second) + tile.add(tb_even, tb_odd, tb_first) + tile.sub(tb_even, tb_odd, tb_second) pto.barrier("VEC") pto.record_wait_pair( @@ -242,10 +242,10 @@ def process_rows( pto.record_event("VEC", "LOAD", event_id=event_id) pto.record_event("STORE_VEC", "VEC", event_id=event_id) - for chunk_i in pto.for_range(c0, num_chunks, c1): + for chunk_i in pto.range(c0, num_chunks, c1): sample_done = chunk_i * samples_per_load chunk_left = 
samples_to_process - sample_done - cur_samples = pto.select( + cur_samples = s.select( chunk_left < samples_per_load, chunk_left, samples_per_load ) diff --git a/examples/aot/geglu_dynamic_multicore/geglu_builder.py b/examples/aot/geglu_dynamic_multicore/geglu_builder.py index 4151d19c..e5e862ce 100644 --- a/examples/aot/geglu_dynamic_multicore/geglu_builder.py +++ b/examples/aot/geglu_dynamic_multicore/geglu_builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const # 32 KB of UB / sizeof(fp16) = 16384 elements per tile ELEMENTS_PER_TILE = 32 * 1024 // 2 @@ -72,8 +72,8 @@ def _kernel( c1 = const(1) c_tile = const(ELEMENTS_PER_TILE) - batch = pto.index_cast(batch_i32) - n_cols = pto.index_cast(n_cols_i32) + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) with pto.vector_section(): # Guard: n_cols must be in (0, ELEMENTS_PER_TILE]. @@ -85,13 +85,13 @@ def _kernel( sub_bnum = pto.get_subblock_num() num_blocks = pto.get_block_num() - vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index - num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores + vid = s.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = s.index_cast(num_blocks * sub_bnum) # number of vector cores # Distribute rows across cores (row-level parallelism). 
- rows_per_core = pto.ceil_div(batch, num_cores) + rows_per_core = s.ceil_div(batch, num_cores) row_start = vid * rows_per_core - row_end = pto.min_u(row_start + rows_per_core, batch) + row_end = s.min_u(row_start + rows_per_core, batch) num_rows = row_end - row_start total_elems = batch * n_cols @@ -113,7 +113,7 @@ def _kernel( tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols) tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols) - for row_i in pto.for_range(c0, num_rows, c1): + for row_i in pto.range(c0, num_rows, c1): gm_offset = (row_start + row_i) * n_cols sv_a = pto.slice_view( @@ -140,24 +140,24 @@ def _kernel( # Derive constants from data (no scalar-tile broadcast needed): # a - a = 0 => exp(0) = 1.0 - pto.sub(tb_a, tb_a, tb_tmp2) # tmp2 = 0.0 - pto.exp(tb_tmp2, tb_ones) # ones = 1.0 + tile.sub(tb_a, tb_a, tb_tmp2) # tmp2 = 0.0 + tile.exp(tb_tmp2, tb_ones) # ones = 1.0 # tanh(a) = (exp(2a) - 1) / (exp(2a) + 1) - pto.add(tb_a, tb_a, tb_tmp1) # tmp1 = 2a - pto.exp(tb_tmp1, tb_tmp1) # tmp1 = exp(2a) - pto.sub(tb_tmp1, tb_ones, tb_tmp2) # tmp2 = exp(2a) - 1 - pto.add(tb_tmp1, tb_ones, tb_tmp1) # tmp1 = exp(2a) + 1 - pto.div(tb_tmp2, tb_tmp1, tb_tmp2) # tmp2 = tanh(a) + tile.add(tb_a, tb_a, tb_tmp1) # tmp1 = 2a + tile.exp(tb_tmp1, tb_tmp1) # tmp1 = exp(2a) + tile.sub(tb_tmp1, tb_ones, tb_tmp2) # tmp2 = exp(2a) - 1 + tile.add(tb_tmp1, tb_ones, tb_tmp1) # tmp1 = exp(2a) + 1 + tile.div(tb_tmp2, tb_tmp1, tb_tmp2) # tmp2 = tanh(a) # gelu_approx(a) = a * (1 + tanh(a)) / 2 - pto.add(tb_ones, tb_tmp2, tb_tmp1) # tmp1 = 1 + tanh(a) - pto.mul(tb_a, tb_tmp1, tb_tmp1) # tmp1 = a * (1 + tanh(a)) - pto.add(tb_ones, tb_ones, tb_tmp2) # tmp2 = 2.0 - pto.div(tb_tmp1, tb_tmp2, tb_tmp1) # tmp1 = gelu_approx(a) + tile.add(tb_ones, tb_tmp2, tb_tmp1) # tmp1 = 1 + tanh(a) + tile.mul(tb_a, tb_tmp1, tb_tmp1) # tmp1 = a * (1 + tanh(a)) + tile.add(tb_ones, tb_ones, tb_tmp2) # tmp2 = 2.0 + tile.div(tb_tmp1, tb_tmp2, tb_tmp1) # tmp1 = gelu_approx(a) # GEGLU: c = gelu_approx(a) * b - 
pto.mul(tb_tmp1, tb_b, tb_tmp1) # tmp1 = c + tile.mul(tb_tmp1, tb_b, tb_tmp1) # tmp1 = c pto.store(tb_tmp1, sv_c) _ = fn_name diff --git a/examples/aot/matmul_dynbatch_multicore/matmul_builder.py b/examples/aot/matmul_dynbatch_multicore/matmul_builder.py index c3b7f96d..730190d7 100644 --- a/examples/aot/matmul_dynbatch_multicore/matmul_builder.py +++ b/examples/aot/matmul_dynbatch_multicore/matmul_builder.py @@ -1,7 +1,7 @@ from mlir.ir import IntegerType -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s def build( @@ -56,7 +56,7 @@ def meta_data(): "tile_buf_biasTile": tile_buf_biasTile, } - const = pto.const + const = s.const @to_ir_module(meta_data=meta_data) def RunTMATMULSplitK( @@ -78,15 +78,15 @@ def RunTMATMULSplitK( cTileM = const(M) cTileN = const(N) - batch = pto.index_cast(batch_i32) + batch = s.index_cast(batch_i32) cBM = batch * cM - num_blocks = pto.index_cast(pto.get_block_num()) - batches_per_core = pto.ceil_div(batch, num_blocks) - bid = pto.index_cast(pto.get_block_idx()) + num_blocks = s.index_cast(pto.get_block_num()) + batches_per_core = s.ceil_div(batch, num_blocks) + bid = s.index_cast(pto.get_block_idx()) b_start = bid * batches_per_core b_end_unclamped = b_start + batches_per_core - b_end = pto.min_u(b_end_unclamped, batch) + b_end = s.min_u(b_end_unclamped, batch) tvA = pto.as_tensor(tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1]) tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) @@ -101,10 +101,10 @@ def RunTMATMULSplitK( cTile = pto.alloc_tile(tile_buf_cTile) biasTile = pto.alloc_tile(tile_buf_biasTile) - for b_idx in pto.for_range(b_start, b_end, c1): + for b_idx in pto.range(b_start, b_end, c1): row_off = b_idx * cM - for i in pto.for_range(c0, cIter, c1): + for i in pto.range(c0, cIter, c1): kOff = i * cBASEK svA = pto.slice_view(tile_view_a, source=tvA, offsets=[row_off, kOff], sizes=[cTileM, cBASEK]) svB 
= pto.slice_view(tile_view_b, source=tvB, offsets=[kOff, c0], sizes=[cBASEK, cTileN]) @@ -117,26 +117,26 @@ def RunTMATMULSplitK( pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) - pto.mov(aMatTile, aTile) - pto.mov(bMatTile, bTile) + tile.mov(aMatTile, aTile) + tile.mov(bMatTile, bTile) with pto.if_context(isBias): - pto.mov(biasDataTile, biasTile) + tile.mov(biasDataTile, biasTile) pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - is_i0 = pto.eq(i, c0) + is_i0 = s.eq(i, c0) def _first_iter(): pto.cond( isBias, - lambda: pto.matmul_bias(aTile, bTile, biasTile, cTile), - lambda: pto.matmul(aTile, bTile, cTile), + lambda: tile.matmul_bias(aTile, bTile, biasTile, cTile), + lambda: tile.matmul(aTile, bTile, cTile), ) pto.cond( is_i0, _first_iter, - lambda: pto.matmul_acc(cTile, aTile, bTile, cTile), + lambda: tile.matmul_acc(cTile, aTile, bTile, cTile), ) pto.record_wait_pair("MATMUL", "LOAD", event_id=0) diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py b/examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py index 26436b66..363637b7 100644 --- a/examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py +++ b/examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py @@ -1,7 +1,7 @@ from mlir.ir import IntegerType -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s def build(M=128, K=128, N=128): @@ -26,7 +26,7 @@ def meta_data(): # TODO: Get rid of this? 
return locals() - const = pto.const + const = s.const # Until we have set_dyn_flag with event_id as SSA values @@ -62,15 +62,15 @@ def RunTMATMULSplitK( cM = const(M) cK = const(K) cN = const(N) - batch = pto.index_cast(batch_i32) + batch = s.index_cast(batch_i32) - num_blocks = pto.index_cast(pto.get_block_num()) + num_blocks = s.index_cast(pto.get_block_num()) # TODO round robin - batches_per_core = pto.ceil_div(batch, num_blocks) - bid = pto.index_cast(pto.get_block_idx()) + batches_per_core = s.ceil_div(batch, num_blocks) + bid = s.index_cast(pto.get_block_idx()) b_start = bid * batches_per_core b_end_unclamped = b_start + batches_per_core - b_end = pto.min_u(b_end_unclamped, batch) + b_end = s.min_u(b_end_unclamped, batch) # TODO: if no batched assigned to this core, early return @@ -91,7 +91,7 @@ def RunTMATMULSplitK( svB = pto.slice_view(tile_view_b, source=tvB, offsets=[c0, c0], sizes=[cK, cN]) pto.load(svB, bMatTile) pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) - pto.mov(bMatTile, bTile) + tile.mov(bMatTile, bTile) # TODO: wait here so we can use full l1 memory later for A. 
@@ -113,7 +113,7 @@ def RunTMATMULSplitK( # signal to MATMUL that it can overwrite L0C pto.record_event("STORE_ACC", "MATMUL", event_id=[0, 1]) - for b_idx in pto.for_range(b_start, b_end, c1): + for b_idx in pto.range(b_start, b_end, c1): curr = b_idx % c2 svA = pto.slice_view(tile_view_a, source=tvA, offsets=[b_idx+c1, c0, c0], sizes=[c1, cM, cK]) svC = pto.slice_view(tile_view_c, source=tvC, offsets=[b_idx, c0, c0], sizes=[c1, cM, cN]) @@ -134,8 +134,8 @@ def RunTMATMULSplitK( wait_event("MATMUL", "MOV_M2L", event_id=curr) pto.cond( curr == c0, - lambda: pto.mov(aMatTiles[0], aTiles[0]), - lambda: pto.mov(aMatTiles[1], aTiles[1]) + lambda: tile.mov(aMatTiles[0], aTiles[0]), + lambda: tile.mov(aMatTiles[1], aTiles[1]) ) with pto.if_context(b_idx + c2 < b_end): record_event("MOV_M2L", "LOAD", event_id=curr) @@ -147,8 +147,8 @@ def RunTMATMULSplitK( wait_event("STORE_ACC", "MATMUL", event_id=curr) pto.cond( curr == c0, - lambda: pto.matmul(aTiles[0], bTile, cTiles[0]), - lambda: pto.matmul(aTiles[1], bTile, cTiles[1]), + lambda: tile.matmul(aTiles[0], bTile, cTiles[0]), + lambda: tile.matmul(aTiles[1], bTile, cTiles[1]), ) record_event("MATMUL", "STORE_ACC", event_id=curr) with pto.if_context(b_idx + c2 < b_end): diff --git a/examples/aot/matmul_dynbatch_multicore_opt/matmul_builder.py b/examples/aot/matmul_dynbatch_multicore_opt/matmul_builder.py index 61ace0b7..6dbfd8cd 100644 --- a/examples/aot/matmul_dynbatch_multicore_opt/matmul_builder.py +++ b/examples/aot/matmul_dynbatch_multicore_opt/matmul_builder.py @@ -1,7 +1,7 @@ # adapted from https://github.com/zhangstevenunity/PTOAS/blob/a301aa43b388d9b2e1ba0db8773b3a719e8c445b/test/samples/MatMul/tmatmulk.py -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s def build( @@ -49,7 +49,7 @@ def meta_data(): "tile_buf_cTile": tile_buf_cTile, } - const = pto.const + const = s.const @to_ir_module(meta_data=meta_data) def 
RunTMATMULSplitK( @@ -75,19 +75,19 @@ def RunTMATMULSplitK( cTileM = const(M) cTileN = const(N) - batch = pto.index_cast(batch_i32) + batch = s.index_cast(batch_i32) # Distribute batches over cores with "base + remainder" policy. - num_blocks = pto.index_cast(pto.get_block_num()) - bid = pto.index_cast(pto.get_block_idx()) + num_blocks = s.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) base = batch // num_blocks rem = batch % num_blocks - lt_rem = pto.lt(bid, rem) - min_bid_rem = pto.min_u(bid, rem) + lt_rem = s.lt(bid, rem) + min_bid_rem = s.min_u(bid, rem) b_start = bid * base + min_bid_rem - length = base + pto.select(lt_rem, c1, c0) - b_end = pto.min_u(b_start + length, batch) + length = base + s.select(lt_rem, c1, c0) + b_end = s.min_u(b_start + length, batch) tvA = pto.as_tensor(tv_a, ptr=a_ptr, shape=[batch, cM, cK], strides=[cKM, cK, c1]) tvB = pto.as_tensor(tv_b, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) @@ -103,9 +103,9 @@ def RunTMATMULSplitK( svB = pto.slice_view(tile_view_b, source=tvB, offsets=[c0, c0], sizes=[cK, cTileN]) pto.load(svB, bMatTile) pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) - pto.mov(bMatTile, bTile) + tile.mov(bMatTile, bTile) - for b_idx in pto.for_range(b_start, b_end, c1): + for b_idx in pto.range(b_start, b_end, c1): svA = pto.slice_view( tile_view_a, source=tvA, @@ -122,9 +122,9 @@ def RunTMATMULSplitK( pto.load(svA, aMatTile) pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) - pto.mov(aMatTile, aTile) + tile.mov(aMatTile, aTile) pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - pto.matmul(aTile, bTile, cTile) + tile.matmul(aTile, bTile, cTile) pto.record_wait_pair("MATMUL", "LOAD", event_id=0) pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) diff --git a/examples/aot/matmul_static_singlecore/matmul_builder.py b/examples/aot/matmul_static_singlecore/matmul_builder.py index ab5e2ffa..3bb3464e 100644 --- a/examples/aot/matmul_static_singlecore/matmul_builder.py +++ 
b/examples/aot/matmul_static_singlecore/matmul_builder.py @@ -1,7 +1,7 @@ # adapted from https://github.com/zhangstevenunity/PTOAS/blob/a301aa43b388d9b2e1ba0db8773b3a719e8c445b/test/samples/MatMul/tmatmulk.py -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s def build( @@ -55,7 +55,7 @@ def meta_data(): "tile_buf_biasTile": tile_buf_biasTile, } - const = pto.const + const = s.const @to_ir_module(meta_data=meta_data) def RunTMATMULSplitK( @@ -89,7 +89,7 @@ def RunTMATMULSplitK( cTile = pto.alloc_tile(tile_buf_cTile) biasTile = pto.alloc_tile(tile_buf_biasTile) - for i in pto.for_range(c0, cIter, c1): + for i in pto.range(c0, cIter, c1): kOff = i * cBASEK svA = pto.slice_view( tile_view_a, @@ -115,24 +115,24 @@ def RunTMATMULSplitK( with pto.if_context(isBias): pto.load(svBias, biasDataTile) - pto.mov(aMatTile, aTile) - pto.mov(bMatTile, bTile) + tile.mov(aMatTile, aTile) + tile.mov(bMatTile, bTile) with pto.if_context(isBias): - pto.mov(biasDataTile, biasTile) + tile.mov(biasDataTile, biasTile) - is_i0 = pto.eq(i, c0) + is_i0 = s.eq(i, c0) def _first_iter(): pto.cond( isBias, - lambda: pto.matmul_bias(aTile, bTile, biasTile, cTile), - lambda: pto.matmul(aTile, bTile, cTile), + lambda: tile.matmul_bias(aTile, bTile, biasTile, cTile), + lambda: tile.matmul(aTile, bTile, cTile), ) pto.cond( is_i0, _first_iter, - lambda: pto.matmul_acc(cTile, aTile, bTile, cTile), + lambda: tile.matmul_acc(cTile, aTile, bTile, cTile), ) svOut = pto.slice_view( diff --git a/examples/aot/relu_dynamic_multicore/relu_builder.py b/examples/aot/relu_dynamic_multicore/relu_builder.py index dc6acbd6..a797d06f 100644 --- a/examples/aot/relu_dynamic_multicore/relu_builder.py +++ b/examples/aot/relu_dynamic_multicore/relu_builder.py @@ -1,5 +1,5 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s def build(): @@ -29,7 
+29,7 @@ def meta_data(): "tile_w": tile_w, } - const = pto.const + const = s.const @to_ir_module(meta_data=meta_data) def sync_kernel_dyn(arg0: "ptr_type", arg1: "ptr_type", argN: "index_dtype") -> None: @@ -37,31 +37,31 @@ def sync_kernel_dyn(arg0: "ptr_type", arg1: "ptr_type", argN: "index_dtype") -> c0 = const(0) c1 = const(1) c_tile_w = const(tile_w) - total_elements = pto.index_cast(argN) + total_elements = s.index_cast(argN) - num_blocks = pto.index_cast(pto.get_block_num()) - num_el_per_core = pto.ceil_div(total_elements, num_blocks) + num_blocks = s.index_cast(pto.get_block_num()) + num_el_per_core = s.ceil_div(total_elements, num_blocks) # Per-core range: [core_start, core_end) - bid = pto.index_cast(pto.get_block_idx()) + bid = s.index_cast(pto.get_block_idx()) core_start = bid * num_el_per_core core_end_unclamped = core_start + num_el_per_core - core_end = pto.min_u(core_end_unclamped, total_elements) + core_end = s.min_u(core_end_unclamped, total_elements) core_len = core_end - core_start # Per-core number of tiles: ceil(core_len / tile_w). - num_tiles = pto.ceil_div(core_len, c_tile_w) + num_tiles = s.ceil_div(core_len, c_tile_w) # GM tensors shape N with stride 1. tv0 = pto.as_tensor(tensor_type, ptr=arg0, shape=[total_elements], strides=[c1]) tv1 = pto.as_tensor(tensor_type, ptr=arg1, shape=[total_elements], strides=[c1]) - for i in pto.for_range(c0, num_tiles, c1): + for i in pto.range(c0, num_tiles, c1): offset_tile = i * c_tile_w offset_total = core_start + offset_tile remaining_core = core_end - offset_total - valid_len = pto.min_u(remaining_core, c_tile_w) + valid_len = s.min_u(remaining_core, c_tile_w) # Keep per-iteration tile alloc to match original behavior. 
tb0 = pto.alloc_tile(tile_type, valid_row=c1, valid_col=valid_len) @@ -82,7 +82,7 @@ def sync_kernel_dyn(arg0: "ptr_type", arg1: "ptr_type", argN: "index_dtype") -> ) pto.load(sv0, tb0) - pto.relu(tb0, tb1) + tile.relu(tb0, tb1) pto.store(tb1, sv1) return sync_kernel_dyn diff --git a/examples/jit/add_dynamic_multicore/run_add.py b/examples/jit/add_dynamic_multicore/run_add.py index aca0d9a1..b2a063c9 100644 --- a/examples/jit/add_dynamic_multicore/run_add.py +++ b/examples/jit/add_dynamic_multicore/run_add.py @@ -1,10 +1,10 @@ -from ptodsl import jit -import ptodsl.language as pto +from ptodsl import jit, pto, tile +from ptodsl import scalar as s import torch import torch_npu from ptodsl.test_util import get_test_device -const = pto.const +const = s.const def meta_data(): @@ -51,12 +51,12 @@ def vec_add_1d_dynamic( num_blocks = pto.get_block_num() # Convert i64/i32 values to index for arithmetic ops. - vid_idx = pto.index_cast(vid) - num_cores = pto.index_cast(num_blocks) - total_elements = pto.index_cast(argN) + vid_idx = s.index_cast(vid) + num_cores = s.index_cast(num_blocks) + total_elements = s.index_cast(argN) - num_tiles_global = pto.ceil_div(total_elements, c_tile) - num_tiles_per_core = pto.ceil_div(num_tiles_global, num_cores) + num_tiles_global = s.ceil_div(total_elements, c_tile) + num_tiles_per_core = s.ceil_div(num_tiles_global, num_cores) tile_offset_this_core = vid_idx * num_tiles_per_core with pto.vector_section(): @@ -74,13 +74,13 @@ def vec_add_1d_dynamic( need_truncate = tiles_end_this_core > num_tiles_global remaining_tiles = num_tiles_global - tile_offset_this_core - tiles_to_process = pto.select( + tiles_to_process = s.select( need_truncate, remaining_tiles, num_tiles_per_core ) elements_to_process = tiles_to_process * c_tile with pto.if_context(elements_to_process > c0): - for i in pto.for_range(c0, tiles_to_process, c1): + for i in pto.range(c0, tiles_to_process, c1): tile_offset_global = i + tile_offset_this_core offset_global = 
tile_offset_global * c_tile @@ -96,7 +96,7 @@ def vec_add_1d_dynamic( pto.load(sv0, tb0) pto.load(sv1, tb1) - pto.add(tb0, tb1, tb2) + tile.add(tb0, tb1, tb2) pto.store(tb2, sv2) diff --git a/examples/jit/add_static_multicore/run_add_1d.py b/examples/jit/add_static_multicore/run_add_1d.py index 722fe24c..e8388794 100644 --- a/examples/jit/add_static_multicore/run_add_1d.py +++ b/examples/jit/add_static_multicore/run_add_1d.py @@ -1,10 +1,10 @@ -from ptodsl import jit -import ptodsl.language as pto +from ptodsl import jit, pto, tile +from ptodsl import scalar as s import torch import torch_npu from ptodsl.test_util import get_test_device -const = pto.const +const = s.const def meta_data(): @@ -46,7 +46,7 @@ def vec_add_kernel( tv1 = pto.as_tensor(tensor_type, ptr=arg1, shape=[c1, c1024], strides=[c1024, c1]) tv2 = pto.as_tensor(tensor_type, ptr=arg2, shape=[c1, c1024], strides=[c1024, c1]) - vid_idx = pto.index_cast(vid) + vid_idx = s.index_cast(vid) offset = vid_idx * c1024 # every core loads 1024 elements of data sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[c0, offset], sizes=[c1, c1024]) sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[c0, offset], sizes=[c1, c1024]) @@ -59,7 +59,7 @@ def vec_add_kernel( pto.load(sv0, tb0) pto.load(sv1, tb1) - pto.add(tb0, tb1, tb2) + tile.add(tb0, tb1, tb2) pto.store(tb2, sv2) diff --git a/examples/jit/add_static_multicore/run_add_2d.py b/examples/jit/add_static_multicore/run_add_2d.py index ca73fe23..127bcd9f 100644 --- a/examples/jit/add_static_multicore/run_add_2d.py +++ b/examples/jit/add_static_multicore/run_add_2d.py @@ -1,10 +1,10 @@ -from ptodsl import jit -import ptodsl.language as pto +from ptodsl import jit, pto, tile +from ptodsl import scalar as s import torch import torch_npu from ptodsl.test_util import get_test_device -const = pto.const +const = s.const def meta_data(): @@ -46,14 +46,14 @@ def vec_add_kernel( cidmul = cid * sub_bnum vid = cidmul + sub_bid - v_row_idx = pto.index_cast(vrow) - 
v_col_idx = pto.index_cast(vcol) + v_row_idx = s.index_cast(vrow) + v_col_idx = s.index_cast(vcol) tv0 = pto.as_tensor(tensor_type, ptr=arg0, shape=[c1280, c32], strides=[c32, c1]) tv1 = pto.as_tensor(tensor_type, ptr=arg1, shape=[c1280, c32], strides=[c32, c1]) tv2 = pto.as_tensor(tensor_type, ptr=arg2, shape=[c1280, c32], strides=[c32, c1]) - vid_idx = pto.index_cast(vid) + vid_idx = s.index_cast(vid) offset_row = vid_idx * c32 # every core loads 32 rows of data sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32]) sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32]) @@ -66,7 +66,7 @@ def vec_add_kernel( pto.load(sv0, tb0) pto.load(sv1, tb1) - pto.add(tb0, tb1, tb2) + tile.add(tb0, tb1, tb2) pto.store(tb2, sv2) diff --git a/examples/jit/matmul_dynamic_multicore/run_batch_matmul.py b/examples/jit/matmul_dynamic_multicore/run_batch_matmul.py index 1f75cbc3..815fc78a 100644 --- a/examples/jit/matmul_dynamic_multicore/run_batch_matmul.py +++ b/examples/jit/matmul_dynamic_multicore/run_batch_matmul.py @@ -1,12 +1,12 @@ from mlir.ir import IntegerType -from ptodsl import jit -import ptodsl.language as pto +from ptodsl import jit, pto, tile +from ptodsl import scalar as s import torch import torch_npu from ptodsl.test_util import get_test_device -const = pto.const +const = s.const def build_kernel( @@ -81,15 +81,15 @@ def RunTMATMULSplitK( cTileM = const(M) cTileN = const(N) - batch = pto.index_cast(batch_i32) + batch = s.index_cast(batch_i32) cBM = batch * cM - num_blocks = pto.index_cast(pto.get_block_num()) - batches_per_core = pto.ceil_div(batch, num_blocks) - bid = pto.index_cast(pto.get_block_idx()) + num_blocks = s.index_cast(pto.get_block_num()) + batches_per_core = s.ceil_div(batch, num_blocks) + bid = s.index_cast(pto.get_block_idx()) b_start = bid * batches_per_core b_end_unclamped = b_start + batches_per_core - b_end = pto.min_u(b_end_unclamped, batch) + b_end = 
s.min_u(b_end_unclamped, batch) tvA = pto.as_tensor(tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1]) tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) @@ -104,10 +104,10 @@ def RunTMATMULSplitK( cTile = pto.alloc_tile(tile_buf_cTile) biasTile = pto.alloc_tile(tile_buf_biasTile) - for b_idx in pto.for_range(b_start, b_end, c1): + for b_idx in pto.range(b_start, b_end, c1): row_off = b_idx * cM - for i in pto.for_range(c0, cIter, c1): + for i in pto.range(c0, cIter, c1): kOff = i * cBASEK svA = pto.slice_view( tile_view_a, @@ -135,26 +135,26 @@ def RunTMATMULSplitK( pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) - pto.mov(aMatTile, aTile) - pto.mov(bMatTile, bTile) + tile.mov(aMatTile, aTile) + tile.mov(bMatTile, bTile) with pto.if_context(isBias): - pto.mov(biasDataTile, biasTile) + tile.mov(biasDataTile, biasTile) pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - is_i0 = pto.eq(i, c0) + is_i0 = s.eq(i, c0) def _first_iter(): pto.cond( isBias, - lambda: pto.matmul_bias(aTile, bTile, biasTile, cTile), - lambda: pto.matmul(aTile, bTile, cTile), + lambda: tile.matmul_bias(aTile, bTile, biasTile, cTile), + lambda: tile.matmul(aTile, bTile, cTile), ) pto.cond( is_i0, _first_iter, - lambda: pto.matmul_acc(cTile, aTile, bTile, cTile), + lambda: tile.matmul_acc(cTile, aTile, bTile, cTile), ) pto.record_wait_pair("MATMUL", "LOAD", event_id=0) diff --git a/ptodsl/__init__.py b/ptodsl/__init__.py index 7a5a7a3c..55333e65 100644 --- a/ptodsl/__init__.py +++ b/ptodsl/__init__.py @@ -1,388 +1,6 @@ -import inspect -import ctypes -import os -import pathlib -import subprocess -from functools import update_wrapper - -from mlir.dialects import func, pto -from mlir.ir import Context, InsertionPoint, Location, Module - -from .language import wrap_value +from . 
import pto, scalar, tile from .bench import do_bench +from .compiler.ir import to_ir_module +from .compiler.jit import JitWrapper, jit - -def _resolve_meta(meta_fn): - values = meta_fn() - if not isinstance(values, dict): - raise ValueError("`meta_data()` must return a dict of named symbols to MLIR/PTO types.") - return dict(values) - - -def _resolve_arg_types(signature, meta_map): - arg_types = [] - for param in signature.parameters.values(): - annot = param.annotation - if isinstance(annot, str): - if annot not in meta_map: - raise ValueError(f"Unknown annotation '{annot}'.") - arg_types.append(meta_map[annot]) - elif annot is inspect._empty: - raise ValueError(f"Missing annotation for argument '{param.name}'.") - else: - arg_types.append(annot) - return arg_types - - -def _resolve_ret_types(signature, meta_map): - ret_annot = signature.return_annotation - if ret_annot in (inspect._empty, None): - return [] - if isinstance(ret_annot, str): - if ret_annot not in meta_map: - raise ValueError(f"Unknown return annotation '{ret_annot}'.") - return [meta_map[ret_annot]] - if isinstance(ret_annot, (list, tuple)): - out = [] - for elem in ret_annot: - if isinstance(elem, str): - out.append(meta_map[elem]) - else: - out.append(elem) - return out - return [ret_annot] - - -def _has_func_return(block): - last_name = None - for op in block.operations: - last_name = op.operation.name - return last_name == "func.return" - - -def _inject_globals(fn, values): - old = {} - for name, value in values.items(): - old[name] = fn.__globals__.get(name, None) - fn.__globals__[name] = value - return old - - -def _restore_globals(fn, old, injected_names): - for name in injected_names: - if old[name] is None and name in fn.__globals__: - del fn.__globals__[name] - else: - fn.__globals__[name] = old[name] - - -def to_ir_module(*, meta_data): - def decorator(fn): - sig = inspect.signature(fn) - - with Context() as ctx, Location.unknown(): - pto.register_dialect(ctx, load=True) - meta_map = 
_resolve_meta(meta_data) - arg_types = _resolve_arg_types(sig, meta_map) - ret_types = _resolve_ret_types(sig, meta_map) - module = Module.create() - fn_ty = func.FunctionType.get(arg_types, ret_types) - - with InsertionPoint(module.body): - ir_func = func.FuncOp(fn.__name__, fn_ty) - entry = ir_func.add_entry_block() - - with InsertionPoint(entry): - wrapped_args = [wrap_value(arg) for arg in entry.arguments] - injected = set(meta_map.keys()) - old_globals = _inject_globals(fn, meta_map) - try: - fn(*wrapped_args) - finally: - _restore_globals(fn, old_globals, injected) - - if not ret_types and not _has_func_return(entry): - func.ReturnOp([]) - - module.operation.verify() - return module - - return decorator - - -def _type_repr(type_obj): - return str(type_obj).replace(" ", "").lower() - - -def _is_ptr_type(type_obj): - return "ptr" in _type_repr(type_obj) - - -def _ptr_elem_cpp_type(type_obj): - type_repr = _type_repr(type_obj) - if "f32" in type_repr: - return "float" - if "f16" in type_repr: - return "__fp16" - if "bf16" in type_repr: - return "__bf16" - if "i8" in type_repr: - return "int8_t" - if "u8" in type_repr: - return "uint8_t" - if "i16" in type_repr: - return "int16_t" - if "u16" in type_repr: - return "uint16_t" - if "i32" in type_repr: - return "int32_t" - if "u32" in type_repr: - return "uint32_t" - if "i64" in type_repr: - return "int64_t" - if "u64" in type_repr: - return "uint64_t" - return "float" - - -def _scalar_cpp_type(type_obj): - type_repr = _type_repr(type_obj) - if "i32" in type_repr: - return "int32_t" - if "i64" in type_repr or "index" in type_repr: - return "int64_t" - if "f32" in type_repr: - return "float" - if "f16" in type_repr: - return "__fp16" - return "int32_t" - - -def _scalar_ctype(type_obj): - type_repr = _type_repr(type_obj) - if "i64" in type_repr or "index" in type_repr: - return ctypes.c_int64 - if "f32" in type_repr: - return ctypes.c_float - if "f16" in type_repr: - return ctypes.c_uint16 - return ctypes.c_int32 - - 
-def _normalize_stream_ptr(stream_ptr): - if isinstance(stream_ptr, ctypes.c_void_p): - return stream_ptr - if isinstance(stream_ptr, int): - return ctypes.c_void_p(stream_ptr) - if hasattr(stream_ptr, "value"): - return ctypes.c_void_p(int(stream_ptr.value)) - return stream_ptr - - -class JitWrapper: - def __init__( - self, - fn, - *, - meta_data, - output_dir=None, - block_dim=20, - enable_insert_sync=True, - npu_arch="dav-2201", - ): - self._fn = fn - self._meta_data = meta_data - self._sig = inspect.signature(fn) - self._arg_types = None - self._output_dir = pathlib.Path(output_dir) if output_dir else pathlib.Path.cwd() / ".ptodsl_jit" / fn.__name__ - self._block_dim = block_dim - self._enable_insert_sync = enable_insert_sync - self._npu_arch = npu_arch - self._compiled = False - self._lib = None - self._lib_path = self._output_dir / "kernel.so" - update_wrapper(self, fn) - - def _artifact_paths(self): - pto_path = self._output_dir / "kernel.pto" - cpp_path = self._output_dir / "kernel.cpp" - caller_path = self._output_dir / "caller.cpp" - return pto_path, cpp_path, caller_path, self._lib_path - - def _generate_caller_cpp(self, kernel_cpp_name): - params = list(self._sig.parameters.values()) - cpp_args = [] - launch_args = [] - for param, arg_type in zip(params, self._arg_types): - if _is_ptr_type(arg_type): - cpp_args.append(f"uint8_t *{param.name}") - launch_args.append(f"({ _ptr_elem_cpp_type(arg_type) } *){param.name}") - else: - cpp_t = _scalar_cpp_type(arg_type) - cpp_args.append(f"{cpp_t} {param.name}") - launch_args.append(param.name) - - wrapper_sig = ", ".join(["uint32_t blockDim", "void *stream"] + cpp_args) - kernel_call = ", ".join(launch_args) - return ( - f'#include "{kernel_cpp_name}"\n' - f"#include \n\n" - f'extern "C" void call_kernel({wrapper_sig})\n' - "{\n" - f" {self._fn.__name__}<<>>({kernel_call});\n" - "}\n" - ) - - def _compile_shared_library(self, caller_cpp_path, lib_path): - toolkit_home = os.environ.get("ASCEND_TOOLKIT_HOME") - if 
not toolkit_home: - raise RuntimeError("ASCEND_TOOLKIT_HOME is required to compile generated caller.cpp.") - cmd = [ - "bisheng", - f"-I{toolkit_home}/include", - "-fPIC", - "-shared", - "-D_FORTIFY_SOURCE=2", - "-O2", - "-std=c++17", - "-Wno-macro-redefined", - "-Wno-ignored-attributes", - "-fstack-protector-strong", - "-xcce", - "-Xhost-start", - "-Xhost-end", - "-mllvm", - "-cce-aicore-stack-size=0x8000", - "-mllvm", - "-cce-aicore-function-stack-size=0x8000", - "-mllvm", - "-cce-aicore-record-overflow=true", - "-mllvm", - "-cce-aicore-addr-transform", - "-mllvm", - "-cce-aicore-dcci-insert-for-scalar=false", - f"--npu-arch={self._npu_arch}", - "-DMEMORY_BASE", # TODO: add switch for A5 - "-std=gnu++17", - str(caller_cpp_path), - "-o", - str(lib_path), - ] - subprocess.run(cmd, check=True, cwd=str(self._output_dir)) - - def _resolve_runtime_arg_types(self): - with Context() as ctx, Location.unknown(): - pto.register_dialect(ctx, load=True) - meta_map = _resolve_meta(self._meta_data) - return _resolve_arg_types(self._sig, meta_map) - - def _build(self): - self._output_dir.mkdir(parents=True, exist_ok=True) - pto_path, cpp_path, caller_path, lib_path = self._artifact_paths() - self._arg_types = self._resolve_runtime_arg_types() - - ir_module = to_ir_module(meta_data=self._meta_data)(self._fn) - pto_path.write_text(f"{ir_module}\n", encoding="utf-8") - - ptoas_cmd = ["ptoas"] - if self._enable_insert_sync: - ptoas_cmd.append("--enable-insert-sync") - ptoas_cmd += [str(pto_path), "-o", str(cpp_path)] - subprocess.run(ptoas_cmd, check=True, cwd=str(self._output_dir)) - - caller_path.write_text(self._generate_caller_cpp(cpp_path.name), encoding="utf-8") - self._compile_shared_library(caller_path, lib_path) - - self._lib = ctypes.CDLL(str(lib_path)) - self._lib.call_kernel.argtypes = [ctypes.c_uint32, ctypes.c_void_p] + [ - ctypes.c_void_p if _is_ptr_type(arg_type) else _scalar_ctype(arg_type) - for arg_type in self._arg_types - ] - self._compiled = True - - def 
_convert_ptr(self, value): - if isinstance(value, ctypes.c_void_p): - return value - if hasattr(value, "data_ptr"): - return ctypes.c_void_p(value.data_ptr()) - if isinstance(value, int): - return ctypes.c_void_p(value) - raise TypeError(f"Pointer-like argument expected, got {type(value)!r}.") - - def _prepare_call_args(self, args): - params = list(self._sig.parameters.values()) - if len(args) > len(params): - raise TypeError(f"Expected at most {len(params)} arguments, got {len(args)}.") - - filled_args = list(args) - for idx in range(len(args), len(params)): - param = params[idx] - if param.default is not inspect._empty: - filled_args.append(param.default) - continue - arg_type = self._arg_types[idx] - if _is_ptr_type(arg_type): - raise TypeError(f"Missing required pointer argument '{param.name}'.") - - converted = [] - for value, arg_type in zip(filled_args, self._arg_types): - if _is_ptr_type(arg_type): - converted.append(self._convert_ptr(value)) - else: - converted.append(value) - return converted - - # TODO: also allow taking named `kwargs` - def __call__(self, *args, stream_ptr=None): - if not self._compiled: - self._build() - - if stream_ptr is None: - import torch - stream_ptr = torch.npu.current_stream()._as_parameter_ - - call_args = self._prepare_call_args(args) - self._lib.call_kernel( - ctypes.c_uint32(self._block_dim), - _normalize_stream_ptr(stream_ptr), - *call_args, - ) - return None - - def set_block_dim(self, block_dim): - if not isinstance(block_dim, int) or block_dim <= 0: - raise ValueError("`block_dim` must be a positive integer.") - self._block_dim = block_dim - return self - - @property - def library_path(self): - return str(self._lib_path) - - @property - def output_dir(self): - return str(self._output_dir) - - -def jit( - *, - meta_data, - output_dir=None, - block_dim=1, - enable_insert_sync=True, - npu_arch="dav-2201", -): - def decorator(fn): - return JitWrapper( - fn, - meta_data=meta_data, - output_dir=output_dir, - 
block_dim=block_dim, - enable_insert_sync=enable_insert_sync, - npu_arch=npu_arch, - ) - - return decorator - - -__all__ = ["JitWrapper", "jit", "to_ir_module", "do_bench"] +__all__ = ["JitWrapper", "do_bench", "jit", "pto", "scalar", "tile", "to_ir_module"] diff --git a/ptodsl/api/__init__.py b/ptodsl/api/__init__.py new file mode 100644 index 00000000..ca7e01f9 --- /dev/null +++ b/ptodsl/api/__init__.py @@ -0,0 +1,3 @@ +from . import pto, scalar, tile + +__all__ = ["pto", "scalar", "tile"] diff --git a/ptodsl/api/control_flow.py b/ptodsl/api/control_flow.py new file mode 100644 index 00000000..457fade8 --- /dev/null +++ b/ptodsl/api/control_flow.py @@ -0,0 +1,52 @@ +from contextlib import contextmanager + +from mlir.dialects import scf +from mlir.ir import InsertionPoint + +from .scalar import Value, _unwrap + + +def range(start, stop, step): + loop = scf.ForOp(_unwrap(start), _unwrap(stop), _unwrap(step)) + with InsertionPoint(loop.body): + yield Value(loop.induction_variable) + scf.YieldOp([]) + + +class _IfElseBranch: + def __init__(self, if_op): + self._if_op = if_op + + @contextmanager + def else_context(self): + with InsertionPoint(self._if_op.else_block): + yield + scf.YieldOp([]) + + +@contextmanager +def if_context(condition, has_else=False): + if has_else: + op = scf.IfOp(_unwrap(condition), [], hasElse=True) + branch = _IfElseBranch(op) + else: + op = scf.IfOp(_unwrap(condition)) + branch = None + + with InsertionPoint(op.then_block): + yield branch + scf.YieldOp([]) + + +def cond(condition, then_builder, else_builder): + op = scf.IfOp(_unwrap(condition), [], hasElse=True) + with InsertionPoint(op.then_block): + then_builder() + scf.YieldOp([]) + with InsertionPoint(op.else_block): + else_builder() + scf.YieldOp([]) + return op + + +__all__ = ["cond", "range", "if_context"] diff --git a/ptodsl/api/pto.py b/ptodsl/api/pto.py new file mode 100644 index 00000000..b7bcb2c6 --- /dev/null +++ b/ptodsl/api/pto.py @@ -0,0 +1,58 @@ +from .control_flow import 
cond, range, if_context +from .scalar import Value, wrap_value +from .pto_general import ( + alloc_tile, + as_tensor, + cube_section, + get_block_idx, + get_block_num, + get_subblock_idx, + get_subblock_num, + load, + slice_view, + store, + vector_section, +) +from .synchronization import barrier, record_event, record_wait_pair, wait_event +from .type_def import ( + PtrType, + SubTensorType, + TensorType, + TileBufConfig, + TileBufType, + __getattr__, +) + + +__all__ = [ + "Value", + "wrap_value", + "bool", + "float16", + "float32", + "int16", + "int32", + "PtrType", + "TensorType", + "SubTensorType", + "TileBufConfig", + "TileBufType", + "get_block_idx", + "get_subblock_idx", + "get_subblock_num", + "get_block_num", + "as_tensor", + "slice_view", + "vector_section", + "cube_section", + "range", + "if_context", + "cond", + "alloc_tile", + "load", + "store", + "record_event", + "wait_event", + "record_wait_pair", + "barrier", +] diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py new file mode 100644 index 00000000..66a9ca3c --- /dev/null +++ b/ptodsl/api/pto_general.py @@ -0,0 +1,84 @@ +from contextlib import contextmanager + +from mlir.dialects import pto as _pto +from mlir.ir import InsertionPoint + +from .scalar import Value, _unwrap + + +def get_block_idx(): + return Value(_pto.GetBlockIdxOp().result) + + +def get_subblock_idx(): + return Value(_pto.GetSubBlockIdxOp().result) + + +def get_subblock_num(): + return Value(_pto.GetSubBlockNumOp().result) + + +def get_block_num(): + return Value(_pto.GetBlockNumOp().result) + + +def as_tensor(tensor_type, *, ptr, shape, strides): + shape_vals = [_unwrap(v) for v in shape] + stride_vals = [_unwrap(v) for v in strides] + return _pto.MakeTensorViewOp(tensor_type, _unwrap(ptr), shape_vals, stride_vals).result + + +def slice_view(subtensor_type, *, source, offsets, sizes): + offset_vals = [_unwrap(v) for v in offsets] + size_vals = [_unwrap(v) for v in sizes] + return _pto.PartitionViewOp( + 
subtensor_type, source, offsets=offset_vals, sizes=size_vals + ).result + + +@contextmanager +def vector_section(): + section = _pto.SectionVectorOp() + block = section.body.blocks.append() + with InsertionPoint(block): + yield + + +@contextmanager +def cube_section(): + section = _pto.SectionCubeOp() + block = section.body.blocks.append() + with InsertionPoint(block): + yield + + +def alloc_tile(tile_type, *, valid_row=None, valid_col=None): + kwargs = {} + if valid_row is not None: + kwargs["valid_row"] = _unwrap(valid_row) + if valid_col is not None: + kwargs["valid_col"] = _unwrap(valid_col) + return _pto.AllocTileOp(tile_type, **kwargs).result + + +def load(source, dest): + _pto.TLoadOp(None, source, dest) + + +def store(source, dest): + _pto.TStoreOp(None, source, dest) + + +__all__ = [ + "get_block_idx", + "get_subblock_idx", + "get_subblock_num", + "get_block_num", + "as_tensor", + "slice_view", + "vector_section", + "cube_section", + "alloc_tile", + "load", + "store", +] diff --git a/ptodsl/api/scalar.py b/ptodsl/api/scalar.py new file mode 100644 index 00000000..8c10a4d6 --- /dev/null +++ b/ptodsl/api/scalar.py @@ -0,0 +1,164 @@ +from mlir.dialects import arith +from mlir.ir import F16Type, F32Type, IndexType, IntegerType + + +def _unwrap(value): + if isinstance(value, Value): + return value.raw + return value + + +class Value: + # TODO: generalize to more comprehensive wrappers like + # https://github.com/makslevental/mlir-python-extras/blob/0.0.8.2/mlir/extras/dialects/ext/arith.py + def __init__(self, raw): + self.raw = raw + + def __mul__(self, other): + return Value(arith.MulIOp(_unwrap(self), _unwrap(other)).result) + + def __rmul__(self, other): + return Value(arith.MulIOp(_unwrap(other), _unwrap(self)).result) + + def __add__(self, other): + return Value(arith.AddIOp(_unwrap(self), _unwrap(other)).result) + + def __radd__(self, other): + return Value(arith.AddIOp(_unwrap(other), _unwrap(self)).result) + + def __sub__(self, other): + return 
Value(arith.SubIOp(_unwrap(self), _unwrap(other)).result) + + def __rsub__(self, other): + return Value(arith.SubIOp(_unwrap(other), _unwrap(self)).result) + + def __floordiv__(self, other): + return Value(arith.DivSIOp(_unwrap(self), _unwrap(other)).result) + + def __rfloordiv__(self, other): + return Value(arith.DivSIOp(_unwrap(other), _unwrap(self)).result) + + def __truediv__(self, other): + return Value(arith.DivFOp(_unwrap(self), _unwrap(other)).result) + + def __rtruediv__(self, other): + return Value(arith.DivFOp(_unwrap(other), _unwrap(self)).result) + + def __mod__(self, other): + return Value(arith.RemSIOp(_unwrap(self), _unwrap(other)).result) + + def __rmod__(self, other): + return Value(arith.RemSIOp(_unwrap(other), _unwrap(self)).result) + + @staticmethod + def _cmp(lhs, rhs, predicate): + return Value(arith.CmpIOp(predicate, _unwrap(lhs), _unwrap(rhs)).result) + + def __lt__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.slt) + + def __gt__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.sgt) + + def __le__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.sle) + + def __ge__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.sge) + + def __eq__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.eq) + + def __ne__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.ne) + + def __getattr__(self, item): + return getattr(self.raw, item) + + +def wrap_value(value): + if isinstance(value, Value): + return value + return Value(value) + + +def __getattr__(name): + # TODO: add more builtin dtype aliases (for example float16/bfloat16/int8/int64) + # when they are validated against PTO type support. 
+ if name == "bool": + return IntegerType.get_signless(1) + if name == "float32": + return F32Type.get() + if name == "float16": + return F16Type.get() + if name == "int32": + return IntegerType.get_signless(32) + if name == "int16": + return IntegerType.get_signless(16) + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + + +def const(value): + return Value(arith.ConstantOp(IndexType.get(), value).result) + + +def index_cast(value, index_type=IndexType): + if hasattr(index_type, "get"): + dst = index_type.get() + else: + dst = index_type + return Value(arith.IndexCastOp(dst, _unwrap(value)).result) + + +def ceil_div(a, b): + return Value(arith.CeilDivSIOp(_unwrap(a), _unwrap(b)).result) + + +def div_s(a, b): + return Value(arith.DivSIOp(_unwrap(a), _unwrap(b)).result) + + +def rem_s(a, b): + return Value(arith.RemSIOp(_unwrap(a), _unwrap(b)).result) + + +def min_u(a, b): + return Value(arith.MinUIOp(_unwrap(a), _unwrap(b)).result) + + +def eq(a, b): + return Value(arith.CmpIOp(arith.CmpIPredicate.eq, _unwrap(a), _unwrap(b)).result) + + +def lt(a, b): + return Value(arith.CmpIOp(arith.CmpIPredicate.slt, _unwrap(a), _unwrap(b)).result) + + +def gt(a, b): + return Value(arith.CmpIOp(arith.CmpIPredicate.sgt, _unwrap(a), _unwrap(b)).result) + + +def ge(a, b): + return Value(arith.CmpIOp(arith.CmpIPredicate.sge, _unwrap(a), _unwrap(b)).result) + + +def select(cond, true_val, false_val): + return Value(arith.SelectOp(_unwrap(cond), _unwrap(true_val), _unwrap(false_val)).result) + + +__all__ = [ + "Value", + "_unwrap", + "wrap_value", + "const", + "index_cast", + "ceil_div", + "div_s", + "rem_s", + "min_u", + "eq", + "lt", + "gt", + "ge", + "select", +] diff --git a/ptodsl/api/synchronization.py b/ptodsl/api/synchronization.py new file mode 100644 index 00000000..1a0801ee --- /dev/null +++ b/ptodsl/api/synchronization.py @@ -0,0 +1,70 @@ +from typing import Sequence + +from mlir.dialects import pto as _pto + + +def _resolve_sync_op(sync_op): + if 
isinstance(sync_op, str): + normalized = sync_op.strip().upper() + if not normalized.startswith("T"): + normalized = f"T{normalized}" + try: + return getattr(_pto, normalized) + except AttributeError as exc: + raise ValueError(f"Unsupported sync op type '{sync_op}'.") from exc + return sync_op + + +def _resolve_event_id(event_id): + if isinstance(event_id, int): + if event_id < 0 or event_id > 7: + raise ValueError(f"event_id must be in range [0, 7], got {event_id}.") + return getattr(_pto, f"EVENT_ID{event_id}") + return event_id + + +def record_event(record_op, wait_op, event_id: int | Sequence[int] = 0): + if not isinstance(event_id, int): + for eid in event_id: + _pto.record_event( + _resolve_sync_op(record_op), + _resolve_sync_op(wait_op), + _resolve_event_id(eid), + ) + else: + _pto.record_event( + _resolve_sync_op(record_op), + _resolve_sync_op(wait_op), + _resolve_event_id(event_id), + ) + + +def wait_event(record_op, wait_op, event_id: int | Sequence[int] = 0): + if not isinstance(event_id, int): + for eid in event_id: + _pto.wait_event( + _resolve_sync_op(record_op), + _resolve_sync_op(wait_op), + _resolve_event_id(eid), + ) + else: + _pto.wait_event( + _resolve_sync_op(record_op), + _resolve_sync_op(wait_op), + _resolve_event_id(event_id), + ) + + +def record_wait_pair(record_op, wait_op, event_id: int | Sequence[int] = 0): + record = _resolve_sync_op(record_op) + wait = _resolve_sync_op(wait_op) + event = _resolve_event_id(event_id) + _pto.record_event(record, wait, event) + _pto.wait_event(record, wait, event) + + +def barrier(sync_op): + _pto.barrier(_resolve_sync_op(sync_op)) + + +__all__ = ["record_event", "wait_event", "record_wait_pair", "barrier"] diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py new file mode 100644 index 00000000..094831ea --- /dev/null +++ b/ptodsl/api/tile.py @@ -0,0 +1,107 @@ +from mlir.dialects import pto as _pto + +from .scalar import _unwrap + + +def mov(source, dest): + _pto.TMovOp(None, source, dest) + + +def 
add(lhs, rhs, out): + _pto.TAddOp(lhs, rhs, out) + + +def sub(lhs, rhs, out): + _pto.TSubOp(lhs, rhs, out) + + +def div(lhs, rhs, out): + _pto.TDivOp(lhs, rhs, out) + + +def mul(lhs, rhs, out): + _pto.TMulOp(lhs, rhs, out) + + +def or_(lhs, rhs, out): + _pto.TOrOp(lhs, rhs, out) + + +def gather(src, out, indices=None, *, mask_pattern=None): + if mask_pattern is not None: + mask = _pto.MaskPatternAttr.get(getattr(_pto.MaskPattern, mask_pattern)) + _pto.TGatherOp(src, out, maskPattern=mask) + else: + _pto.TGatherOp(src, out, indices=indices) + + +def exp(inp, out): + _pto.TExpOp(inp, out) + + +def log(inp, out): + _pto.TLogOp(inp, out) + + +def relu(inp, out): + _pto.TReluOp(inp, out) + + +def abs(inp, out): + _pto.TAbsOp(inp, out) + + +def sqrt(inp, out): + _pto.TSqrtOp(inp, out) + + +def rsqrt(inp, out): + _pto.TRsqrtOp(inp, out) + + +def reciprocal(inp, out): + _pto.TRecipOp(inp, out) + + +def matmul(lhs, rhs, out): + _pto.TMatmulOp(None, lhs, rhs, out) + + +def matmul_bias(lhs, rhs, bias, out): + _pto.TMatmulBiasOp(None, lhs, rhs, bias, out) + + +def matmul_acc(acc, lhs, rhs, out): + _pto.TMatmulAccOp(None, acc, lhs, rhs, out) + + +def row_sum(src, tmp, dst): + _pto.TRowSumOp(src=src, tmp=tmp, dst=dst) + + +def subset(source, offsets, sizes): + offset_vals = [_unwrap(v) for v in offsets] + return _pto.subset(source, offset_vals, sizes) + + +__all__ = [ + "mov", + "add", + "sub", + "div", + "mul", + "or_", + "gather", + "exp", + "log", + "relu", + "abs", + "sqrt", + "rsqrt", + "reciprocal", + "matmul", + "matmul_bias", + "matmul_acc", + "row_sum", + "subset", +] diff --git a/ptodsl/api/type_def.py b/ptodsl/api/type_def.py new file mode 100644 index 00000000..24fbfcf3 --- /dev/null +++ b/ptodsl/api/type_def.py @@ -0,0 +1,105 @@ +from mlir.dialects import pto as _pto + +from . 
import scalar + + +def __getattr__(name): + # MLIR type factories require an active context, so keep dtype aliases lazy + # and resolve them only when user code accesses them inside PTO/MLIR setup. + if name in {"bool", "float16", "float32", "int16", "int32"}: + return getattr(scalar, name) + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + + +def PtrType(dtype): + return _pto.PtrType.get(dtype) + + +def TensorType(*, rank, dtype): + return _pto.TensorViewType.get(rank, dtype) + + +def SubTensorType(*, shape, dtype): + return _pto.PartitionTensorViewType.get(shape, dtype) + + +class TileBufConfig: + def __init__(self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null"): + # TODO: expose and validate a broader set of tile buffer knobs if PTO adds + # more layout/padding/fractal settings that should be configurable here. + self._bl = _pto.BLayoutAttr.get(getattr(_pto.BLayout, blayout)) + self._sl = _pto.SLayoutAttr.get(getattr(_pto.SLayout, slayout)) + self._pd = _pto.PadValueAttr.get(getattr(_pto.PadValue, pad)) + self._s_fractal_size = s_fractal_size + + @property + def attr(self): + return _pto.TileBufConfigAttr.get(self._bl, self._sl, self._s_fractal_size, self._pd) + + +def _default_tile_config(memory_space, shape): + space = memory_space.upper() + # Defaults mirror the explicit configs used by the verbose matmul builder. 
+ if space == "MAT": + if len(shape) >= 1 and shape[0] == 1: + return TileBufConfig( + blayout="RowMajor", + slayout="NoneBox", + s_fractal_size=_pto.TileConfig.fractalABSize, + ) + return TileBufConfig( + blayout="ColMajor", + slayout="RowMajor", + s_fractal_size=_pto.TileConfig.fractalABSize, + ) + if space == "LEFT": + return TileBufConfig( + blayout="RowMajor", + slayout="RowMajor", + s_fractal_size=_pto.TileConfig.fractalABSize, + ) + if space == "RIGHT": + return TileBufConfig( + blayout="RowMajor", + slayout="ColMajor", + s_fractal_size=_pto.TileConfig.fractalABSize, + ) + if space == "ACC": + return TileBufConfig( + blayout="ColMajor", + slayout="RowMajor", + s_fractal_size=_pto.TileConfig.fractalCSize, + ) + if space == "BIAS": + return TileBufConfig( + blayout="RowMajor", + slayout="NoneBox", + s_fractal_size=_pto.TileConfig.fractalABSize, + ) + if space == "VEC": + return TileBufConfig() + raise ValueError(f"Unsupported memory_space '{memory_space}' for default tile config.") + + +def TileBufType(*, shape, dtype, memory_space, valid_shape=None, config=None): + space = _pto.AddressSpaceAttr.get(getattr(_pto.AddressSpace, memory_space)) + if valid_shape is None: + valid_shape = shape + if config is None: + config = _default_tile_config(memory_space, shape) + cfg = config.attr if isinstance(config, TileBufConfig) else config + return _pto.TileBufType.get(shape, dtype, space, valid_shape, cfg) + + +__all__ = [ + "PtrType", + "TensorType", + "SubTensorType", + "TileBufConfig", + "TileBufType", + "bool", + "float16", + "float32", + "int16", + "int32", +] diff --git a/ptodsl/bench.py b/ptodsl/bench.py index d413cccd..b6ae4870 100644 --- a/ptodsl/bench.py +++ b/ptodsl/bench.py @@ -1,53 +1,3 @@ -from typing import Callable, List, Literal, Union +from .utils.bench import do_bench - -def do_bench( - fn: Callable, - warmup_iters: int = 5, - benchmark_iters: int = 15, - aggregation: Literal["mean", "none"] = "mean", - unit: Literal["s", "ms", "us", "ns"] = "us", - 
flush_cache: bool = True, -) -> Union[float, List[float]]: - """ - Benchmark a given function with warmup. - - Args: - fn: Function to benchmark. - warmup_iters: Number of warmup runs. - benchmark_iters: Number of benchmark runs. - aggregation: Aggregation mode for benchmark times. - unit: Time unit of the benchmarks. - flush_cache: if we should overwrite l2 cache between every iteration - Returns: - Runtime, or list of runtimes, in specified units. - """ - import torch - import torch_npu - start_events = [torch.npu.Event(enable_timing=True) for _ in range(benchmark_iters)] - end_events = [torch.npu.Event(enable_timing=True) for _ in range(benchmark_iters)] - - # Allocate a 256 MB tensor which we write to every iteration to flush L2 cache - # https://github.com/tile-ai/tilelang/blob/main/tilelang/profiler/bench.py#L103 - cache_size = 256 * 1024 * 1024 - cache = torch.empty((cache_size), dtype=torch.int8).npu() - - for _ in range(warmup_iters): - fn() - torch_npu.npu.synchronize() - - # It's not easy to time a kernel in a way that satisfies the following two at the same time: - # 1) Ignores cache flushing, and 2) Ignoring kernel launch overhead. Here we ignore cache flushing. 
- for i in range(benchmark_iters): - if flush_cache: - cache.zero_() - start_events[i].record() - fn() - end_events[i].record() - - torch_npu.npu.synchronize() - f = {"s": 1e-3, "ms": 1e0, "us": 1e3, "ns": 1e6}[unit] - times = [f * s.elapsed_time(e) for s, e in zip(start_events, end_events)] - if aggregation == "mean": - return sum(times) / len(times) - return times +__all__ = ["do_bench"] diff --git a/ptodsl/compiler/__init__.py b/ptodsl/compiler/__init__.py new file mode 100644 index 00000000..1e6d9312 --- /dev/null +++ b/ptodsl/compiler/__init__.py @@ -0,0 +1,4 @@ +from .ir import to_ir_module +from .jit import JitWrapper, jit + +__all__ = ["JitWrapper", "jit", "to_ir_module"] diff --git a/ptodsl/compiler/ir.py b/ptodsl/compiler/ir.py new file mode 100644 index 00000000..677aa0dc --- /dev/null +++ b/ptodsl/compiler/ir.py @@ -0,0 +1,107 @@ +import inspect + +from mlir.dialects import func, pto as _pto +from mlir.ir import Context, InsertionPoint, Location, Module + +from ..api.scalar import wrap_value + + +def _resolve_meta(meta_fn): + values = meta_fn() + if not isinstance(values, dict): + raise ValueError("`meta_data()` must return a dict of named symbols to MLIR/PTO types.") + return dict(values) + + +def _resolve_arg_types(signature, meta_map): + arg_types = [] + for param in signature.parameters.values(): + annot = param.annotation + if isinstance(annot, str): + if annot not in meta_map: + raise ValueError(f"Unknown annotation '{annot}'.") + arg_types.append(meta_map[annot]) + elif annot is inspect._empty: + raise ValueError(f"Missing annotation for argument '{param.name}'.") + else: + arg_types.append(annot) + return arg_types + + +def _resolve_ret_types(signature, meta_map): + ret_annot = signature.return_annotation + if ret_annot in (inspect._empty, None): + return [] + if isinstance(ret_annot, str): + if ret_annot not in meta_map: + raise ValueError(f"Unknown return annotation '{ret_annot}'.") + return [meta_map[ret_annot]] + if isinstance(ret_annot, 
(list, tuple)): + out = [] + for elem in ret_annot: + if isinstance(elem, str): + out.append(meta_map[elem]) + else: + out.append(elem) + return out + return [ret_annot] + + +def _has_func_return(block): + last_name = None + for op in block.operations: + last_name = op.operation.name + return last_name == "func.return" + + +def _inject_globals(fn, values): + old = {} + for name, value in values.items(): + old[name] = fn.__globals__.get(name, None) + fn.__globals__[name] = value + return old + + +def _restore_globals(fn, old, injected_names): + for name in injected_names: + if old[name] is None and name in fn.__globals__: + del fn.__globals__[name] + else: + fn.__globals__[name] = old[name] + + +def to_ir_module(*, meta_data): + def decorator(fn): + sig = inspect.signature(fn) + + with Context() as ctx, Location.unknown(): + _pto.register_dialect(ctx, load=True) + meta_map = _resolve_meta(meta_data) + arg_types = _resolve_arg_types(sig, meta_map) + ret_types = _resolve_ret_types(sig, meta_map) + module = Module.create() + fn_ty = func.FunctionType.get(arg_types, ret_types) + + with InsertionPoint(module.body): + ir_func = func.FuncOp(fn.__name__, fn_ty) + entry = ir_func.add_entry_block() + + with InsertionPoint(entry): + wrapped_args = [wrap_value(arg) for arg in entry.arguments] + injected = set(meta_map.keys()) + old_globals = _inject_globals(fn, meta_map) + try: + fn(*wrapped_args) + finally: + _restore_globals(fn, old_globals, injected) + + if not ret_types and not _has_func_return(entry): + func.ReturnOp([]) + + module.operation.verify() + return module + + return decorator + + +__all__ = ["to_ir_module"] diff --git a/ptodsl/compiler/jit.py b/ptodsl/compiler/jit.py new file mode 100644 index 00000000..772a8c42 --- /dev/null +++ b/ptodsl/compiler/jit.py @@ -0,0 +1,296 @@ +import ctypes +import inspect +import os +import pathlib +import subprocess +from functools import update_wrapper + +from mlir.dialects import pto as _pto +from mlir.ir import Context, 
Location + +from .ir import to_ir_module + + +def _type_repr(type_obj): + return str(type_obj).replace(" ", "").lower() + + +def _is_ptr_type(type_obj): + return "ptr" in _type_repr(type_obj) + + +def _ptr_elem_cpp_type(type_obj): + type_repr = _type_repr(type_obj) + if "f32" in type_repr: + return "float" + if "f16" in type_repr: + return "__fp16" + if "bf16" in type_repr: + return "__bf16" + if "i8" in type_repr: + return "int8_t" + if "u8" in type_repr: + return "uint8_t" + if "i16" in type_repr: + return "int16_t" + if "u16" in type_repr: + return "uint16_t" + if "i32" in type_repr: + return "int32_t" + if "u32" in type_repr: + return "uint32_t" + if "i64" in type_repr: + return "int64_t" + if "u64" in type_repr: + return "uint64_t" + return "float" + + +def _scalar_cpp_type(type_obj): + type_repr = _type_repr(type_obj) + if "i32" in type_repr: + return "int32_t" + if "i64" in type_repr or "index" in type_repr: + return "int64_t" + if "f32" in type_repr: + return "float" + if "f16" in type_repr: + return "__fp16" + return "int32_t" + + +def _scalar_ctype(type_obj): + type_repr = _type_repr(type_obj) + if "i64" in type_repr or "index" in type_repr: + return ctypes.c_int64 + if "f32" in type_repr: + return ctypes.c_float + if "f16" in type_repr: + return ctypes.c_uint16 + return ctypes.c_int32 + + +def _normalize_stream_ptr(stream_ptr): + if isinstance(stream_ptr, ctypes.c_void_p): + return stream_ptr + if isinstance(stream_ptr, int): + return ctypes.c_void_p(stream_ptr) + if hasattr(stream_ptr, "value"): + return ctypes.c_void_p(int(stream_ptr.value)) + return stream_ptr + + +class JitWrapper: + def __init__( + self, + fn, + *, + meta_data, + output_dir=None, + block_dim=20, + enable_insert_sync=True, + npu_arch="dav-2201", + ): + self._fn = fn + self._meta_data = meta_data + self._sig = inspect.signature(fn) + self._arg_types = None + self._output_dir = ( + pathlib.Path(output_dir) + if output_dir + else pathlib.Path.cwd() / ".ptodsl_jit" / fn.__name__ + ) + 
self._block_dim = block_dim + self._enable_insert_sync = enable_insert_sync + self._npu_arch = npu_arch + self._compiled = False + self._lib = None + self._lib_path = self._output_dir / "kernel.so" + update_wrapper(self, fn) + + def _artifact_paths(self): + pto_path = self._output_dir / "kernel.pto" + cpp_path = self._output_dir / "kernel.cpp" + caller_path = self._output_dir / "caller.cpp" + return pto_path, cpp_path, caller_path, self._lib_path + + def _generate_caller_cpp(self, kernel_cpp_name): + params = list(self._sig.parameters.values()) + cpp_args = [] + launch_args = [] + for param, arg_type in zip(params, self._arg_types): + if _is_ptr_type(arg_type): + cpp_args.append(f"uint8_t *{param.name}") + launch_args.append(f"({_ptr_elem_cpp_type(arg_type)} *){param.name}") + else: + cpp_t = _scalar_cpp_type(arg_type) + cpp_args.append(f"{cpp_t} {param.name}") + launch_args.append(param.name) + + wrapper_sig = ", ".join(["uint32_t blockDim", "void *stream"] + cpp_args) + kernel_call = ", ".join(launch_args) + return ( + f'#include "{kernel_cpp_name}"\n' + f"#include \n\n" + f'extern "C" void call_kernel({wrapper_sig})\n' + "{\n" + f" {self._fn.__name__}<<>>({kernel_call});\n" + "}\n" + ) + + def _compile_shared_library(self, caller_cpp_path, lib_path): + toolkit_home = os.environ.get("ASCEND_TOOLKIT_HOME") + if not toolkit_home: + raise RuntimeError("ASCEND_TOOLKIT_HOME is required to compile generated caller.cpp.") + cmd = [ + "bisheng", + f"-I{toolkit_home}/include", + "-fPIC", + "-shared", + "-D_FORTIFY_SOURCE=2", + "-O2", + "-std=c++17", + "-Wno-macro-redefined", + "-Wno-ignored-attributes", + "-fstack-protector-strong", + "-xcce", + "-Xhost-start", + "-Xhost-end", + "-mllvm", + "-cce-aicore-stack-size=0x8000", + "-mllvm", + "-cce-aicore-function-stack-size=0x8000", + "-mllvm", + "-cce-aicore-record-overflow=true", + "-mllvm", + "-cce-aicore-addr-transform", + "-mllvm", + "-cce-aicore-dcci-insert-for-scalar=false", + f"--npu-arch={self._npu_arch}", + 
"-DMEMORY_BASE", # TODO: add switch for A5 + "-std=gnu++17", + str(caller_cpp_path), + "-o", + str(lib_path), + ] + subprocess.run(cmd, check=True, cwd=str(self._output_dir)) + + def _resolve_runtime_arg_types(self): + from .ir import _resolve_arg_types, _resolve_meta + + with Context() as ctx, Location.unknown(): + _pto.register_dialect(ctx, load=True) + meta_map = _resolve_meta(self._meta_data) + return _resolve_arg_types(self._sig, meta_map) + + def _build(self): + self._output_dir.mkdir(parents=True, exist_ok=True) + pto_path, cpp_path, caller_path, lib_path = self._artifact_paths() + self._arg_types = self._resolve_runtime_arg_types() + + ir_module = to_ir_module(meta_data=self._meta_data)(self._fn) + pto_path.write_text(f"{ir_module}\n", encoding="utf-8") + + ptoas_cmd = ["ptoas"] + if self._enable_insert_sync: + ptoas_cmd.append("--enable-insert-sync") + ptoas_cmd += [str(pto_path), "-o", str(cpp_path)] + subprocess.run(ptoas_cmd, check=True, cwd=str(self._output_dir)) + + caller_path.write_text(self._generate_caller_cpp(cpp_path.name), encoding="utf-8") + self._compile_shared_library(caller_path, lib_path) + + self._lib = ctypes.CDLL(str(lib_path)) + self._lib.call_kernel.argtypes = [ctypes.c_uint32, ctypes.c_void_p] + [ + ctypes.c_void_p if _is_ptr_type(arg_type) else _scalar_ctype(arg_type) + for arg_type in self._arg_types + ] + self._compiled = True + + def _convert_ptr(self, value): + if isinstance(value, ctypes.c_void_p): + return value + if hasattr(value, "data_ptr"): + return ctypes.c_void_p(value.data_ptr()) + if isinstance(value, int): + return ctypes.c_void_p(value) + raise TypeError(f"Pointer-like argument expected, got {type(value)!r}.") + + def _prepare_call_args(self, args): + params = list(self._sig.parameters.values()) + if len(args) > len(params): + raise TypeError(f"Expected at most {len(params)} arguments, got {len(args)}.") + + filled_args = list(args) + for idx in range(len(args), len(params)): + param = params[idx] + if param.default 
is not inspect._empty: + filled_args.append(param.default) + continue + arg_type = self._arg_types[idx] + if _is_ptr_type(arg_type): + raise TypeError(f"Missing required pointer argument '{param.name}'.") + + converted = [] + for value, arg_type in zip(filled_args, self._arg_types): + if _is_ptr_type(arg_type): + converted.append(self._convert_ptr(value)) + else: + converted.append(value) + return converted + + # TODO: also allow taking named `kwargs` + def __call__(self, *args, stream_ptr=None): + if not self._compiled: + self._build() + + if stream_ptr is None: + import torch + + stream_ptr = torch.npu.current_stream()._as_parameter_ + + call_args = self._prepare_call_args(args) + self._lib.call_kernel( + ctypes.c_uint32(self._block_dim), + _normalize_stream_ptr(stream_ptr), + *call_args, + ) + return None + + def set_block_dim(self, block_dim): + if not isinstance(block_dim, int) or block_dim <= 0: + raise ValueError("`block_dim` must be a positive integer.") + self._block_dim = block_dim + return self + + @property + def library_path(self): + return str(self._lib_path) + + @property + def output_dir(self): + return str(self._output_dir) + + +def jit( + *, + meta_data, + output_dir=None, + block_dim=1, + enable_insert_sync=True, + npu_arch="dav-2201", +): + def decorator(fn): + return JitWrapper( + fn, + meta_data=meta_data, + output_dir=output_dir, + block_dim=block_dim, + enable_insert_sync=enable_insert_sync, + npu_arch=npu_arch, + ) + + return decorator + + +__all__ = ["JitWrapper", "jit"] diff --git a/ptodsl/language.py b/ptodsl/language.py deleted file mode 100644 index 61db0611..00000000 --- a/ptodsl/language.py +++ /dev/null @@ -1,435 +0,0 @@ -from contextlib import contextmanager -from typing import Sequence - -from mlir.dialects import arith, pto, scf -from mlir.ir import F16Type, F32Type, IndexType, InsertionPoint, IntegerType - - -def _unwrap(value): - if isinstance(value, Value): - return value.raw - return value - - -class Value: - # TODO: 
generalize to more comprehensive wrappers like https://github.com/makslevental/mlir-python-extras/blob/0.0.8.2/mlir/extras/dialects/ext/arith.py - def __init__(self, raw): - self.raw = raw - - def __mul__(self, other): - return Value(arith.MulIOp(_unwrap(self), _unwrap(other)).result) - - def __rmul__(self, other): - return Value(arith.MulIOp(_unwrap(other), _unwrap(self)).result) - - def __add__(self, other): - return Value(arith.AddIOp(_unwrap(self), _unwrap(other)).result) - - def __radd__(self, other): - return Value(arith.AddIOp(_unwrap(other), _unwrap(self)).result) - - def __sub__(self, other): - return Value(arith.SubIOp(_unwrap(self), _unwrap(other)).result) - - def __rsub__(self, other): - return Value(arith.SubIOp(_unwrap(other), _unwrap(self)).result) - - def __floordiv__(self, other): - return Value(arith.DivSIOp(_unwrap(self), _unwrap(other)).result) - - def __rfloordiv__(self, other): - return Value(arith.DivSIOp(_unwrap(other), _unwrap(self)).result) - - def __truediv__(self, other): - return Value(arith.DivFOp(_unwrap(self), _unwrap(other)).result) - - def __rtruediv__(self, other): - return Value(arith.DivFOp(_unwrap(other), _unwrap(self)).result) - - def __mod__(self, other): - return Value(arith.RemSIOp(_unwrap(self), _unwrap(other)).result) - - def __rmod__(self, other): - return Value(arith.RemSIOp(_unwrap(other), _unwrap(self)).result) - - @staticmethod - def _cmp(lhs, rhs, predicate): - return Value(arith.CmpIOp(predicate, _unwrap(lhs), _unwrap(rhs)).result) - - def __lt__(self, other): - return Value._cmp(self, other, arith.CmpIPredicate.slt) - - def __gt__(self, other): - return Value._cmp(self, other, arith.CmpIPredicate.sgt) - - def __le__(self, other): - return Value._cmp(self, other, arith.CmpIPredicate.sle) - - def __ge__(self, other): - return Value._cmp(self, other, arith.CmpIPredicate.sge) - - def __eq__(self, other): - return Value._cmp(self, other, arith.CmpIPredicate.eq) - - def __ne__(self, other): - return Value._cmp(self, 
other, arith.CmpIPredicate.ne) - - def __getattr__(self, item): - return getattr(self.raw, item) - - -def wrap_value(value): - if isinstance(value, Value): - return value - return Value(value) - - -def __getattr__(name): - # TODO: add more builtin dtype aliases (for example float16/bfloat16/int8/int64) - # when they are validated against PTO type support. - if name == "bool": - return IntegerType.get_signless(1) - if name == "float32": - return F32Type.get() - if name == "float16": - return F16Type.get() - if name == "int32": - return IntegerType.get_signless(32) - if name == "int16": - return IntegerType.get_signless(16) - raise AttributeError(f"module '{__name__}' has no attribute '{name}'") - - -def PtrType(dtype): - return pto.PtrType.get(dtype) - - -def TensorType(*, rank, dtype): - return pto.TensorViewType.get(rank, dtype) - - -def SubTensorType(*, shape, dtype): - return pto.PartitionTensorViewType.get(shape, dtype) - - -class TileBufConfig: - def __init__(self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null"): - # TODO: expose and validate a broader set of tile buffer knobs if PTO adds - # more layout/padding/fractal settings that should be configurable here. - self._bl = pto.BLayoutAttr.get(getattr(pto.BLayout, blayout)) - self._sl = pto.SLayoutAttr.get(getattr(pto.SLayout, slayout)) - self._pd = pto.PadValueAttr.get(getattr(pto.PadValue, pad)) - self._s_fractal_size = s_fractal_size - - @property - def attr(self): - return pto.TileBufConfigAttr.get(self._bl, self._sl, self._s_fractal_size, self._pd) - - -def _default_tile_config(memory_space, shape): - space = memory_space.upper() - # Defaults mirror the explicit configs used by the verbose matmul builder. 
- if space == "MAT": - if len(shape) >= 1 and shape[0] == 1: - return TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=pto.TileConfig.fractalABSize) - return TileBufConfig(blayout="ColMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalABSize) - if space == "LEFT": - return TileBufConfig(blayout="RowMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalABSize) - if space == "RIGHT": - return TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=pto.TileConfig.fractalABSize) - if space == "ACC": - return TileBufConfig(blayout="ColMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalCSize) - if space == "BIAS": - return TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=pto.TileConfig.fractalABSize) - if space == "VEC": - return TileBufConfig() - raise ValueError(f"Unsupported memory_space '{memory_space}' for default tile config.") - - -def TileBufType(*, shape, dtype, memory_space, valid_shape=None, config=None): - space = pto.AddressSpaceAttr.get(getattr(pto.AddressSpace, memory_space)) - if valid_shape is None: - valid_shape = shape - if config is None: - config = _default_tile_config(memory_space, shape) - cfg = config.attr if isinstance(config, TileBufConfig) else config - return pto.TileBufType.get(shape, dtype, space, valid_shape, cfg) - - -def const(value): - return Value(arith.ConstantOp(IndexType.get(), value).result) - - -def get_block_idx(): - return Value(pto.GetBlockIdxOp().result) - - -def get_subblock_idx(): - return Value(pto.GetSubBlockIdxOp().result) - - -def get_subblock_num(): - return Value(pto.GetSubBlockNumOp().result) - - -def get_block_num(): - return Value(pto.GetBlockNumOp().result) - - -def index_cast(value, index_type=IndexType): - if hasattr(index_type, "get"): - dst = index_type.get() - else: - dst = index_type - return Value(arith.IndexCastOp(dst, _unwrap(value)).result) - - -def as_tensor(tensor_type, *, ptr, shape, strides): - shape_vals = 
[_unwrap(v) for v in shape] - stride_vals = [_unwrap(v) for v in strides] - return pto.MakeTensorViewOp(tensor_type, _unwrap(ptr), shape_vals, stride_vals).result - - -def slice_view(subtensor_type, *, source, offsets, sizes): - offset_vals = [_unwrap(v) for v in offsets] - size_vals = [_unwrap(v) for v in sizes] - return pto.PartitionViewOp(subtensor_type, source, offsets=offset_vals, sizes=size_vals).result - - -@contextmanager -def vector_section(): - section = pto.SectionVectorOp() - block = section.body.blocks.append() - with InsertionPoint(block): - yield - - -@contextmanager -def cube_section(): - section = pto.SectionCubeOp() - block = section.body.blocks.append() - with InsertionPoint(block): - yield - - -def for_range(start, stop, step): - loop = scf.ForOp(_unwrap(start), _unwrap(stop), _unwrap(step)) - with InsertionPoint(loop.body): - yield Value(loop.induction_variable) - scf.YieldOp([]) - - -def alloc_tile(tile_type, *, valid_row=None, valid_col=None): - kwargs = {} - if valid_row is not None: - kwargs["valid_row"] = _unwrap(valid_row) - if valid_col is not None: - kwargs["valid_col"] = _unwrap(valid_col) - return pto.AllocTileOp(tile_type, **kwargs).result - - -def subset(source, offsets, sizes): - offset_vals = [_unwrap(v) for v in offsets] - return pto.subset(source, offset_vals, sizes) - - -def load(source, dest): - pto.TLoadOp(None, source, dest) - - -def mov(source, dest): - pto.TMovOp(None, source, dest) - - -def add(lhs, rhs, out): - pto.TAddOp(lhs, rhs, out) - - -def sub(lhs, rhs, out): - pto.TSubOp(lhs, rhs, out) - - -def div(lhs, rhs, out): - pto.TDivOp(lhs, rhs, out) - - -def mul(lhs, rhs, out): - pto.TMulOp(lhs, rhs, out) - - -def or_(lhs, rhs, out): - pto.TOrOp(lhs, rhs, out) - - -def gather(src, out, indices=None, *, mask_pattern=None): - if mask_pattern is not None: - mp = pto.MaskPatternAttr.get(getattr(pto.MaskPattern, mask_pattern)) - pto.TGatherOp(src, out, maskPattern=mp) - else: - pto.TGatherOp(src, out, indices=indices) - - -def 
exp(inp, out): - pto.TExpOp(inp, out) - - -def log(inp, out): - pto.TLogOp(inp, out) - - -def relu(inp, out): - pto.TReluOp(inp, out) - - -def abs(inp, out): - pto.TAbsOp(inp, out) - - -def sqrt(inp, out): - pto.TSqrtOp(inp, out) - - -def rsqrt(inp, out): - pto.TRsqrtOp(inp, out) - - -def reciprocal(inp, out): - pto.TRecipOp(inp, out) - - -def store(source, dest): - pto.TStoreOp(None, source, dest) - - -def matmul(lhs, rhs, out): - pto.TMatmulOp(None, lhs, rhs, out) - - -def matmul_bias(lhs, rhs, bias, out): - pto.TMatmulBiasOp(None, lhs, rhs, bias, out) - - -def matmul_acc(acc, lhs, rhs, out): - pto.TMatmulAccOp(None, acc, lhs, rhs, out) - - -def ceil_div(a, b): - return Value(arith.CeilDivSIOp(_unwrap(a), _unwrap(b)).result) - - -def div_s(a, b): - return Value(arith.DivSIOp(_unwrap(a), _unwrap(b)).result) - - -def rem_s(a, b): - return Value(arith.RemSIOp(_unwrap(a), _unwrap(b)).result) - - -def min_u(a, b): - return Value(arith.MinUIOp(_unwrap(a), _unwrap(b)).result) - - -def eq(a, b): - return Value(arith.CmpIOp(arith.CmpIPredicate.eq, _unwrap(a), _unwrap(b)).result) - - -def lt(a, b): - return Value(arith.CmpIOp(arith.CmpIPredicate.slt, _unwrap(a), _unwrap(b)).result) - - -def gt(a, b): - return Value(arith.CmpIOp(arith.CmpIPredicate.sgt, _unwrap(a), _unwrap(b)).result) - - -def ge(a, b): - return Value(arith.CmpIOp(arith.CmpIPredicate.sge, _unwrap(a), _unwrap(b)).result) - - -def select(cond, true_val, false_val): - return Value(arith.SelectOp(_unwrap(cond), _unwrap(true_val), _unwrap(false_val)).result) - - -class _IfElseBranch: - def __init__(self, if_op): - self._if_op = if_op - @contextmanager - def else_context(self): - with InsertionPoint(self._if_op.else_block): - yield - scf.YieldOp([]) - -@contextmanager -def if_context(condition, has_else=False): - if has_else: - op = scf.IfOp(_unwrap(condition), [], hasElse=True) - branch = _IfElseBranch(op) - else: - op = scf.IfOp(_unwrap(condition)) - branch = None - - with InsertionPoint(op.then_block): - yield 
branch - scf.YieldOp([]) - - -def cond(condition, then_builder, else_builder): - op = scf.IfOp(_unwrap(condition), [], hasElse=True) - with InsertionPoint(op.then_block): - then_builder() - scf.YieldOp([]) - with InsertionPoint(op.else_block): - else_builder() - scf.YieldOp([]) - return op - -def _resolve_sync_op(sync_op): - if isinstance(sync_op, str): - normalized = sync_op.strip().upper() - if not normalized.startswith("T"): - normalized = f"T{normalized}" - try: - return getattr(pto, normalized) - except AttributeError as exc: - raise ValueError(f"Unsupported sync op type '{sync_op}'.") from exc - return sync_op - - -def _resolve_event_id(event_id): - if isinstance(event_id, int): - if event_id < 0 or event_id > 7: - raise ValueError(f"event_id must be in range [0, 7], got {event_id}.") - return getattr(pto, f"EVENT_ID{event_id}") - return event_id - - -def record_event(record_op, wait_op, event_id: int|Sequence[int]=0): - if not isinstance(event_id, int): - for eid in event_id: - pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(eid)) - else: - pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) - - - -def wait_event(record_op, wait_op, event_id: int|Sequence[int]=0): - if not isinstance(event_id, int): - for eid in event_id: - pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(eid)) - else: - pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) - - -def record_wait_pair(record_op, wait_op, event_id: int|Sequence[int]=0): - rec = _resolve_sync_op(record_op) - w = _resolve_sync_op(wait_op) - ev = _resolve_event_id(event_id) - pto.record_event(rec, w, ev) - pto.wait_event(rec, w, ev) - - -def barrier(sync_op): - pto.barrier(_resolve_sync_op(sync_op)) - - -def row_sum(src, tmp, dst): - pto.TRowSumOp(src = src, tmp = tmp, dst = dst) \ No newline at end of file diff --git a/ptodsl/pto.py 
b/ptodsl/pto.py new file mode 100644 index 00000000..b69bfbb0 --- /dev/null +++ b/ptodsl/pto.py @@ -0,0 +1,6 @@ +from .api import pto as _pto +from .api.pto import __all__ + + +def __getattr__(name): + return getattr(_pto, name) diff --git a/ptodsl/scalar.py b/ptodsl/scalar.py new file mode 100644 index 00000000..07d48be8 --- /dev/null +++ b/ptodsl/scalar.py @@ -0,0 +1,6 @@ +from .api import scalar as _scalar +from .api.scalar import __all__ + + +def __getattr__(name): + return getattr(_scalar, name) diff --git a/ptodsl/test_util.py b/ptodsl/test_util.py index bf5badb3..95e1165d 100644 --- a/ptodsl/test_util.py +++ b/ptodsl/test_util.py @@ -1,19 +1,13 @@ -import os - - -DEVICE_ENV_VAR = "PTODSL_TEST_DEVICE_ID" -DEFAULT_DEVICE_ID = "0" -DEVICE_PREFIX = "npu:" - - -def get_test_device() -> str: - device_id = os.getenv(DEVICE_ENV_VAR) - if not device_id: - print( - f"Warning: {DEVICE_ENV_VAR} is not set; defaulting to {DEFAULT_DEVICE_ID}." - ) - device_id = DEFAULT_DEVICE_ID - - if device_id.startswith(DEVICE_PREFIX): - return device_id - return f"{DEVICE_PREFIX}{device_id}" +from .utils.test_util import ( + DEFAULT_DEVICE_ID, + DEVICE_ENV_VAR, + DEVICE_PREFIX, + get_test_device, +) + +__all__ = [ + "DEVICE_ENV_VAR", + "DEFAULT_DEVICE_ID", + "DEVICE_PREFIX", + "get_test_device", +] diff --git a/ptodsl/tile.py b/ptodsl/tile.py new file mode 100644 index 00000000..3d8658c2 --- /dev/null +++ b/ptodsl/tile.py @@ -0,0 +1,6 @@ +from .api import tile as _tile +from .api.tile import __all__ + + +def __getattr__(name): + return getattr(_tile, name) diff --git a/ptodsl/utils/__init__.py b/ptodsl/utils/__init__.py new file mode 100644 index 00000000..8fbbcda0 --- /dev/null +++ b/ptodsl/utils/__init__.py @@ -0,0 +1,4 @@ +from .bench import do_bench +from .test_util import get_test_device + +__all__ = ["do_bench", "get_test_device"] diff --git a/ptodsl/utils/bench.py b/ptodsl/utils/bench.py new file mode 100644 index 00000000..44496414 --- /dev/null +++ b/ptodsl/utils/bench.py @@ 
-0,0 +1,54 @@ +from typing import Callable, List, Literal, Union + + +def do_bench( + fn: Callable, + warmup_iters: int = 5, + benchmark_iters: int = 15, + aggregation: Literal["mean", "none"] = "mean", + unit: Literal["s", "ms", "us", "ns"] = "us", + flush_cache: bool = True, +) -> Union[float, List[float]]: + """ + Benchmark a given function with warmup. + + Args: + fn: Function to benchmark. + warmup_iters: Number of warmup runs. + benchmark_iters: Number of benchmark runs. + aggregation: Aggregation mode for benchmark times. + unit: Time unit of the benchmarks. + flush_cache: if we should overwrite l2 cache between every iteration + Returns: + Runtime, or list of runtimes, in specified units. + """ + import torch + import torch_npu + + start_events = [torch.npu.Event(enable_timing=True) for _ in range(benchmark_iters)] + end_events = [torch.npu.Event(enable_timing=True) for _ in range(benchmark_iters)] + + # Allocate a 256 MB tensor which we write to every iteration to flush L2 cache + # https://github.com/tile-ai/tilelang/blob/main/tilelang/profiler/bench.py#L103 + cache_size = 256 * 1024 * 1024 + cache = torch.empty((cache_size), dtype=torch.int8).npu() + + for _ in range(warmup_iters): + fn() + torch_npu.npu.synchronize() + + # It's not easy to time a kernel in a way that satisfies the following two at the same time: + # 1) Ignores cache flushing, and 2) Ignoring kernel launch overhead. Here we ignore cache flushing. 
+ for i in range(benchmark_iters): + if flush_cache: + cache.zero_() + start_events[i].record() + fn() + end_events[i].record() + + torch_npu.npu.synchronize() + factor = {"s": 1e-3, "ms": 1e0, "us": 1e3, "ns": 1e6}[unit] + times = [factor * start.elapsed_time(end) for start, end in zip(start_events, end_events)] + if aggregation == "mean": + return sum(times) / len(times) + return times diff --git a/ptodsl/utils/test_util.py b/ptodsl/utils/test_util.py new file mode 100644 index 00000000..bf5badb3 --- /dev/null +++ b/ptodsl/utils/test_util.py @@ -0,0 +1,19 @@ +import os + + +DEVICE_ENV_VAR = "PTODSL_TEST_DEVICE_ID" +DEFAULT_DEVICE_ID = "0" +DEVICE_PREFIX = "npu:" + + +def get_test_device() -> str: + device_id = os.getenv(DEVICE_ENV_VAR) + if not device_id: + print( + f"Warning: {DEVICE_ENV_VAR} is not set; defaulting to {DEFAULT_DEVICE_ID}." + ) + device_id = DEFAULT_DEVICE_ID + + if device_id.startswith(DEVICE_PREFIX): + return device_id + return f"{DEVICE_PREFIX}{device_id}" diff --git a/tests/frontend/test_add_dynamic_ir.py b/tests/frontend/test_add_dynamic_ir.py index f3ded1a2..9d661f28 100644 --- a/tests/frontend/test_add_dynamic_ir.py +++ b/tests/frontend/test_add_dynamic_ir.py @@ -2,11 +2,10 @@ from mlir.ir import IndexType from mlir.dialects import arith, func, pto as _pto, scf -from ptodsl import to_ir_module +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -import ptodsl.language as pto - -const = pto.const +const = s.const def meta_data(): @@ -51,12 +50,12 @@ def vec_add_1d_dynamic( vid = cidmul + sub_bid num_blocks = pto.get_block_num() - vid_idx = pto.index_cast(vid) - num_cores = pto.index_cast(num_blocks) - total_elements = pto.index_cast(argN) + vid_idx = s.index_cast(vid) + num_cores = s.index_cast(num_blocks) + total_elements = s.index_cast(argN) - num_tiles_global = pto.ceil_div(total_elements, c_tile) - num_tiles_per_core = pto.ceil_div(num_tiles_global, num_cores) + num_tiles_global = s.ceil_div(total_elements, 
c_tile) + num_tiles_per_core = s.ceil_div(num_tiles_global, num_cores) tile_offset_this_core = vid_idx * num_tiles_per_core with pto.vector_section(): @@ -73,13 +72,13 @@ def vec_add_1d_dynamic( need_truncate = tiles_end_this_core > num_tiles_global remaining_tiles = num_tiles_global - tile_offset_this_core - tiles_to_process = pto.select( + tiles_to_process = s.select( need_truncate, remaining_tiles, num_tiles_per_core ) elements_to_process = tiles_to_process * c_tile with pto.if_context(elements_to_process > c0): - for i in pto.for_range(c0, tiles_to_process, c1): + for i in pto.range(c0, tiles_to_process, c1): tile_offset_global = i + tile_offset_this_core offset_global = tile_offset_global * c_tile @@ -95,7 +94,7 @@ def vec_add_1d_dynamic( pto.load(sv0, tb0) pto.load(sv1, tb1) - pto.add(tb0, tb1, tb2) + tile.add(tb0, tb1, tb2) pto.store(tb2, sv2) diff --git a/tests/frontend/test_add_ir.py b/tests/frontend/test_add_ir.py index a6b5a936..d0ce8a05 100644 --- a/tests/frontend/test_add_ir.py +++ b/tests/frontend/test_add_ir.py @@ -1,9 +1,10 @@ from mlir.ir import Context, Location, Module, InsertionPoint, IntegerType from mlir.ir import F32Type, IndexType from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile +from ptodsl import scalar as s -const = pto.const +const = s.const def meta_data(): @@ -43,14 +44,14 @@ def vec_add_2d_static( cidmul = cid * sub_bnum vid = cidmul + sub_bid - v_row_idx = pto.index_cast(arg_vrow_i32) - v_col_idx = pto.index_cast(arg_vcol_i32) + v_row_idx = s.index_cast(arg_vrow_i32) + v_col_idx = s.index_cast(arg_vcol_i32) tv0 = pto.as_tensor(tensor_type, ptr=arg0, shape=[c1280, c32], strides=[c32, c1]) tv1 = pto.as_tensor(tensor_type, ptr=arg1, shape=[c1280, c32], strides=[c32, c1]) tv2 = pto.as_tensor(tensor_type, ptr=arg2, shape=[c1280, c32], strides=[c32, c1]) - vid_idx = pto.index_cast(vid) + vid_idx = s.index_cast(vid) offset_row = vid_idx * c32 sv0 = pto.slice_view(subtensor_type, source=tv0, 
offsets=[offset_row, c0], sizes=[c32, c32]) sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32]) @@ -63,32 +64,32 @@ def vec_add_2d_static( pto.load(sv0, tb0) pto.load(sv1, tb1) - pto.add(tb0, tb1, tb2) + tile.add(tb0, tb1, tb2) pto.store(tb2, sv2) def build(): - from mlir.dialects import func, arith, pto + from mlir.dialects import arith, func, pto as _pto with Context() as ctx, Location.unknown(): - pto.register_dialect(ctx, load=True) + _pto.register_dialect(ctx, load=True) m = Module.create() f32 = F32Type.get() i32 = IntegerType.get_signless(32) - ptr_f32 = pto.PtrType.get(f32) + ptr_f32 = _pto.PtrType.get(f32) - tv2_f32 = pto.TensorViewType.get(2, f32) - tile_view_32 = pto.PartitionTensorViewType.get([32, 32], f32) - vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC) - bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor) - sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox) - pd = pto.PadValueAttr.get(pto.PadValue.Null) + tv2_f32 = _pto.TensorViewType.get(2, f32) + tile_view_32 = _pto.PartitionTensorViewType.get([32, 32], f32) + vec = _pto.AddressSpaceAttr.get(_pto.AddressSpace.VEC) + bl = _pto.BLayoutAttr.get(_pto.BLayout.RowMajor) + sl = _pto.SLayoutAttr.get(_pto.SLayout.NoneBox) + pd = _pto.PadValueAttr.get(_pto.PadValue.Null) - cfg = pto.TileBufConfigAttr.get(bl, sl, 512, pd) + cfg = _pto.TileBufConfigAttr.get(bl, sl, 512, pd) - tile_buf_dynamic = pto.TileBufType.get([32, 32], f32, vec, [-1, -1], cfg) + tile_buf_dynamic = _pto.TileBufType.get([32, 32], f32, vec, [-1, -1], cfg) fn_ty = func.FunctionType.get([ptr_f32, ptr_f32, ptr_f32, i32, i32], []) with InsertionPoint(m.body): @@ -103,42 +104,42 @@ def build(): arg0, arg1, arg2, arg_vrow_i32, arg_vcol_i32 = entry.arguments - cid = pto.GetBlockIdxOp().result - sub_bid = pto.GetSubBlockIdxOp().result - sub_bnum = pto.GetSubBlockNumOp().result + cid = _pto.GetBlockIdxOp().result + sub_bid = _pto.GetSubBlockIdxOp().result + sub_bnum = _pto.GetSubBlockNumOp().result cidmul = 
arith.MulIOp(cid, sub_bnum).result vid = arith.AddIOp(cidmul, sub_bid).result v_row_idx = arith.IndexCastOp(IndexType.get(), arg_vrow_i32).result v_col_idx = arith.IndexCastOp(IndexType.get(), arg_vcol_i32).result - tv0 = pto.MakeTensorViewOp(tv2_f32, arg0, [c1280, c32], [c32, c1]).result - tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c1280, c32], [c32, c1]).result - tv2 = pto.MakeTensorViewOp(tv2_f32, arg2, [c1280, c32], [c32, c1]).result + tv0 = _pto.MakeTensorViewOp(tv2_f32, arg0, [c1280, c32], [c32, c1]).result + tv1 = _pto.MakeTensorViewOp(tv2_f32, arg1, [c1280, c32], [c32, c1]).result + tv2 = _pto.MakeTensorViewOp(tv2_f32, arg2, [c1280, c32], [c32, c1]).result vid_idx = arith.IndexCastOp(IndexType.get(), vid).result offset_row = arith.MulIOp(vid_idx, c32).result - sv0 = pto.PartitionViewOp( + sv0 = _pto.PartitionViewOp( tile_view_32, tv0, offsets=[offset_row, c0], sizes=[c32, c32] ).result - sv1 = pto.PartitionViewOp( + sv1 = _pto.PartitionViewOp( tile_view_32, tv1, offsets=[offset_row, c0], sizes=[c32, c32] ).result - sv2 = pto.PartitionViewOp( + sv2 = _pto.PartitionViewOp( tile_view_32, tv2, offsets=[offset_row, c0], sizes=[c32, c32] ).result - vec_section = pto.SectionVectorOp() + vec_section = _pto.SectionVectorOp() vec_block = vec_section.body.blocks.append() with InsertionPoint(vec_block): - tb0 = pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result - tb1 = pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result - tb2 = pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result - - pto.TLoadOp(None, sv0, tb0) - pto.TLoadOp(None, sv1, tb1) - pto.TAddOp(tb0, tb1, tb2) - pto.TStoreOp(None, tb2, sv2) + tb0 = _pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result + tb1 = _pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result + tb2 = _pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result + + 
_pto.TLoadOp(None, sv0, tb0) + _pto.TLoadOp(None, sv1, tb1) + _pto.TAddOp(tb0, tb1, tb2) + _pto.TStoreOp(None, tb2, sv2) func.ReturnOp([]) diff --git a/tests/frontend/test_matmul_dynamic_ir.py b/tests/frontend/test_matmul_dynamic_ir.py index da7f6b5f..fabe89bd 100644 --- a/tests/frontend/test_matmul_dynamic_ir.py +++ b/tests/frontend/test_matmul_dynamic_ir.py @@ -3,7 +3,8 @@ from mlir.dialects.pto import EVENT_ID0, TLOAD, TMATMUL, TMOV_M2L, TSTORE_ACC from mlir.ir import Context, F32Type, IndexType, InsertionPoint, IntegerType, Location, Module from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile +from ptodsl import scalar as s def _idx_const(v: int): @@ -60,7 +61,7 @@ def meta_data(): "tile_buf_biasTile": tile_buf_biasTile, } - const = pto.const + const = s.const @to_ir_module(meta_data=meta_data) def RunTMATMULSplitK( @@ -82,15 +83,15 @@ def RunTMATMULSplitK( cTileM = const(M) cTileN = const(N) - batch = pto.index_cast(batch_i32) + batch = s.index_cast(batch_i32) cBM = batch * cM - num_blocks = pto.index_cast(pto.get_block_num()) - batches_per_core = pto.ceil_div(batch, num_blocks) - bid = pto.index_cast(pto.get_block_idx()) + num_blocks = s.index_cast(pto.get_block_num()) + batches_per_core = s.ceil_div(batch, num_blocks) + bid = s.index_cast(pto.get_block_idx()) b_start = bid * batches_per_core b_end_unclamped = b_start + batches_per_core - b_end = pto.min_u(b_end_unclamped, batch) + b_end = s.min_u(b_end_unclamped, batch) tvA = pto.as_tensor(tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1]) tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) @@ -105,9 +106,9 @@ def RunTMATMULSplitK( cTile = pto.alloc_tile(tile_buf_cTile) biasTile = pto.alloc_tile(tile_buf_biasTile) - for b_idx in pto.for_range(b_start, b_end, c1): + for b_idx in pto.range(b_start, b_end, c1): row_off = b_idx * cM - for i in pto.for_range(c0, cIter, c1): + for i in pto.range(c0, cIter, c1): kOff = i * cBASEK svA = 
pto.slice_view( tile_view_a, @@ -135,25 +136,25 @@ def RunTMATMULSplitK( pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) - pto.mov(aMatTile, aTile) - pto.mov(bMatTile, bTile) + tile.mov(aMatTile, aTile) + tile.mov(bMatTile, bTile) with pto.if_context(isBias): - pto.mov(biasDataTile, biasTile) + tile.mov(biasDataTile, biasTile) pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - is_i0 = pto.eq(i, c0) + is_i0 = s.eq(i, c0) def _first_iter(): pto.cond( isBias, - lambda: pto.matmul_bias(aTile, bTile, biasTile, cTile), - lambda: pto.matmul(aTile, bTile, cTile), + lambda: tile.matmul_bias(aTile, bTile, biasTile, cTile), + lambda: tile.matmul(aTile, bTile, cTile), ) pto.cond( is_i0, _first_iter, - lambda: pto.matmul_acc(cTile, aTile, bTile, cTile), + lambda: tile.matmul_acc(cTile, aTile, bTile, cTile), ) pto.record_wait_pair("MATMUL", "LOAD", event_id=0) diff --git a/tests/npu/elementwise_dynamic_multicore/builder.py b/tests/npu/elementwise_dynamic_multicore/builder.py index c062bb3c..fa57f50a 100644 --- a/tests/npu/elementwise_dynamic_multicore/builder.py +++ b/tests/npu/elementwise_dynamic_multicore/builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const DTYPES = { @@ -63,12 +63,12 @@ def _1d( vid = cidmul + sub_bid num_blocks = pto.get_block_num() - vid_idx = pto.index_cast(vid) - num_cores = pto.index_cast(num_blocks) - total_elements = pto.index_cast(argN) + vid_idx = s.index_cast(vid) + num_cores = s.index_cast(num_blocks) + total_elements = s.index_cast(argN) - num_tiles_global = pto.ceil_div(total_elements, c_tile) - num_tiles_per_core = pto.ceil_div(num_tiles_global, num_cores) + num_tiles_global = s.ceil_div(total_elements, c_tile) + num_tiles_per_core = s.ceil_div(num_tiles_global, num_cores) tile_offset_this_core = vid_idx * num_tiles_per_core with pto.vector_section(): @@ -90,13 +90,13 @@ def _1d( 
tiles_end_this_core = tile_offset_this_core + num_tiles_per_core need_truncate = tiles_end_this_core > num_tiles_global remaining_tiles = num_tiles_global - tile_offset_this_core - tiles_to_process = pto.select( + tiles_to_process = s.select( need_truncate, remaining_tiles, num_tiles_per_core ) elements_to_process = tiles_to_process * c_tile with pto.if_context(elements_to_process > c0): - for i in pto.for_range(c0, tiles_to_process, c1): + for i in pto.range(c0, tiles_to_process, c1): tile_offset_global = i + tile_offset_this_core offset_global = tile_offset_global * c_tile @@ -145,15 +145,15 @@ def _2d( vid = cidmul + sub_bid num_blocks = pto.get_block_num() - vid_idx = pto.index_cast(vid) - num_cores = pto.index_cast(num_blocks) - rows = pto.index_cast(argM) - cols = pto.index_cast(argN) + vid_idx = s.index_cast(vid) + num_cores = s.index_cast(num_blocks) + rows = s.index_cast(argM) + cols = s.index_cast(argN) total_elements = rows * cols - rows_per_core = pto.ceil_div(rows, num_cores) + rows_per_core = s.ceil_div(rows, num_cores) row_start = vid_idx * rows_per_core - tiles_per_row = pto.ceil_div(cols, c_tile) + tiles_per_row = s.ceil_div(cols, c_tile) with pto.vector_section(): tv0 = pto.as_tensor( @@ -174,14 +174,14 @@ def _2d( rows_end = row_start + rows_per_core need_truncate = rows_end > rows remaining_rows = rows - row_start - rows_to_process = pto.select( + rows_to_process = s.select( need_truncate, remaining_rows, rows_per_core ) - for r in pto.for_range(c0, rows_to_process, c1): + for r in pto.range(c0, rows_to_process, c1): row_idx = r + row_start row_flat_offset = row_idx * cols - for c in pto.for_range(c0, tiles_per_row, c1): + for c in pto.range(c0, tiles_per_row, c1): col_offset = c * c_tile flat_offset = row_flat_offset + col_offset diff --git a/tests/npu/elementwise_dynamic_multicore/gen_ir.py b/tests/npu/elementwise_dynamic_multicore/gen_ir.py index 2562d76c..46c14310 100644 --- a/tests/npu/elementwise_dynamic_multicore/gen_ir.py +++ 
b/tests/npu/elementwise_dynamic_multicore/gen_ir.py @@ -8,15 +8,15 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -import ptodsl.language as pto +from ptodsl import tile from builder import build_binary_kernels _OPS = { - "add": pto.add, - "sub": pto.sub, - "mul": pto.mul, - "div": pto.div, - "or": pto.or_, + "add": tile.add, + "sub": tile.sub, + "mul": tile.mul, + "div": tile.div, + "or": tile.or_, } if __name__ == "__main__": diff --git a/tests/npu/elementwise_unary_dynamic_multicore/gen_ir.py b/tests/npu/elementwise_unary_dynamic_multicore/gen_ir.py index 90dfd7ab..b6a1d778 100644 --- a/tests/npu/elementwise_unary_dynamic_multicore/gen_ir.py +++ b/tests/npu/elementwise_unary_dynamic_multicore/gen_ir.py @@ -8,17 +8,17 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -import ptodsl.language as pto +from ptodsl import tile from unary_builder import build_unary_kernel _OPS = { - "rsqrt": pto.rsqrt, - "sqrt": pto.sqrt, - "exp": pto.exp, - "log": pto.log, - "relu": pto.relu, - "abs": pto.abs, - "reciprocal": pto.reciprocal, + "rsqrt": tile.rsqrt, + "sqrt": tile.sqrt, + "exp": tile.exp, + "log": tile.log, + "relu": tile.relu, + "abs": tile.abs, + "reciprocal": tile.reciprocal, } if __name__ == "__main__": diff --git a/tests/npu/elementwise_unary_dynamic_multicore/unary_builder.py b/tests/npu/elementwise_unary_dynamic_multicore/unary_builder.py index ac397ad5..e652d33a 100644 --- a/tests/npu/elementwise_unary_dynamic_multicore/unary_builder.py +++ b/tests/npu/elementwise_unary_dynamic_multicore/unary_builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const # 32 KB of UB _TILE_SIZE_BYTES = 32 * 1024 @@ -63,8 +63,8 @@ def _kernel( c1 = const(1) c_tile = const(elements_per_tile) - batch = pto.index_cast(batch_i32) - n_cols = pto.index_cast(n_cols_i32) + batch = s.index_cast(batch_i32) + n_cols = 
s.index_cast(n_cols_i32) with pto.vector_section(): cid = pto.get_block_idx() @@ -72,12 +72,12 @@ def _kernel( sub_bnum = pto.get_subblock_num() num_blocks = pto.get_block_num() - vid = pto.index_cast(cid * sub_bnum + sub_bid) - num_cores = pto.index_cast(num_blocks * sub_bnum) + vid = s.index_cast(cid * sub_bnum + sub_bid) + num_cores = s.index_cast(num_blocks * sub_bnum) - rows_per_core = pto.ceil_div(batch, num_cores) + rows_per_core = s.ceil_div(batch, num_cores) row_start = vid * rows_per_core - row_end = pto.min_u(row_start + rows_per_core, batch) + row_end = s.min_u(row_start + rows_per_core, batch) num_rows = row_end - row_start total_elems = batch * n_cols @@ -92,7 +92,7 @@ def _kernel( tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) tb_y = pto.alloc_tile(tile_type, valid_col=n_cols) - for row_i in pto.for_range(c0, num_rows, c1): + for row_i in pto.range(c0, num_rows, c1): gm_offset = (row_start + row_i) * n_cols sv_x = pto.slice_view( diff --git a/tests/npu/gather_dynamic_multicore/builder.py b/tests/npu/gather_dynamic_multicore/builder.py index b5bbbc29..1851ff54 100644 --- a/tests/npu/gather_dynamic_multicore/builder.py +++ b/tests/npu/gather_dynamic_multicore/builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const DTYPES = { "float32": lambda: pto.float32, @@ -88,18 +88,18 @@ def _kernel( c1 = const(1) c_tile = const(tile_length) - total_elements = pto.index_cast(argB * argN) # B * N + total_elements = s.index_cast(argB * argN) # B * N cid = pto.get_block_idx() sub_bid = pto.get_subblock_idx() sub_bnum = pto.get_subblock_num() vid = cid * sub_bnum + sub_bid num_blocks = pto.get_block_num() - vid_idx = pto.index_cast(vid) - num_cores = pto.index_cast(num_blocks) + vid_idx = s.index_cast(vid) + num_cores = s.index_cast(num_blocks) - num_tiles_global = pto.ceil_div(total_elements, c_tile) - 
num_tiles_per_core = pto.ceil_div(num_tiles_global, num_cores) + num_tiles_global = s.ceil_div(total_elements, c_tile) + num_tiles_per_core = s.ceil_div(num_tiles_global, num_cores) tile_offset_this_core = vid_idx * num_tiles_per_core with pto.vector_section(): @@ -123,13 +123,13 @@ def _kernel( tiles_end_this_core = tile_offset_this_core + num_tiles_per_core need_truncate = tiles_end_this_core > num_tiles_global remaining_tiles = num_tiles_global - tile_offset_this_core - tiles_to_process = pto.select( + tiles_to_process = s.select( need_truncate, remaining_tiles, num_tiles_per_core ) elements_to_process = tiles_to_process * c_tile with pto.if_context(elements_to_process > c0): - for i in pto.for_range(c0, tiles_to_process, c1): + for i in pto.range(c0, tiles_to_process, c1): tile_offset_global = i + tile_offset_this_core offset_global = tile_offset_global * c_tile @@ -150,9 +150,9 @@ def _kernel( pto.load(sv1, tb_idx) # gather within tile by indices - pto.gather(tb_src, tb_tmp, tb_idx) + tile.gather(tb_src, tb_tmp, tb_idx) - pto.gather(tb_tmp, tb_out, mask_pattern=mask_pattern) + tile.gather(tb_tmp, tb_out, mask_pattern=mask_pattern) sv2 = pto.slice_view( subtensor_type, diff --git a/tests/npu/gather_static_singlecore/builder.py b/tests/npu/gather_static_singlecore/builder.py index 417a76e7..2f7357c9 100644 --- a/tests/npu/gather_static_singlecore/builder.py +++ b/tests/npu/gather_static_singlecore/builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const _DTYPE_MAP = { "float32": lambda: pto.float32, @@ -90,8 +90,8 @@ def _kernel( pto.load(sv0, tb0) pto.load(sv1, tb1) - pto.gather(tb0, tb2, tb1) # index-gather: tb2[i,j] = tb0[tb1[i,j]] - pto.gather( + tile.gather(tb0, tb2, tb1) # index-gather: tb2[i,j] = tb0[tb1[i,j]] + tile.gather( tb2, tb3, mask_pattern=mask_pattern ) # mask-gather with configurable pattern diff --git 
a/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py b/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py index 631bcb94..5ea4338f 100644 --- a/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py +++ b/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const # 32 KB of UB _TILE_SIZE_BYTES = 32 * 1024 @@ -39,7 +39,7 @@ def meta_data(dtype="fp32"): def build_rowsum(fn_name="rowsum_fp32", dtype="fp32"): """ - Computes per-row sum across columns using PTO TROWSUM (pto.row_sum wrapper). + Computes per-row sum across columns using PTO TROWSUM (`tile.row_sum` wrapper). Args: x_ptr : dtype[batch * n_cols] input matrix flattened row-major @@ -62,16 +62,16 @@ def _kernel( c0 = const(0) c1 = const(1) - batch = pto.index_cast(batch_i32) - n_cols = pto.index_cast(n_cols_i32) + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) with pto.vector_section(): - bid = pto.index_cast(pto.get_block_idx()) - num_cores = pto.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) + num_cores = s.index_cast(pto.get_block_num()) - rows_per_core = pto.ceil_div(batch, num_cores) + rows_per_core = s.ceil_div(batch, num_cores) row_start = bid * rows_per_core - row_end = pto.min_u(row_start + rows_per_core, batch) + row_end = s.min_u(row_start + rows_per_core, batch) num_rows = row_end - row_start total_elems = batch * n_cols @@ -91,7 +91,7 @@ def _kernel( tile_type, valid_col=n_cols ) # scratch - for r in pto.for_range(c0, num_rows, c1): + for r in pto.range(c0, num_rows, c1): gm_offset = (row_start + r) * n_cols sv_x = pto.slice_view( @@ -110,7 +110,7 @@ def _kernel( ) pto.load(sv_x, tb_x) - pto.row_sum(tb_x, tb_tmp, tb_sum) + tile.row_sum(tb_x, tb_tmp, tb_sum) # Store the 1-element tile to y[row] pto.store(tb_sum, sv_y) From 7fa663efd2db8d9507e8a1b53a83460d4a211d58 Mon 
Sep 17 00:00:00 2001 From: learning-chip Date: Sun, 8 Mar 2026 19:22:26 +0000 Subject: [PATCH 11/53] update ptoas to https://github.com/huawei-csl/PTOAS/releases/tag/20260307 --- .github/workflows/ci.yml | 2 +- docker/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b56bf119..d77062c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: env: RELEASE_REPO: huawei-csl/PTOAS - RELEASE_TAG: 20260304 + RELEASE_TAG: 20260307 CLI_DIR: /installers/ptoas-cli PTOISA_COMMIT: 672ee54cb8905bb9f9abbe80ec26ed2054b7a0cc diff --git a/docker/README.md b/docker/README.md index edbce52a..6d13aceb 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,7 +1,7 @@ Usage: ```bash -RELEASE_TAG=20260304 +RELEASE_TAG=20260307 sudo docker build \ --build-arg RELEASE_TAG=$RELEASE_TAG \ . -t pto_dsl:$RELEASE_TAG From 80c4cb53472bfeb698c59c1fc9017075fd54517d Mon Sep 17 00:00:00 2001 From: Filip Skogh <43207511+fiskrt@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:44:33 +0100 Subject: [PATCH 12/53] feat: add `TPRINT` and `printf` ops and example (#68) * feat: add print tile example * feat: add builder file that will be fixed in future * feat: make compile script more robust, dsl now works * feat: remove cpp file * fix: compile script * fix: reorder print * feat: add printf --- examples/aot/print_tile/README.md | 6 ++ examples/aot/print_tile/caller.cpp | 9 +++ examples/aot/print_tile/compile.sh | 37 ++++++++++++ examples/aot/print_tile/print_builder.py | 74 ++++++++++++++++++++++++ examples/aot/print_tile/run_print.py | 53 +++++++++++++++++ ptodsl/api/pto.py | 2 + ptodsl/api/pto_general.py | 17 ++++++ ptodsl/api/tile.py | 4 ++ 8 files changed, 202 insertions(+) create mode 100644 examples/aot/print_tile/README.md create mode 100644 examples/aot/print_tile/caller.cpp create mode 100644 examples/aot/print_tile/compile.sh create mode 100644 
examples/aot/print_tile/print_builder.py create mode 100644 examples/aot/print_tile/run_print.py diff --git a/examples/aot/print_tile/README.md b/examples/aot/print_tile/README.md new file mode 100644 index 00000000..3335fd00 --- /dev/null +++ b/examples/aot/print_tile/README.md @@ -0,0 +1,6 @@ +Usage: + +```bash +bash compile.sh +python ./run_print.py +``` diff --git a/examples/aot/print_tile/caller.cpp b/examples/aot/print_tile/caller.cpp new file mode 100644 index 00000000..6ae57e88 --- /dev/null +++ b/examples/aot/print_tile/caller.cpp @@ -0,0 +1,9 @@ +#include "print_gen.cpp" + +extern "C" void call_kernel( + void *stream, uint8_t *x, uint8_t *y, uint8_t *z, int32_t vrow, int32_t vcol) +{ + vec_add_kernel_2d_dynamic<<<2, nullptr, stream>>>( + (float *)x, (float *)y, (float *)z, vrow, vcol + ); +} diff --git a/examples/aot/print_tile/compile.sh b/examples/aot/print_tile/compile.sh new file mode 100644 index 00000000..ce093b4c --- /dev/null +++ b/examples/aot/print_tile/compile.sh @@ -0,0 +1,37 @@ + +#!/usr/bin/env bash +set -e + +PTO_DIR="$ASCEND_HOME_PATH/include/pto" +PTO_BACKUP="$ASCEND_HOME_PATH/include/pto_hidden" +PTO_LIB_PATH="/sources/pto-isa" +[ -d "$PTO_LIB_PATH" ] || exit 0 + + +rm -f print_lib.so print_gen.cpp +python ./print_builder.py | ptoas --enable-insert-sync > print_gen.cpp + +restore() { + if [ -d "$PTO_BACKUP" ]; then + mv "$PTO_BACKUP" "$PTO_DIR" + fi +} + +# For now we have to hide the CANN built-in headers, and use the cloned pto-isa's +# c.f. 
https://gitcode.com/cann/pto-isa/issues/149 +mv "$PTO_DIR" "$PTO_BACKUP" + +# Make restore run on EXIT +trap restore EXIT + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -xcce -Xhost-start -Xhost-end \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -D_DEBUG --cce-enable-print \ + -I${ASCEND_HOME_PATH}/aarch64-linux/pkg_inc/runtime/runtime \ + -I${PTO_LIB_PATH}/include \ + -std=gnu++17 \ + ./caller.cpp \ + -o ./print_lib.so diff --git a/examples/aot/print_tile/print_builder.py b/examples/aot/print_tile/print_builder.py new file mode 100644 index 00000000..80a17f7d --- /dev/null +++ b/examples/aot/print_tile/print_builder.py @@ -0,0 +1,74 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + + +def meta_data(): + # common, reusable type declarations + dtype = pto.float32 + index_dtype = pto.int32 + ptr_type = pto.PtrType(dtype) + tensor_type = pto.TensorType(rank=2, dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[32, 32], dtype=dtype) # TODO: omit shape https://github.com/zhangstevenunity/PTOAS/issues/31 + tile_cfg = pto.TileBufConfig() + # defaults to pto.TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null") + tile_type = pto.TileBufType( + shape=[32, 32], valid_shape=[-1, -1], dtype=dtype, memory_space="VEC", config=tile_cfg) + return { + "ptr_type": ptr_type, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + } + + +@to_ir_module(meta_data=meta_data) +def vec_add_kernel_2d_dynamic( + arg0: "ptr_type", + arg1: "ptr_type", + arg2: "ptr_type", + arg_vrow_i32: "index_dtype", + arg_vcol_i32: "index_dtype" + ) -> None: + c0 = const(0) + c1 = const(1) + c32 = const(32) + c1280 = const(1280) + + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + cidmul = cid * sub_bnum + vid = cidmul + sub_bid + + v_row_idx = 
s.index_cast(arg_vrow_i32) + v_col_idx = s.index_cast(arg_vcol_i32) + + tv0 = pto.as_tensor(tensor_type, ptr=arg0, shape=[c1280, c32], strides=[c32, c1]) + tv1 = pto.as_tensor(tensor_type, ptr=arg1, shape=[c1280, c32], strides=[c32, c1]) + tv2 = pto.as_tensor(tensor_type, ptr=arg2, shape=[c1280, c32], strides=[c32, c1]) + + vid_idx = s.index_cast(vid) + offset_row = vid_idx * c32 # every core loads 32 rows of data + sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32]) + sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32]) + sv2 = pto.slice_view(subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32]) + + with pto.vector_section(): + tb0 = pto.alloc_tile(tile_type, valid_row=v_row_idx, valid_col=v_col_idx) + tb1 = pto.alloc_tile(tile_type, valid_row=v_row_idx, valid_col=v_col_idx) + tb2 = pto.alloc_tile(tile_type, valid_row=v_row_idx, valid_col=v_col_idx) + + pto.load(sv0, tb0) + pto.load(sv1, tb1) + pto.print("hello%d\n", c1) + tile.print(tb0) + tile.add(tb0, tb1, tb2) + pto.store(tb2, sv2) + + +if __name__ == "__main__": + module = vec_add_kernel_2d_dynamic + print(module) diff --git a/examples/aot/print_tile/run_print.py b/examples/aot/print_tile/run_print.py new file mode 100644 index 00000000..a0621095 --- /dev/null +++ b/examples/aot/print_tile/run_print.py @@ -0,0 +1,53 @@ +import ctypes +import torch +import torch_npu +from ptodsl.test_util import get_test_device + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def lib_to_func(lib): + def add_func( + x, + y, + z, + stream_ptr=None + ): + + vrow, vcol = 32, 32 # local tile shape hard-coded as the kernel + + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + + lib.call_kernel( + stream_ptr, + torch_to_ctypes(x), + torch_to_ctypes(y), + torch_to_ctypes(z), + vrow, vcol + ) + return add_func + + +def test_add(): + device = get_test_device() + 
torch.npu.set_device(device) + + lib_path = "./print_lib.so" + lib = ctypes.CDLL(lib_path) + add_func = lib_to_func(lib) + + shape = [1280, 32] # tensor shape hard-coded as the kernel + torch.manual_seed(0) + dtype = torch.float32 + x = torch.arange(shape[0]*shape[1], device=device, dtype=dtype).reshape(shape) + y = torch.arange(shape[0]*shape[1], device=device, dtype=dtype).reshape(shape) + z = torch.empty(shape, device=device, dtype=dtype) + + add_func(x, y, z) + torch.npu.synchronize() + +if __name__ == "__main__": + test_add() diff --git a/ptodsl/api/pto.py b/ptodsl/api/pto.py index b7bcb2c6..f2e2d0ac 100644 --- a/ptodsl/api/pto.py +++ b/ptodsl/api/pto.py @@ -12,6 +12,7 @@ slice_view, store, vector_section, + print, ) from .synchronization import barrier, record_event, record_wait_pair, wait_event from .type_def import ( @@ -51,6 +52,7 @@ "alloc_tile", "load", "store", + "print", "record_event", "wait_event", "record_wait_pair", diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index 66a9ca3c..01d31848 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -69,6 +69,22 @@ def store(source, dest): _pto.TStoreOp(None, source, dest) +def print(format, scalar): + """ + Example: + `print("hello %d\n", const(5))` + is equivalent to + `cce::printf("hello%d\n", 5);` + + NOTE: may not print if the print buffer is full from previous + prints (typical when printing big tiles). 
+ """ + if isinstance(scalar, Value): + scalar = _unwrap(scalar) + + _pto.print_(format, scalar) + + __all__ = [ "get_block_idx", "get_subblock_idx", @@ -81,4 +97,5 @@ def store(source, dest): "alloc_tile", "load", "store", + "print", ] diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index 094831ea..40643045 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -84,6 +84,10 @@ def subset(source, offsets, sizes): return _pto.subset(source, offset_vals, sizes) +def print(source): + _pto.tprint(source) + + __all__ = [ "mov", "add", From 0306b9876d27dd7f3295b6628972bfe4c7e3ea02 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:17:29 +0100 Subject: [PATCH 13/53] Matmul swizzle cleanup (#72) * port the latest minimum code from https://github.com/huawei-csl/pto-dsl/pull/70 * fix missing apis in ptodsl package * temporarily add artifacts * make swizzle count run-time parameter * update generated cpp and IR * expost swizzle_direction also as run-time parameter * fix MLIR verify * update generated ir and cpp * style compact * move nested functions out of main kernel function * note on SSA dominance * ignore compile artifacts * benchmark scripts * revert to older working builder * use more informative function name * update ptoas version * move level1_loop_mn_dynamic_tilesize to outside of main function * move out swizzle_zn and swizzle_nz util functions * plot swizzle speed-up and FLOPS ratio * remove fraction plot * ignore benchmark artifacts * reduce line counts * TODO on uni-tile-size demo * add auto-sync version of matmul swizzle * Revert "add auto-sync version of matmul swizzle" This reverts commit 446bee31bd8c505e0061907074befcbd80c204ab. 
--------- Co-authored-by: jiawei_zhuang --- .github/workflows/ci.yml | 2 +- docker/README.md | 2 +- examples/aot/matmul_swizzle/.gitignore | 3 + examples/aot/matmul_swizzle/README.md | 21 + examples/aot/matmul_swizzle/bench_matmul.py | 474 ++++++++++++++++++ examples/aot/matmul_swizzle/caller.cpp | 28 ++ examples/aot/matmul_swizzle/compile.sh | 14 + examples/aot/matmul_swizzle/matmul_builder.py | 300 +++++++++++ examples/aot/matmul_swizzle/run_matmul.py | 205 ++++++++ ptodsl/api/pto_general.py | 24 +- ptodsl/api/tile.py | 7 +- 11 files changed, 1073 insertions(+), 7 deletions(-) create mode 100644 examples/aot/matmul_swizzle/.gitignore create mode 100644 examples/aot/matmul_swizzle/README.md create mode 100644 examples/aot/matmul_swizzle/bench_matmul.py create mode 100644 examples/aot/matmul_swizzle/caller.cpp create mode 100644 examples/aot/matmul_swizzle/compile.sh create mode 100644 examples/aot/matmul_swizzle/matmul_builder.py create mode 100644 examples/aot/matmul_swizzle/run_matmul.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d77062c7..9a12fee9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: env: RELEASE_REPO: huawei-csl/PTOAS - RELEASE_TAG: 20260307 + RELEASE_TAG: 20260309 CLI_DIR: /installers/ptoas-cli PTOISA_COMMIT: 672ee54cb8905bb9f9abbe80ec26ed2054b7a0cc diff --git a/docker/README.md b/docker/README.md index 6d13aceb..e7db3e11 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,7 +1,7 @@ Usage: ```bash -RELEASE_TAG=20260307 +RELEASE_TAG=20260309 sudo docker build \ --build-arg RELEASE_TAG=$RELEASE_TAG \ . 
-t pto_dsl:$RELEASE_TAG diff --git a/examples/aot/matmul_swizzle/.gitignore b/examples/aot/matmul_swizzle/.gitignore new file mode 100644 index 00000000..baa7820a --- /dev/null +++ b/examples/aot/matmul_swizzle/.gitignore @@ -0,0 +1,3 @@ +matmul.cpp +matmul.pto +outputs diff --git a/examples/aot/matmul_swizzle/README.md b/examples/aot/matmul_swizzle/README.md new file mode 100644 index 00000000..009a4e6a --- /dev/null +++ b/examples/aot/matmul_swizzle/README.md @@ -0,0 +1,21 @@ +Usage: + +```bash +bash ./compile.sh +python ./run_matmul.py + +python ./bench_matmul.py +``` + +Benchmark outputs: +- CSV: `outputs/csv/bench_matmul.csv` +- Optional plots (if `matplotlib` is installed): `outputs/plots/flops_n{N}_k{K}.png` + +Useful benchmark options: + +``` +python ./bench_matmul.py --csv outputs/csv/my_bench.csv --plot-dir outputs/plots +python ./bench_matmul.py --m-list 512,1024,2048,4096 +python ./bench_matmul.py --warmup 10 --repeat 50 +python ./bench_matmul.py --lib ./matmul_kernel.so +``` diff --git a/examples/aot/matmul_swizzle/bench_matmul.py b/examples/aot/matmul_swizzle/bench_matmul.py new file mode 100644 index 00000000..695cc971 --- /dev/null +++ b/examples/aot/matmul_swizzle/bench_matmul.py @@ -0,0 +1,474 @@ +import argparse +import csv +import ctypes +import os +from pathlib import Path + +import torch +import torch.nn.functional as F +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +BLOCK_DIM = 24 +SWIZZLE_DIRECTION_LIST = [0, 1] +SWIZZLE_COUNT_LIST = [1, 3, 5] +NO_SWIZZLE_DIRECTION = -1 +NO_SWIZZLE_COUNT = 1 +M_LIST = [128 * i for i in range(1, 37, 4)] # 128, ..., 4224 +SHAPES_NK = [ + (4096, 4096), + (8192, 8192), + (16384, 16384), +] +N_WARMUP = 5 +N_REPEAT = 20 +DEFAULT_CSV_REL_PATH = Path("outputs") / "csv" / "bench_matmul.csv" +DEFAULT_PLOT_DIR = Path("outputs") / "plots" + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path): + lib = 
ctypes.CDLL(os.path.abspath(lib_path)) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ] + lib.call_kernel.restype = None + + def matmul_abt( + a, + b, + *, + block_dim=24, + swizzle_direction=1, + swizzle_count=3, + stream_ptr=None, + ): + if a.ndim != 2 or b.ndim != 2: + raise ValueError("matmul_abt expects 2D tensors: a[M,K], b[N,K]") + if a.shape[1] != b.shape[1]: + raise ValueError( + f"K mismatch: a.shape={tuple(a.shape)}, b.shape={tuple(b.shape)}" + ) + if a.dtype != torch.float16 or b.dtype != torch.float16: + raise ValueError("matmul_abt currently supports float16 inputs only") + + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + + m = int(a.shape[0]) + k = int(a.shape[1]) + n = int(b.shape[0]) + c = torch.empty((m, n), device=a.device, dtype=a.dtype) + + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + m, + n, + k, + swizzle_direction, + swizzle_count, + ) + return c + + return matmul_abt + + +def _parse_int_list(raw: str): + parts = [p.strip() for p in raw.split(",") if p.strip()] + if not parts: + raise ValueError("List cannot be empty.") + return [int(p) for p in parts] + + +def _parse_args(): + parser = argparse.ArgumentParser( + description=( + "Benchmark AOT matmul_abt vs torch.nn.functional.linear, " + "save CSV, and optionally plot throughput." 
+ ) + ) + parser.add_argument( + "--lib", + type=str, + default="matmul_kernel.so", + help="Path to shared library with call_kernel (default: matmul_kernel.so).", + ) + parser.add_argument( + "--csv", + type=str, + default=str(DEFAULT_CSV_REL_PATH), + help=f"Output CSV path (default: {DEFAULT_CSV_REL_PATH}).", + ) + parser.add_argument( + "--plot-dir", + type=str, + default=str(DEFAULT_PLOT_DIR), + help=f"Plot output directory (default: {DEFAULT_PLOT_DIR}).", + ) + parser.add_argument( + "--m-list", + type=str, + default=",".join(str(m) for m in M_LIST), + help="Comma-separated M values (default: script M_LIST).", + ) + parser.add_argument( + "--warmup", + type=int, + default=N_WARMUP, + help=f"Warmup iterations (default: {N_WARMUP}).", + ) + parser.add_argument( + "--repeat", + type=int, + default=N_REPEAT, + help=f"Timed iterations (default: {N_REPEAT}).", + ) + return parser.parse_args() + + +def _time_fn(fn, a_list, b_list, warmup, repeat): + for a, b in zip(a_list[:warmup], b_list[:warmup]): + fn(a, b) + torch.npu.synchronize() + + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + start.record() + for a, b in zip(a_list[warmup : warmup + repeat], b_list[warmup : warmup + repeat]): + fn(a, b) + end.record() + torch.npu.synchronize() + + elapsed_ms = start.elapsed_time(end) + return elapsed_ms * 1000.0 / repeat + + +def _swizzle_cases(): + # direction=-1 disables swizzle; treat it as one dedicated baseline case. + cases = [(NO_SWIZZLE_DIRECTION, NO_SWIZZLE_COUNT)] + for direction in SWIZZLE_DIRECTION_LIST: + if direction == NO_SWIZZLE_DIRECTION: + continue + for count in SWIZZLE_COUNT_LIST: + cases.append((direction, count)) + return cases + + +def _maybe_plot(rows, plot_dir): + try: + import matplotlib.pyplot as plt + except ImportError: + print("matplotlib not installed; skipping plot generation.") + return + + # Prefer a white-grid background style for readability in reports. 
+ style_candidates = ("seaborn-v0_8-whitegrid", "seaborn-whitegrid") + for style_name in style_candidates: + try: + plt.style.use(style_name) + break + except OSError: + continue + + plt.rcParams["figure.facecolor"] = "white" + plt.rcParams["axes.facecolor"] = "white" + + plot_dir.mkdir(parents=True, exist_ok=True) + + grouped = {} + for row in rows: + key = (row["n"], row["k"]) + grouped.setdefault(key, []).append(row) + + for (n, k), chunk in grouped.items(): + m_values = sorted({r["m"] for r in chunk}) + swizzles = sorted( + {(r["swizzle_direction"], r["swizzle_count"]) for r in chunk}, + key=lambda x: (x[0], x[1]), + ) + + linear_by_m = {} + for m in m_values: + candidates = [r for r in chunk if r["m"] == m] + linear_by_m[m] = sum(r["linear_tflops"] for r in candidates) / len(candidates) + + plt.figure(figsize=(9, 5)) + plt.plot( + m_values, + [linear_by_m[m] for m in m_values], + marker="x", + linestyle="--", + color="#111111", + label="F.linear", + ) + + cmap = plt.get_cmap("tab10") + for idx, (direction, count) in enumerate(swizzles): + series = [] + for m in m_values: + candidates = [ + r + for r in chunk + if r["m"] == m + and r["swizzle_direction"] == direction + and r["swizzle_count"] == count + ] + if not candidates: + series.append(float("nan")) + else: + series.append( + sum(r["custom_tflops"] for r in candidates) / len(candidates) + ) + is_baseline = direction == NO_SWIZZLE_DIRECTION + label = ( + "matmul_abt(no-swizzle)" + if is_baseline + else f"matmul_abt(d={direction}, c={count})" + ) + plt.plot( + m_values, + series, + marker="o", + linestyle="-", + color=cmap(idx % 10), + alpha=1.0 if is_baseline else 0.7, + label=label, + ) + + plt.title(f"TFLOPS vs M (N={n}, K={k})") + plt.xlabel("M") + plt.ylabel("TFLOPS") + plt.xlim(left=0) + plt.ylim(bottom=0) + plt.grid(alpha=0.25) + plt.legend(fontsize=8) + plt.tight_layout() + out = plot_dir / f"flops_n{n}_k{k}.png" + plt.savefig(out, dpi=160) + plt.close() + print(f"Saved plot: {out}") + + 
plt.figure(figsize=(10, 5)) + ax_left = plt.gca() + cmap = plt.get_cmap("tab10") + + for idx, (direction, count) in enumerate(swizzles): + speedup_series = [] + for m in m_values: + candidates = [ + r + for r in chunk + if r["m"] == m + and r["swizzle_direction"] == direction + and r["swizzle_count"] == count + ] + if not candidates: + speedup_series.append(float("nan")) + else: + speedup_series.append( + sum(r["speedup_vs_no_swizzle"] for r in candidates) + / len(candidates) + ) + + is_baseline = direction == NO_SWIZZLE_DIRECTION + alpha = 1.0 if is_baseline else 0.7 + color = cmap(idx % 10) + base_label = ( + "no-swizzle baseline" + if is_baseline + else f"d={direction}, c={count}" + ) + speedup_label = f"speedup {base_label}" + + ax_left.plot( + m_values, + speedup_series, + marker="o", + linestyle="-", + color=color, + alpha=alpha, + label=speedup_label, + ) + + ax_left.set_title(f"Speed-up vs no-swizzle (N={n}, K={k})") + ax_left.set_xlabel("M") + ax_left.set_ylabel("Speed-up vs no-swizzle") + ax_left.set_xlim(left=0) + ax_left.set_ylim(bottom=0) + ax_left.grid(alpha=0.25) + ax_left.legend(fontsize=8) + plt.tight_layout() + ratio_out = plot_dir / f"ratio_n{n}_k{k}.png" + plt.savefig(ratio_out, dpi=160) + plt.close() + print(f"Saved plot: {ratio_out}") + + +def main(): + args = _parse_args() + base_dir = Path(__file__).resolve().parent + device = get_test_device() + torch.npu.set_device(device) + + m_list = _parse_int_list(args.m_list) + if args.warmup < 1 or args.repeat < 1: + raise ValueError("--warmup and --repeat must be positive integers.") + + lib_path = Path(args.lib) + if not lib_path.is_absolute(): + lib_path = base_dir / lib_path + if not lib_path.exists(): + raise FileNotFoundError(f"Kernel library not found: {lib_path}") + + csv_path = Path(args.csv) + if not csv_path.is_absolute(): + csv_path = base_dir / csv_path + csv_path.parent.mkdir(parents=True, exist_ok=True) + + plot_dir = Path(args.plot_dir) + if not plot_dir.is_absolute(): + plot_dir = 
base_dir / plot_dir + + matmul_abt = load_lib(str(lib_path)) + torch.manual_seed(0) + + rows = [] + swizzle_cases = _swizzle_cases() + total_cases = len(m_list) * len(SHAPES_NK) * len(swizzle_cases) + case_idx = 0 + + for n, k in SHAPES_NK: + for m in m_list: + alloc = args.warmup + args.repeat + a_list = [torch.randn(m, k, dtype=torch.float16, device=device) for _ in range(alloc)] + b_list = [torch.randn(n, k, dtype=torch.float16, device=device) for _ in range(alloc)] + c_ref = F.linear(a_list[0], b_list[0]) + torch.npu.synchronize() + + linear_time_us = _time_fn(F.linear, a_list, b_list, args.warmup, args.repeat) + flops = 2.0 * m * n * k + linear_tflops = flops / linear_time_us / 1e6 + + print(f"\n(M,N,K)=({m},{n},{k}) F.linear={linear_tflops:.3f} TFLOPS") + + case_rows = [] + no_swizzle_time_us = None + no_swizzle_tflops = None + + for swizzle_direction, swizzle_count in swizzle_cases: + case_idx += 1 + + def _custom(a, b, _d=swizzle_direction, _c=swizzle_count): + return matmul_abt( + a, + b, + block_dim=BLOCK_DIM, + swizzle_direction=_d, + swizzle_count=_c, + ) + + c = _custom(a_list[0], b_list[0]) + torch.npu.synchronize() + max_absdiff = float((c - c_ref).abs().max().item()) + mean_absdiff = float((c - c_ref).abs().mean().item()) + custom_time_us = _time_fn(_custom, a_list, b_list, args.warmup, args.repeat) + custom_tflops = flops / custom_time_us / 1e6 + flops_fraction_vs_linear = custom_tflops / linear_tflops + + if ( + swizzle_direction == NO_SWIZZLE_DIRECTION + and swizzle_count == NO_SWIZZLE_COUNT + ): + no_swizzle_time_us = custom_time_us + no_swizzle_tflops = custom_tflops + + case_rows.append( + { + "case_idx": case_idx, + "m": m, + "n": n, + "k": k, + "block_dim": BLOCK_DIM, + "swizzle_direction": swizzle_direction, + "swizzle_count": swizzle_count, + "linear_time_us": linear_time_us, + "linear_tflops": linear_tflops, + "custom_time_us": custom_time_us, + "custom_tflops": custom_tflops, + "flops_fraction_vs_linear": flops_fraction_vs_linear, + 
"max_absdiff": max_absdiff, + "mean_absdiff": mean_absdiff, + } + ) + + if no_swizzle_time_us is None or no_swizzle_tflops is None: + raise RuntimeError( + "No no-swizzle baseline result found " + f"(direction={NO_SWIZZLE_DIRECTION}, count={NO_SWIZZLE_COUNT})." + ) + + for record in case_rows: + record["no_swizzle_time_us"] = no_swizzle_time_us + record["no_swizzle_tflops"] = no_swizzle_tflops + record["speedup_vs_no_swizzle"] = ( + no_swizzle_time_us / record["custom_time_us"] + ) + progress_idx = record.pop("case_idx") + + print( + f" [{progress_idx:03d}/{total_cases}] " + f"d={record['swizzle_direction']} c={record['swizzle_count']} " + f"custom={record['custom_tflops']:.3f} TFLOPS " + f"frac_of_linear={record['flops_fraction_vs_linear']:.3f} " + f"speedup_vs_no_swizzle={record['speedup_vs_no_swizzle']:.3f}x " + f"mean_diff={record['mean_absdiff']:.3e}" + ) + rows.append(record) + + fieldnames = [ + "m", + "n", + "k", + "block_dim", + "swizzle_direction", + "swizzle_count", + "linear_time_us", + "linear_tflops", + "custom_time_us", + "custom_tflops", + "flops_fraction_vs_linear", + "no_swizzle_time_us", + "no_swizzle_tflops", + "speedup_vs_no_swizzle", + "max_absdiff", + "mean_absdiff", + ] + with csv_path.open("w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + print(f"\nSaved benchmark CSV: {csv_path}") + + _maybe_plot(rows, plot_dir) + + +if __name__ == "__main__": + main() diff --git a/examples/aot/matmul_swizzle/caller.cpp b/examples/aot/matmul_swizzle/caller.cpp new file mode 100644 index 00000000..07774763 --- /dev/null +++ b/examples/aot/matmul_swizzle/caller.cpp @@ -0,0 +1,28 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "matmul.cpp" +#endif + +#include KERNEL_CPP + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint8_t *z, + int M, + int N, + int K, + int swizzle_direction, + int swizzle_count) +{ + 
matmul_kernel_ABt<<>>( + reinterpret_cast(x), + reinterpret_cast(y), + reinterpret_cast(z), + static_cast(M), + static_cast(N), + static_cast(K), + static_cast(swizzle_direction), + static_cast(swizzle_count)); +} diff --git a/examples/aot/matmul_swizzle/compile.sh b/examples/aot/matmul_swizzle/compile.sh new file mode 100644 index 00000000..9eb80bca --- /dev/null +++ b/examples/aot/matmul_swizzle/compile.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +rm -f matmul.pto matmul.cpp matmul_kernel.so + +python ./matmul_builder.py > matmul.pto +ptoas matmul.pto -o matmul.cpp + +bisheng -fPIC -shared -xcce -O2 -std=c++17 \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -I"${ASCEND_TOOLKIT_HOME}/include" \ + -DKERNEL_CPP="\"matmul.cpp\"" \ + ./caller.cpp \ + -o ./matmul_kernel.so diff --git a/examples/aot/matmul_swizzle/matmul_builder.py b/examples/aot/matmul_swizzle/matmul_builder.py new file mode 100644 index 00000000..a3c1eb70 --- /dev/null +++ b/examples/aot/matmul_swizzle/matmul_builder.py @@ -0,0 +1,300 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +def build(): + M_TILE = 128 + K_QTILE = 64 + K_TILE = 256 + K_DTILE = 512 + N_FULL = 256 + N_HALF = 128 + + def meta_data(): + dtype = pto.float16 + acc_dtype = pto.float32 + ptr_type = pto.PtrType(dtype) + i32 = pto.int32 + tv_a = pto.TensorType(rank=2, dtype=dtype) + tv_b = pto.TensorType(rank=2, dtype=dtype) + tv_c = pto.TensorType(rank=2, dtype=dtype) + + tile_view_a = pto.SubTensorType(shape=[M_TILE, K_DTILE], dtype=dtype) + tile_view_b_256 = pto.SubTensorType(shape=[K_TILE, N_FULL], dtype=dtype) + tile_view_b_128 = pto.SubTensorType(shape=[K_TILE, N_HALF], dtype=dtype) + tile_view_c_256 = pto.SubTensorType(shape=[M_TILE, N_FULL], dtype=dtype) + tile_view_c_128 = pto.SubTensorType(shape=[M_TILE, N_HALF], dtype=dtype) + + b_l1_cfg = pto.TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=512) + + tile_buf_a_l1 = 
pto.TileBufType(shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT") + tile_buf_b_l1_256 = pto.TileBufType(shape=[K_TILE, N_FULL], dtype=dtype, memory_space="MAT", config=b_l1_cfg) + tile_buf_b_l1_128 = pto.TileBufType(shape=[K_TILE, N_HALF], dtype=dtype, memory_space="MAT", config=b_l1_cfg) + tile_buf_a_l0 = pto.TileBufType(shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT") + tile_buf_b_l0_256 = pto.TileBufType(shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT") + tile_buf_b_l0_128 = pto.TileBufType(shape=[K_QTILE, N_HALF], dtype=dtype, memory_space="RIGHT") + tile_buf_c_256 = pto.TileBufType(shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC") + tile_buf_c_128 = pto.TileBufType(shape=[M_TILE, N_HALF], dtype=acc_dtype, memory_space="ACC") + + return { + "ptr_type": ptr_type, + "i32": i32, + "tv_a": tv_a, + "tv_b": tv_b, + "tv_c": tv_c, + "tile_view_a": tile_view_a, + "tile_view_b_256": tile_view_b_256, + "tile_view_b_128": tile_view_b_128, + "tile_view_c_256": tile_view_c_256, + "tile_view_c_128": tile_view_c_128, + "tile_buf_a_l1": tile_buf_a_l1, + "tile_buf_b_l1_256": tile_buf_b_l1_256, + "tile_buf_b_l1_128": tile_buf_b_l1_128, + "tile_buf_a_l0": tile_buf_a_l0, + "tile_buf_b_l0_256": tile_buf_b_l0_256, + "tile_buf_b_l0_128": tile_buf_b_l0_128, + "tile_buf_c_256": tile_buf_c_256, + "tile_buf_c_128": tile_buf_c_128, + } + + def swizzle_zn(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2): + tile_block_loop = (m_loop + cSwizzleM1) // cSwizzle + tile_block_span = cSwizzle * n_loop + tile_block_idx = li // tile_block_span + in_tile_block_idx = li % tile_block_span + is_last_block = tile_block_idx == (tile_block_loop - c1) + n_row_tail = m_loop - cSwizzle * tile_block_idx + n_row = s.select(is_last_block, n_row_tail, cSwizzle) + m_idx = tile_block_idx * cSwizzle + (in_tile_block_idx % n_row) + n_idx = in_tile_block_idx // n_row + odd_block = (tile_block_idx % c2) == c1 + flipped_n_idx = n_loop - n_idx - c1 + n_idx = s.select(odd_block, 
flipped_n_idx, n_idx) + return m_idx, n_idx + + def swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2): + tile_block_loop = (n_loop + cSwizzleM1) // cSwizzle + tile_block_span = cSwizzle * m_loop + tile_block_idx = li // tile_block_span + in_tile_block_idx = li % tile_block_span + is_last_block = tile_block_idx == (tile_block_loop - c1) + n_col_tail = n_loop - cSwizzle * tile_block_idx + n_col = s.select(is_last_block, n_col_tail, cSwizzle) + m_idx = in_tile_block_idx // n_col + n_idx = tile_block_idx * cSwizzle + (in_tile_block_idx % n_col) + odd_block = (tile_block_idx % c2) == c1 + flipped_m_idx = m_loop - m_idx - c1 + m_idx = s.select(odd_block, flipped_m_idx, m_idx) + return m_idx, n_idx + + def level1_loop_mn_dynamic_tilesize( + n_tile: int, + b_view_type, + c_view_type, + b_l1_type, + b_l0_type, + c_type, + m_offset, + n_offset, + k_dtile_num, + li, + core_loop, + bid, + num_blocks, + tvA, + tvB, + tvC, + ): + c0 = const(0) + c1 = const(1) + c2 = const(2) + cKT = const(K_TILE) + cKD = const(K_DTILE) + cNT = const(n_tile) + + a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] + b_l1 = [pto.alloc_tile(b_l1_type), pto.alloc_tile(b_l1_type)] + a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] + b_l0 = [pto.alloc_tile(b_l0_type), pto.alloc_tile(b_l0_type)] + c_l0 = pto.alloc_tile(c_type) + + not_first_tile = li != bid + with pto.if_context(not_first_tile): + pto.wait_event("STORE_ACC", "MATMUL", event_id=0) + + sv_a0 = pto.slice_view( + tile_view_a, + source=tvA, + offsets=[m_offset, c0], + sizes=[const(M_TILE), cKD], + ) + pto.wait_event("MOV_M2L", "LOAD", event_id=0) + pto.load(sv_a0, a_l1[0]) + pto.record_event("LOAD", "MOV_M2L", event_id=0) + + for k_idx in pto.range(c0, k_dtile_num, c1): + k_offset = k_idx * cKD + is_curr0 = (k_idx % c2) == c0 + + def level2_loop_k(curr_id, next_id, a_curr, a_next): + is_first_k_tile = k_idx == c0 + + for h in range(2): + b_evt = 2 + h + h_off = const(h * K_TILE) + sv_b = 
pto.slice_view( + b_view_type, + source=tvB, + offsets=[k_offset + h_off, n_offset], + sizes=[cKT, cNT], + ) + + pto.wait_event("MOV_M2L", "LOAD", event_id=b_evt) + pto.load(sv_b, b_l1[h]) + pto.record_event("LOAD", "MOV_M2L", event_id=b_evt) + + for quarter in range(4): + phase = h * 4 + quarter + ping = phase & 1 + a_col = const(phase * K_QTILE) + b_row = const(quarter * K_QTILE) + + pto.wait_event("MATMUL", "MOV_M2L", event_id=ping) + if phase == 0: + pto.wait_event("LOAD", "MOV_M2L", event_id=curr_id) + + tile.extract(a_curr, c0, a_col, a_l0[ping]) + if phase == 7: + pto.record_event("MOV_M2L", "LOAD", event_id=curr_id) + + if quarter == 0: + pto.wait_event("LOAD", "MOV_M2L", event_id=b_evt) + + tile.extract(b_l1[h], b_row, c0, b_l0[ping]) + pto.record_event("MOV_M2L", "MATMUL", event_id=0) + + if quarter == 3: + pto.record_event("MOV_M2L", "LOAD", event_id=b_evt) + + pto.wait_event("MOV_M2L", "MATMUL", event_id=0) + if phase == 0: + pto.cond( + is_first_k_tile, + lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), + lambda: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + ) + else: + tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) + + pto.record_event("MATMUL", "MOV_M2L", event_id=ping) + + with pto.if_context(k_idx + c1 < k_dtile_num): + sv_a_next = pto.slice_view( + tile_view_a, + source=tvA, + offsets=[m_offset, k_offset + cKD], + sizes=[const(M_TILE), cKD], + ) + pto.wait_event("MOV_M2L", "LOAD", event_id=next_id) + pto.load(sv_a_next, a_next) + pto.record_event("LOAD", "MOV_M2L", event_id=next_id) + + with pto.if_context(is_curr0, has_else=True) as branch: + level2_loop_k(0, 1, a_l1[0], a_l1[1]) + with branch.else_context(): + level2_loop_k(1, 0, a_l1[1], a_l1[0]) + + sv_c = pto.slice_view( + c_view_type, + source=tvC, + offsets=[m_offset, n_offset], + sizes=[const(M_TILE), cNT], + ) + pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) + pto.store(c_l0, sv_c) + + with pto.if_context(li + num_blocks < core_loop): + 
pto.record_event("STORE_ACC", "MATMUL", event_id=0) + + @to_ir_module(meta_data=meta_data) + def matmul_kernel_ABt( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + m_i32: "i32", + n_i32: "i32", + k_i32: "i32", + swizzle_direction_i32: "i32", + swizzle_count_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + c128 = const(M_TILE) + c256 = const(N_FULL) + c128n = const(N_HALF) + c512 = const(K_DTILE) + + m_total = s.index_cast(m_i32) + n_total = s.index_cast(n_i32) + k_total = s.index_cast(k_i32) + swizzle_direction = s.index_cast(swizzle_direction_i32) + swizzle_count = s.index_cast(swizzle_count_i32) + num_blocks = s.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) + cSwizzle = s.select(swizzle_count > c0, swizzle_count, c1) + cSwizzleM1 = cSwizzle - c1 + + n_loop = (n_total + c256 - c1) // c256 + m_loop = m_total // c128 + core_loop = n_loop * m_loop + k_dtile_num = k_total // c512 + + tvA = pto.as_tensor(tv_a, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) + tvB = pto.as_tensor(tv_b, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") + tvC = pto.as_tensor(tv_c, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + + pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) + pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1, 2, 3]) + + def level1_loop_mn(m_offset, n_offset, li): + # TODO: make a simpler version that only uses full-tile (256) branch, and reduce the types needed in meta_data + n_tile_size = s.select(n_offset + c256 > n_total, c128n, c256) + shared_args = [m_offset, n_offset, k_dtile_num, li, core_loop, bid, num_blocks, tvA, tvB, tvC] + with pto.if_context(n_tile_size == c256, has_else=True) as branch: + level1_loop_mn_dynamic_tilesize( + N_FULL, tile_view_b_256, tile_view_c_256, tile_buf_b_l1_256, tile_buf_b_l0_256, tile_buf_c_256, *shared_args) + with branch.else_context(): + level1_loop_mn_dynamic_tilesize( + 
N_HALF, tile_view_b_128, tile_view_c_128, tile_buf_b_l1_128, tile_buf_b_l0_128, tile_buf_c_128, *shared_args) + + for li in pto.range(bid, core_loop, num_blocks): + with pto.if_context(swizzle_direction == c0, has_else=True) as c0_branch: + m_idx, n_idx = swizzle_zn(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2) + level1_loop_mn(m_idx * c128, n_idx * c256, li) + + with c0_branch.else_context(): + with pto.if_context(swizzle_direction == c1, has_else=True) as c1_branch: + m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2) + level1_loop_mn(m_idx * c128, n_idx * c256, li) + + with c1_branch.else_context(): + # Default linear mapping, used when swizzle_direction is not 0/1. + m_idx = li // n_loop + n_idx = li % n_loop + level1_loop_mn(m_idx * c128, n_idx * c256, li) + + pto.wait_event("MOV_M2L", "LOAD", event_id=3) + pto.wait_event("MOV_M2L", "LOAD", event_id=2) + pto.wait_event("MOV_M2L", "LOAD", event_id=1) + pto.wait_event("MOV_M2L", "LOAD", event_id=0) + pto.wait_event("MATMUL", "MOV_M2L", event_id=0) + pto.wait_event("MATMUL", "MOV_M2L", event_id=1) + + return matmul_kernel_ABt + + +if __name__ == "__main__": + print(build()) \ No newline at end of file diff --git a/examples/aot/matmul_swizzle/run_matmul.py b/examples/aot/matmul_swizzle/run_matmul.py new file mode 100644 index 00000000..f21af14a --- /dev/null +++ b/examples/aot/matmul_swizzle/run_matmul.py @@ -0,0 +1,205 @@ +import ctypes +import os +from dataclasses import dataclass + +import torch +import torch.nn.functional as F +import torch_npu + +from ptodsl.test_util import get_test_device + + +BLOCK_DIM_LIST = [1, 20, 24] +SWIZZLE_DIRECTION_LIST = [0, 1] +SWIZZLE_COUNT_LIST = [1, 3, 5] +M_LIST = [128 * i for i in range(1, 37, 4)] # 128, ..., 4224 +SHAPES_NK = [ + (4096, 4096), + (8192, 8192), + (16384, 16384), +] +MAX_ABSDIFF_THRESHOLD = 0.5 +MEAN_ABSDIFF_THRESHOLD = 1e-4 + + +@dataclass +class CaseResult: + m: int + n: int + k: int + block_dim: int + swizzle_direction: int + 
swizzle_count: int + max_absdiff: float + mean_absdiff: float + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path): + lib = ctypes.CDLL(os.path.abspath(lib_path)) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ] + lib.call_kernel.restype = None + + def matmul_abt( + a, + b, + *, + block_dim=24, + swizzle_direction=1, + swizzle_count=3, + stream_ptr=None, + ): + if a.ndim != 2 or b.ndim != 2: + raise ValueError("matmul_abt expects 2D tensors: a[M,K], b[N,K]") + if a.shape[1] != b.shape[1]: + raise ValueError( + f"K mismatch: a.shape={tuple(a.shape)}, b.shape={tuple(b.shape)}" + ) + if a.dtype != torch.float16 or b.dtype != torch.float16: + raise ValueError("matmul_abt currently supports float16 inputs only") + + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + + m = int(a.shape[0]) + k = int(a.shape[1]) + n = int(b.shape[0]) + c = torch.empty((m, n), device=a.device, dtype=a.dtype) + + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + m, + n, + k, + swizzle_direction, + swizzle_count, + ) + return c + + return matmul_abt + + +def run_case(matmul_abt, a, b, c_ref, *, block_dim, swizzle_direction, swizzle_count): + c = matmul_abt( + a, + b, + block_dim=block_dim, + swizzle_direction=swizzle_direction, + swizzle_count=swizzle_count, + ) + torch.npu.synchronize() + return CaseResult( + m=int(a.shape[0]), + n=int(b.shape[0]), + k=int(a.shape[1]), + block_dim=block_dim, + swizzle_direction=swizzle_direction, + swizzle_count=swizzle_count, + max_absdiff=float((c - c_ref).abs().max().item()), + mean_absdiff=float((c - c_ref).abs().mean().item()), + ) + + +def test_matmul(): + device = get_test_device() + torch.npu.set_device(device) + matmul_abt = 
load_lib("./matmul_kernel.so") + + torch.manual_seed(0) + checked_cases = 0 + global_worst = None + + for m in M_LIST: + for n, k in SHAPES_NK: + a = torch.randn(m, k, dtype=torch.float16, device=device) + b = torch.randn(n, k, dtype=torch.float16, device=device) + c_ref = F.linear(a, b) + torch.npu.synchronize() + + shape_worst = None + for block_dim in BLOCK_DIM_LIST: + for swizzle_direction in SWIZZLE_DIRECTION_LIST: + for swizzle_count in SWIZZLE_COUNT_LIST: + result = run_case( + matmul_abt, + a, + b, + c_ref, + block_dim=block_dim, + swizzle_direction=swizzle_direction, + swizzle_count=swizzle_count, + ) + checked_cases += 1 + + if ( + shape_worst is None + or result.max_absdiff > shape_worst.max_absdiff + or ( + result.max_absdiff == shape_worst.max_absdiff + and result.mean_absdiff > shape_worst.mean_absdiff + ) + ): + shape_worst = result + + if ( + global_worst is None + or result.max_absdiff > global_worst.max_absdiff + or ( + result.max_absdiff == global_worst.max_absdiff + and result.mean_absdiff > global_worst.mean_absdiff + ) + ): + global_worst = result + + print( + f"(m, n, k)=({m}, {n}, {k}) " + f"worst(block_dim, swizzle_direction, swizzle_count)=" + f"({shape_worst.block_dim}, {shape_worst.swizzle_direction}, " + f"{shape_worst.swizzle_count}) " + f"max_absdiff={shape_worst.max_absdiff:.6f} " + f"mean_absdiff={shape_worst.mean_absdiff:.6f}" + ) + + print(f"checked_cases={checked_cases}") + print( + "global_worst " + f"max_absdiff={global_worst.max_absdiff:.6f} " + f"mean_absdiff={global_worst.mean_absdiff:.6f} " + f"at (m, n, k, block_dim, swizzle_direction, swizzle_count)=" + f"({global_worst.m}, {global_worst.n}, {global_worst.k}, " + f"{global_worst.block_dim}, {global_worst.swizzle_direction}, " + f"{global_worst.swizzle_count})" + ) + + if global_worst.max_absdiff > MAX_ABSDIFF_THRESHOLD: + raise AssertionError( + f"max_absdiff {global_worst.max_absdiff:.6f} exceeds " + f"threshold {MAX_ABSDIFF_THRESHOLD:.6f}" + ) + if 
global_worst.mean_absdiff > MEAN_ABSDIFF_THRESHOLD: + raise AssertionError( + f"mean_absdiff {global_worst.mean_absdiff:.6f} exceeds " + f"threshold {MEAN_ABSDIFF_THRESHOLD:.6f}" + ) + + +if __name__ == "__main__": + test_matmul() diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index 01d31848..02acb5ed 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -22,10 +22,24 @@ def get_block_num(): return Value(_pto.GetBlockNumOp().result) -def as_tensor(tensor_type, *, ptr, shape, strides): +def _resolve_layout_attr(layout): + if layout is None: + return None + if isinstance(layout, str): + return _pto.LayoutAttr.get(getattr(_pto.Layout, layout)) + return layout + + +def as_tensor(tensor_type, *, ptr, shape, strides, layout=None): shape_vals = [_unwrap(v) for v in shape] stride_vals = [_unwrap(v) for v in strides] - return _pto.MakeTensorViewOp(tensor_type, _unwrap(ptr), shape_vals, stride_vals).result + kwargs = {} + layout_attr = _resolve_layout_attr(layout) + if layout_attr is not None: + kwargs["layout"] = layout_attr + return _pto.MakeTensorViewOp( + tensor_type, _unwrap(ptr), shape_vals, stride_vals, **kwargs + ).result def slice_view(subtensor_type, *, source, offsets, sizes): @@ -52,8 +66,10 @@ def cube_section(): yield -def alloc_tile(tile_type, *, valid_row=None, valid_col=None): +def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None): kwargs = {} + if addr is not None: + kwargs["addr"] = _unwrap(addr) if valid_row is not None: kwargs["valid_row"] = _unwrap(valid_row) if valid_col is not None: @@ -98,4 +114,4 @@ def print(format, scalar): "load", "store", "print", -] +] \ No newline at end of file diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index 40643045..fc3f67a3 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -75,6 +75,10 @@ def matmul_acc(acc, lhs, rhs, out): _pto.TMatmulAccOp(None, acc, lhs, rhs, out) +def extract(source, index_row, index_col, out): + _pto.TExtractOp(src=source, 
indexRow=_unwrap(index_row), indexCol=_unwrap(index_col), dst=out) + + def row_sum(src, tmp, dst): _pto.TRowSumOp(src=src, tmp=tmp, dst=dst) @@ -106,6 +110,7 @@ def print(source): "matmul", "matmul_bias", "matmul_acc", + "extract", "row_sum", "subset", -] +] \ No newline at end of file From e7a68427ce867048ab2d43fd6847e320262bed51 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Tue, 10 Mar 2026 00:23:25 +0100 Subject: [PATCH 14/53] Test auto-sync on general-shape matmul with swizzle, compare to manual sync performance. (#73) * simpler matmul demo * remove swizzle param from simple demo * remove unused var * inline level1_loop_mn function * use one TensorType declare * rename level2_loop_k, put comments * add auto-sync variant * correct compile auto-sync version * ignore artifacts * performance benchmark * simple static swizzle --- .../aot/matmul_swizzle/simple_demo/.gitignore | 4 + .../aot/matmul_swizzle/simple_demo/README.md | 14 + .../simple_demo/bench_matmul.py | 193 ++++++++++ .../aot/matmul_swizzle/simple_demo/caller.cpp | 28 ++ .../aot/matmul_swizzle/simple_demo/compile.sh | 31 ++ .../simple_demo/run_simple_matmul.py | 205 ++++++++++ .../simple_demo/simple_matmul_builder.py | 355 ++++++++++++++++++ 7 files changed, 830 insertions(+) create mode 100644 examples/aot/matmul_swizzle/simple_demo/.gitignore create mode 100644 examples/aot/matmul_swizzle/simple_demo/README.md create mode 100644 examples/aot/matmul_swizzle/simple_demo/bench_matmul.py create mode 100644 examples/aot/matmul_swizzle/simple_demo/caller.cpp create mode 100644 examples/aot/matmul_swizzle/simple_demo/compile.sh create mode 100644 examples/aot/matmul_swizzle/simple_demo/run_simple_matmul.py create mode 100644 examples/aot/matmul_swizzle/simple_demo/simple_matmul_builder.py diff --git a/examples/aot/matmul_swizzle/simple_demo/.gitignore b/examples/aot/matmul_swizzle/simple_demo/.gitignore new file mode 100644 index 00000000..52af4f6e --- 
/dev/null +++ b/examples/aot/matmul_swizzle/simple_demo/.gitignore @@ -0,0 +1,4 @@ +simple_matmul_manual_sync.pto +simple_matmul_auto_sync.pto +simple_matmul_manual_sync.cpp +simple_matmul_auto_sync.cpp diff --git a/examples/aot/matmul_swizzle/simple_demo/README.md b/examples/aot/matmul_swizzle/simple_demo/README.md new file mode 100644 index 00000000..323bb056 --- /dev/null +++ b/examples/aot/matmul_swizzle/simple_demo/README.md @@ -0,0 +1,14 @@ +Usage: + +```bash +bash ./compile.sh +# Run both variants (default) +python ./run_simple_matmul.py + +# Or run a single variant +python ./run_simple_matmul.py --variant auto-sync +python ./run_simple_matmul.py --variant manual-sync + +# Benchmark auto-sync vs manual-sync performance. +python ./bench_matmul.py +``` diff --git a/examples/aot/matmul_swizzle/simple_demo/bench_matmul.py b/examples/aot/matmul_swizzle/simple_demo/bench_matmul.py new file mode 100644 index 00000000..d79f3a3f --- /dev/null +++ b/examples/aot/matmul_swizzle/simple_demo/bench_matmul.py @@ -0,0 +1,193 @@ +import argparse +import ctypes +import os +from pathlib import Path + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +BLOCK_DIM = 24 +M_LIST = [128 * i for i in range(1, 37, 4)] # 128, ..., 4224 +SHAPES_NK = [ + (4096, 4096), + (8192, 8192), + (16384, 16384), +] +N_WARMUP = 5 +N_REPEAT = 20 + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path): + lib = ctypes.CDLL(os.path.abspath(lib_path)) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ] + lib.call_kernel.restype = None + + def matmul_abt(a, b, *, block_dim=BLOCK_DIM, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + + m = int(a.shape[0]) + k = int(a.shape[1]) + n = int(b.shape[0]) + c = torch.empty((m, n), 
device=a.device, dtype=a.dtype) + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + m, + n, + k, + ) + return c + + return matmul_abt + + +def _parse_int_list(raw): + parts = [p.strip() for p in raw.split(",") if p.strip()] + if not parts: + raise ValueError("List cannot be empty.") + return [int(p) for p in parts] + + +def _time_us(fn, a_list, b_list, warmup, repeat): + for a, b in zip(a_list[:warmup], b_list[:warmup]): + fn(a, b) + torch.npu.synchronize() + + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + start.record() + for a, b in zip(a_list[warmup : warmup + repeat], b_list[warmup : warmup + repeat]): + fn(a, b) + end.record() + torch.npu.synchronize() + return start.elapsed_time(end) * 1000.0 / repeat + + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Benchmark simple matmul auto-sync vs manual-sync and report ratios." + ) + parser.add_argument( + "--auto-lib", + type=str, + default="./simple_matmul_auto_sync_kernel.so", + help="Path to auto-sync shared library.", + ) + parser.add_argument( + "--manual-lib", + type=str, + default="./simple_matmul_manual_sync_kernel.so", + help="Path to manual-sync shared library.", + ) + parser.add_argument( + "--m-list", + type=str, + default=",".join(str(m) for m in M_LIST), + help="Comma-separated M values (default: script M_LIST).", + ) + parser.add_argument( + "--warmup", + type=int, + default=N_WARMUP, + help=f"Warmup iterations (default: {N_WARMUP}).", + ) + parser.add_argument( + "--repeat", + type=int, + default=N_REPEAT, + help=f"Timed iterations (default: {N_REPEAT}).", + ) + return parser.parse_args() + + +def main(): + args = _parse_args() + if args.warmup < 1 or args.repeat < 1: + raise ValueError("--warmup and --repeat must be positive integers.") + + base_dir = Path(__file__).resolve().parent + + auto_lib = Path(args.auto_lib) + if not auto_lib.is_absolute(): + auto_lib = base_dir / 
auto_lib + manual_lib = Path(args.manual_lib) + if not manual_lib.is_absolute(): + manual_lib = base_dir / manual_lib + if not auto_lib.exists(): + raise FileNotFoundError(f"Auto-sync library not found: {auto_lib}") + if not manual_lib.exists(): + raise FileNotFoundError(f"Manual-sync library not found: {manual_lib}") + + device = get_test_device() + torch.npu.set_device(device) + torch.manual_seed(0) + + auto_mm = load_lib(str(auto_lib)) + manual_mm = load_lib(str(manual_lib)) + m_list = _parse_int_list(args.m_list) + + ratios = [] + print(f"auto-sync lib: {auto_lib}") + print(f"manual-sync lib: {manual_lib}") + print("") + + for n, k in SHAPES_NK: + print(f"=== N={n}, K={k} ===") + for m in m_list: + alloc = args.warmup + args.repeat + a_list = [torch.randn(m, k, dtype=torch.float16, device=device) for _ in range(alloc)] + b_list = [torch.randn(n, k, dtype=torch.float16, device=device) for _ in range(alloc)] + + auto_us = _time_us(auto_mm, a_list, b_list, args.warmup, args.repeat) + manual_us = _time_us(manual_mm, a_list, b_list, args.warmup, args.repeat) + + flops = 2.0 * m * n * k + auto_tflops = flops / auto_us / 1e6 + manual_tflops = flops / manual_us / 1e6 + auto_vs_manual = manual_us / auto_us + manual_vs_auto = auto_us / manual_us + ratios.append(auto_vs_manual) + + print( + f"(M,N,K)=({m},{n},{k}) " + f"auto={auto_tflops:.3f}TF, manual={manual_tflops:.3f}TF, " + f"ratio(auto/manual)={auto_vs_manual:.3f}x " + f"(manual/auto={manual_vs_auto:.3f}x)" + ) + print("") + + avg_ratio = sum(ratios) / len(ratios) + min_ratio = min(ratios) + max_ratio = max(ratios) + print("=== Summary ===") + print(f"avg ratio(auto/manual): {avg_ratio:.3f}x") + print(f"min ratio(auto/manual): {min_ratio:.3f}x") + print(f"max ratio(auto/manual): {max_ratio:.3f}x") + if avg_ratio >= 1.0: + print(f"auto-sync is faster on average by {(avg_ratio - 1.0) * 100.0:.2f}%") + else: + print(f"manual-sync is faster on average by {(1.0 - avg_ratio) * 100.0:.2f}%") + + +if __name__ == "__main__": 
+ main() diff --git a/examples/aot/matmul_swizzle/simple_demo/caller.cpp b/examples/aot/matmul_swizzle/simple_demo/caller.cpp new file mode 100644 index 00000000..ac10bd3a --- /dev/null +++ b/examples/aot/matmul_swizzle/simple_demo/caller.cpp @@ -0,0 +1,28 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "matmul.cpp" +#endif + +#ifndef KERNEL_FN +#define KERNEL_FN matmul_kernel_ABt +#endif + +#include KERNEL_CPP + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint8_t *z, + int M, + int N, + int K) +{ + KERNEL_FN<<>>( + reinterpret_cast(x), + reinterpret_cast(y), + reinterpret_cast(z), + static_cast(M), + static_cast(N), + static_cast(K)); +} diff --git a/examples/aot/matmul_swizzle/simple_demo/compile.sh b/examples/aot/matmul_swizzle/simple_demo/compile.sh new file mode 100644 index 00000000..a9343060 --- /dev/null +++ b/examples/aot/matmul_swizzle/simple_demo/compile.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -euo pipefail + +rm -f \ + simple_matmul_auto_sync.pto simple_matmul_manual_sync.pto \ + simple_matmul_auto_sync.cpp simple_matmul_manual_sync.cpp \ + simple_matmul_auto_sync_kernel.so simple_matmul_manual_sync_kernel.so + +# Manual-sync kernel variant: explicit record/wait events in PTO. +python ./simple_matmul_builder.py --manual-sync > simple_matmul_manual_sync.pto +ptoas simple_matmul_manual_sync.pto -o simple_matmul_manual_sync.cpp + +bisheng -fPIC -shared -xcce -O2 -std=c++17 \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -I"${ASCEND_TOOLKIT_HOME}/include" \ + -DKERNEL_CPP="\"simple_matmul_manual_sync.cpp\"" \ + -DKERNEL_FN=matmul_kernel_ABt \ + ./caller.cpp \ + -o ./simple_matmul_manual_sync_kernel.so + +# Auto-sync kernel variant: no explicit record/wait events in PTO. 
+python ./simple_matmul_builder.py > simple_matmul_auto_sync.pto +ptoas --enable-insert-sync simple_matmul_auto_sync.pto -o simple_matmul_auto_sync.cpp + +bisheng -fPIC -shared -xcce -O2 -std=c++17 \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -I"${ASCEND_TOOLKIT_HOME}/include" \ + -DKERNEL_CPP="\"simple_matmul_auto_sync.cpp\"" \ + -DKERNEL_FN=matmul_kernel_ABt_autosync \ + ./caller.cpp \ + -o ./simple_matmul_auto_sync_kernel.so diff --git a/examples/aot/matmul_swizzle/simple_demo/run_simple_matmul.py b/examples/aot/matmul_swizzle/simple_demo/run_simple_matmul.py new file mode 100644 index 00000000..16940b83 --- /dev/null +++ b/examples/aot/matmul_swizzle/simple_demo/run_simple_matmul.py @@ -0,0 +1,205 @@ +import ctypes +import os +import argparse +from dataclasses import dataclass + +import torch +import torch.nn.functional as F +import torch_npu + +from ptodsl.test_util import get_test_device + + +BLOCK_DIM_LIST = [1, 20, 24] +M_LIST = [128 * i for i in range(1, 37, 4)] # 128, ..., 4224 +SHAPES_NK = [ + (4096, 4096), + (8192, 8192), + (16384, 16384), +] +MAX_ABSDIFF_THRESHOLD = 0.5 +MEAN_ABSDIFF_THRESHOLD = 1e-4 + + +@dataclass +class CaseResult: + m: int + n: int + k: int + block_dim: int + max_absdiff: float + mean_absdiff: float + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path): + lib = ctypes.CDLL(os.path.abspath(lib_path)) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int + ] + lib.call_kernel.restype = None + + def matmul_abt( + a, + b, + *, + block_dim=24, + stream_ptr=None, + ): + if a.ndim != 2 or b.ndim != 2: + raise ValueError("matmul_abt expects 2D tensors: a[M,K], b[N,K]") + if a.shape[1] != b.shape[1]: + raise ValueError( + f"K mismatch: a.shape={tuple(a.shape)}, b.shape={tuple(b.shape)}" + ) + if a.dtype != torch.float16 or b.dtype != torch.float16: + raise 
ValueError("matmul_abt currently supports float16 inputs only") + + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + + m = int(a.shape[0]) + k = int(a.shape[1]) + n = int(b.shape[0]) + c = torch.empty((m, n), device=a.device, dtype=a.dtype) + + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + m, + n, + k + ) + return c + + return matmul_abt + + +def run_case(matmul_abt, a, b, c_ref, *, block_dim): + c = matmul_abt( + a, + b, + block_dim=block_dim + ) + torch.npu.synchronize() + return CaseResult( + m=int(a.shape[0]), + n=int(b.shape[0]), + k=int(a.shape[1]), + block_dim=block_dim, + max_absdiff=float((c - c_ref).abs().max().item()), + mean_absdiff=float((c - c_ref).abs().mean().item()), + ) + + +def test_matmul(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--variant", + choices=["auto-sync", "manual-sync", "all"], + default="all", + help="Which kernel variant to run.", + ) + args = parser.parse_args() + + device = get_test_device() + torch.npu.set_device(device) + + variants = { + "auto-sync": "./simple_matmul_auto_sync_kernel.so", + "manual-sync": "./simple_matmul_manual_sync_kernel.so", + } + if args.variant == "all": + selected = [("auto-sync", variants["auto-sync"]), ("manual-sync", variants["manual-sync"])] + else: + selected = [(args.variant, variants[args.variant])] + + torch.manual_seed(0) + for variant_name, lib_path in selected: + print(f"\n=== Running variant: {variant_name} ({lib_path}) ===") + matmul_abt = load_lib(lib_path) + + checked_cases = 0 + global_worst = None + for m in M_LIST: + for n, k in SHAPES_NK: + a = torch.randn(m, k, dtype=torch.float16, device=device) + b = torch.randn(n, k, dtype=torch.float16, device=device) + c_ref = F.linear(a, b) + torch.npu.synchronize() + + shape_worst = None + for block_dim in BLOCK_DIM_LIST: + result = run_case( + matmul_abt, + a, + b, + c_ref, + block_dim=block_dim + ) + checked_cases += 1 + + 
if ( + shape_worst is None + or result.max_absdiff > shape_worst.max_absdiff + or ( + result.max_absdiff == shape_worst.max_absdiff + and result.mean_absdiff > shape_worst.mean_absdiff + ) + ): + shape_worst = result + + if ( + global_worst is None + or result.max_absdiff > global_worst.max_absdiff + or ( + result.max_absdiff == global_worst.max_absdiff + and result.mean_absdiff > global_worst.mean_absdiff + ) + ): + global_worst = result + + print( + f"(m, n, k)=({m}, {n}, {k}) " + f"worst(block_dim)={shape_worst.block_dim} " + f"max_absdiff={shape_worst.max_absdiff:.6f} " + f"mean_absdiff={shape_worst.mean_absdiff:.6f}" + ) + + print(f"[{variant_name}] checked_cases={checked_cases}") + print( + f"[{variant_name}] global_worst " + f"max_absdiff={global_worst.max_absdiff:.6f} " + f"mean_absdiff={global_worst.mean_absdiff:.6f} " + f"at (m, n, k, block_dim)=" + f"({global_worst.m}, {global_worst.n}, {global_worst.k}, " + f"{global_worst.block_dim})" + ) + + if global_worst.max_absdiff > MAX_ABSDIFF_THRESHOLD: + raise AssertionError( + f"[{variant_name}] max_absdiff {global_worst.max_absdiff:.6f} exceeds " + f"threshold {MAX_ABSDIFF_THRESHOLD:.6f}" + ) + if global_worst.mean_absdiff > MEAN_ABSDIFF_THRESHOLD: + raise AssertionError( + f"[{variant_name}] mean_absdiff {global_worst.mean_absdiff:.6f} exceeds " + f"threshold {MEAN_ABSDIFF_THRESHOLD:.6f}" + ) + + +if __name__ == "__main__": + test_matmul() diff --git a/examples/aot/matmul_swizzle/simple_demo/simple_matmul_builder.py b/examples/aot/matmul_swizzle/simple_demo/simple_matmul_builder.py new file mode 100644 index 00000000..4439d74e --- /dev/null +++ b/examples/aot/matmul_swizzle/simple_demo/simple_matmul_builder.py @@ -0,0 +1,355 @@ +import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +def build(manual_sync: bool = False): + M_TILE = 128 + K_QTILE = 64 + K_TILE = 256 + K_DTILE = 512 + N_FULL = 256 + # Hard-coded build-time swizzle config for this 
simple demo. + # Direction=1 (NZ), count=5. + SWIZZLE_COUNT = 5 + + def meta_data(): + dtype = pto.float16 + acc_dtype = pto.float32 + ptr_type = pto.PtrType(dtype) + i32 = pto.int32 + tv_2d = pto.TensorType(rank=2, dtype=dtype) + + # TODO: omit shape for `SubTensorType`, can merge into one type https://github.com/zhangstevenunity/PTOAS/issues/31 + tile_view_a = pto.SubTensorType(shape=[M_TILE, K_DTILE], dtype=dtype) + tile_view_b_256 = pto.SubTensorType(shape=[K_TILE, N_FULL], dtype=dtype) + tile_view_c_256 = pto.SubTensorType(shape=[M_TILE, N_FULL], dtype=dtype) + + b_l1_cfg = pto.TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=512) + + tile_buf_a_l1 = pto.TileBufType(shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT") + tile_buf_b_l1_256 = pto.TileBufType(shape=[K_TILE, N_FULL], dtype=dtype, memory_space="MAT", config=b_l1_cfg) + tile_buf_a_l0 = pto.TileBufType(shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT") + tile_buf_b_l0_256 = pto.TileBufType(shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT") + tile_buf_c_256 = pto.TileBufType(shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC") + + return { + "ptr_type": ptr_type, + "i32": i32, + "tv_2d": tv_2d, + "tile_view_a": tile_view_a, + "tile_view_b_256": tile_view_b_256, + "tile_view_c_256": tile_view_c_256, + "tile_buf_a_l1": tile_buf_a_l1, + "tile_buf_b_l1_256": tile_buf_b_l1_256, + "tile_buf_a_l0": tile_buf_a_l0, + "tile_buf_b_l0_256": tile_buf_b_l0_256, + "tile_buf_c_256": tile_buf_c_256 + } + + def swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2): + tile_block_loop = (n_loop + cSwizzleM1) // cSwizzle + tile_block_span = cSwizzle * m_loop + tile_block_idx = li // tile_block_span + in_tile_block_idx = li % tile_block_span + is_last_block = tile_block_idx == (tile_block_loop - c1) + n_col_tail = n_loop - cSwizzle * tile_block_idx + n_col = s.select(is_last_block, n_col_tail, cSwizzle) + m_idx = in_tile_block_idx // n_col + n_idx = tile_block_idx * 
cSwizzle + (in_tile_block_idx % n_col) + odd_block = (tile_block_idx % c2) == c1 + flipped_m_idx = m_loop - m_idx - c1 + m_idx = s.select(odd_block, flipped_m_idx, m_idx) + return m_idx, n_idx + + @to_ir_module(meta_data=meta_data) + def matmul_kernel_ABt( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + m_i32: "i32", + n_i32: "i32", + k_i32: "i32" + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + c128 = const(M_TILE) + c256 = const(N_FULL) + c512 = const(K_DTILE) + + m_total = s.index_cast(m_i32) + n_total = s.index_cast(n_i32) + k_total = s.index_cast(k_i32) + num_blocks = s.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) + + n_loop = (n_total + c256 - c1) // c256 + m_loop = m_total // c128 + core_loop = n_loop * m_loop + k_dtile_num = k_total // c512 + cSwizzle = const(SWIZZLE_COUNT) + cSwizzleM1 = cSwizzle - c1 + + tvA = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) + tvB = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") + tvC = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + + a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] + b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] + a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] + b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] + c_l0 = pto.alloc_tile(tile_buf_c_256) + + pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) + pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1, 2, 3]) + + for li in pto.range(bid, core_loop, num_blocks): + # Build-time fixed swizzle configuration: direction=1 (NZ), count=5. 
+ m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2) + m_offset = m_idx * c128 + n_offset = n_idx * c256 + cKT = const(K_TILE) + cKD = const(K_DTILE) + cNT = const(N_FULL) + + not_first_tile = li != bid + with pto.if_context(not_first_tile): + pto.wait_event("STORE_ACC", "MATMUL", event_id=0) + + sv_a0 = pto.slice_view( + tile_view_a, + source=tvA, + offsets=[m_offset, c0], + sizes=[const(M_TILE), cKD], + ) + pto.wait_event("MOV_M2L", "LOAD", event_id=0) + pto.load(sv_a0, a_l1[0]) + pto.record_event("LOAD", "MOV_M2L", event_id=0) + + for k_idx in pto.range(c0, k_dtile_num, c1): + k_offset = k_idx * cKD + + def run_loop_k(curr_id, next_id, a_curr, a_next): + # NOTE: here declare nested function so we can reuse for double-buffering + is_first_k_tile = k_idx == c0 + + for h in range(2): + b_evt = 2 + h + h_off = const(h * K_TILE) + sv_b = pto.slice_view( + tile_view_b_256, + source=tvB, + offsets=[k_offset + h_off, n_offset], + sizes=[cKT, cNT], + ) + + pto.wait_event("MOV_M2L", "LOAD", event_id=b_evt) + pto.load(sv_b, b_l1[h]) + pto.record_event("LOAD", "MOV_M2L", event_id=b_evt) + + for quarter in range(4): + # NOTE: here is native Python loop, treats as build-time loop unrolling + phase = h * 4 + quarter + ping = phase & 1 + a_col = const(phase * K_QTILE) + b_row = const(quarter * K_QTILE) + + pto.wait_event("MATMUL", "MOV_M2L", event_id=ping) + if phase == 0: + pto.wait_event("LOAD", "MOV_M2L", event_id=curr_id) + + tile.extract(a_curr, c0, a_col, a_l0[ping]) + if phase == 7: + pto.record_event("MOV_M2L", "LOAD", event_id=curr_id) + + if quarter == 0: + pto.wait_event("LOAD", "MOV_M2L", event_id=b_evt) + + tile.extract(b_l1[h], b_row, c0, b_l0[ping]) + pto.record_event("MOV_M2L", "MATMUL", event_id=0) + + if quarter == 3: + pto.record_event("MOV_M2L", "LOAD", event_id=b_evt) + + pto.wait_event("MOV_M2L", "MATMUL", event_id=0) + if phase == 0: + pto.cond( + is_first_k_tile, + lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), + lambda: 
tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + ) + else: + tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) + + pto.record_event("MATMUL", "MOV_M2L", event_id=ping) + + with pto.if_context(k_idx + c1 < k_dtile_num): + sv_a_next = pto.slice_view( + tile_view_a, + source=tvA, + offsets=[m_offset, k_offset + cKD], + sizes=[const(M_TILE), cKD], + ) + pto.wait_event("MOV_M2L", "LOAD", event_id=next_id) + pto.load(sv_a_next, a_next) + pto.record_event("LOAD", "MOV_M2L", event_id=next_id) + + is_curr0 = (k_idx % c2) == c0 + with pto.if_context(is_curr0, has_else=True) as branch: + run_loop_k(0, 1, a_l1[0], a_l1[1]) + with branch.else_context(): + run_loop_k(1, 0, a_l1[1], a_l1[0]) + + sv_c = pto.slice_view( + tile_view_c_256, + source=tvC, + offsets=[m_offset, n_offset], + sizes=[const(M_TILE), cNT], + ) + pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) + pto.store(c_l0, sv_c) + + with pto.if_context(li + num_blocks < core_loop): + pto.record_event("STORE_ACC", "MATMUL", event_id=0) + + pto.wait_event("MOV_M2L", "LOAD", event_id=3) + pto.wait_event("MOV_M2L", "LOAD", event_id=2) + pto.wait_event("MOV_M2L", "LOAD", event_id=1) + pto.wait_event("MOV_M2L", "LOAD", event_id=0) + pto.wait_event("MATMUL", "MOV_M2L", event_id=0) + pto.wait_event("MATMUL", "MOV_M2L", event_id=1) + + @to_ir_module(meta_data=meta_data) + def matmul_kernel_ABt_autosync( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + m_i32: "i32", + n_i32: "i32", + k_i32: "i32" + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + c128 = const(M_TILE) + c256 = const(N_FULL) + c512 = const(K_DTILE) + + m_total = s.index_cast(m_i32) + n_total = s.index_cast(n_i32) + k_total = s.index_cast(k_i32) + num_blocks = s.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) + + n_loop = (n_total + c256 - c1) // c256 + m_loop = m_total // c128 + core_loop = n_loop * m_loop + k_dtile_num = k_total // c512 + cSwizzle = const(SWIZZLE_COUNT) 
+ cSwizzleM1 = cSwizzle - c1 + + tvA = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) + tvB = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") + tvC = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + + a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] + b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] + a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] + b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] + c_l0 = pto.alloc_tile(tile_buf_c_256) + + for li in pto.range(bid, core_loop, num_blocks): + # Build-time fixed swizzle configuration: direction=1 (NZ), count=5. + m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2) + m_offset = m_idx * c128 + n_offset = n_idx * c256 + cKT = const(K_TILE) + cKD = const(K_DTILE) + cNT = const(N_FULL) + + sv_a0 = pto.slice_view( + tile_view_a, + source=tvA, + offsets=[m_offset, c0], + sizes=[const(M_TILE), cKD], + ) + pto.load(sv_a0, a_l1[0]) + + for k_idx in pto.range(c0, k_dtile_num, c1): + k_offset = k_idx * cKD + + def run_loop_k(curr_id, next_id, a_curr, a_next): + # NOTE: here declare nested function so we can reuse for double-buffering + is_first_k_tile = k_idx == c0 + + for h in range(2): + h_off = const(h * K_TILE) + sv_b = pto.slice_view( + tile_view_b_256, + source=tvB, + offsets=[k_offset + h_off, n_offset], + sizes=[cKT, cNT], + ) + pto.load(sv_b, b_l1[h]) + + for quarter in range(4): + # NOTE: here is native Python loop, treats as build-time loop unrolling + phase = h * 4 + quarter + ping = phase & 1 + a_col = const(phase * K_QTILE) + b_row = const(quarter * K_QTILE) + + tile.extract(a_curr, c0, a_col, a_l0[ping]) + tile.extract(b_l1[h], b_row, c0, b_l0[ping]) + + if phase == 0: + pto.cond( + is_first_k_tile, + lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), + lambda: 
tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + ) + else: + tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) + + with pto.if_context(k_idx + c1 < k_dtile_num): + sv_a_next = pto.slice_view( + tile_view_a, + source=tvA, + offsets=[m_offset, k_offset + cKD], + sizes=[const(M_TILE), cKD], + ) + pto.load(sv_a_next, a_next) + + is_curr0 = (k_idx % c2) == c0 + with pto.if_context(is_curr0, has_else=True) as branch: + run_loop_k(0, 1, a_l1[0], a_l1[1]) + with branch.else_context(): + run_loop_k(1, 0, a_l1[1], a_l1[0]) + + sv_c = pto.slice_view( + tile_view_c_256, + source=tvC, + offsets=[m_offset, n_offset], + sizes=[const(M_TILE), cNT], + ) + pto.store(c_l0, sv_c) + + if manual_sync: + return matmul_kernel_ABt + return matmul_kernel_ABt_autosync + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--manual-sync", + action="store_true", + help="Emit explicit record/wait events instead of relying on auto sync insertion.", + ) + args = parser.parse_args() + print(build(manual_sync=args.manual_sync)) From 79749ebf8b7834bb62c54b4175dc272454dc3966 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Tue, 10 Mar 2026 11:06:51 +0100 Subject: [PATCH 15/53] fix pip install problem without -e, and cover both editable and non-editable install by CI (#77) * fix pip install problem without -e * update CI to run pip install both with or without -e * more clear CI job naming --------- Co-authored-by: jiawei_zhuang --- .github/workflows/ci.yml | 18 ++++++++++++++++-- ptodsl/pyproject.toml | 7 ++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9a12fee9..b1cbe8e3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,14 +9,23 @@ on: jobs: test: + name: test (${{ matrix.arch }}, ${{ matrix.install-mode }}) strategy: fail-fast: false matrix: include: - arch: x86_64 runs-on: ubuntu-24.04 + 
install-mode: standard + - arch: x86_64 + runs-on: ubuntu-24.04 + install-mode: editable - arch: aarch64 runs-on: ubuntu-24.04-arm + install-mode: standard + - arch: aarch64 + runs-on: ubuntu-24.04-arm + install-mode: editable runs-on: ${{ matrix.runs-on }} container: @@ -63,8 +72,13 @@ jobs: git clone https://gitcode.com/cann/pto-isa.git /sources/pto-isa cd /sources/pto-isa && git checkout ${PTOISA_COMMIT} - - name: Install ptodsl - run: pip install -e ./ptodsl + - name: Install ptodsl (${{ matrix.install-mode }}) + run: | + if [ "${{ matrix.install-mode }}" = "standard" ]; then + pip install ./ptodsl + else + pip install -e ./ptodsl + fi - name: Run frontend tests run: pytest -v ./tests/frontend diff --git a/ptodsl/pyproject.toml b/ptodsl/pyproject.toml index a788ec3c..3d0c169e 100644 --- a/ptodsl/pyproject.toml +++ b/ptodsl/pyproject.toml @@ -15,7 +15,12 @@ authors = [ dev = ["matplotlib"] [tool.setuptools] -packages = ["ptodsl"] +packages = [ + "ptodsl", + "ptodsl.api", + "ptodsl.compiler", + "ptodsl.utils", +] [tool.setuptools.package-dir] ptodsl = "." 
From f646f94c24873be97f517a4eb3c16f6be237b332 Mon Sep 17 00:00:00 2001 From: learning-chip Date: Tue, 10 Mar 2026 13:59:06 +0000 Subject: [PATCH 16/53] add matplotlib and pandas to docker image (commonly used by benchmark scripts) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 788ef7a5..d93eef83 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -13,7 +13,7 @@ RUN pip install --no-cache-dir torch==2.9.0 --index-url https://download.pytorch # extra util RUN pip install --no-cache-dir \ pytest pybind11 nanobind setuptools wheel \ - ipython jupyterlab + ipython jupyterlab matplotlib pandas # cache above layers unrelated to ptoas version change From 8875bca9a97443422a9b368f6c6dd1b87902dce4 Mon Sep 17 00:00:00 2001 From: Mirko De Vita <61700769+MirkoDeVita98@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:22:54 +0100 Subject: [PATCH 17/53] added min and max for tiles with tests (#78) Co-authored-by: mirkodevita --- ptodsl/api/tile.py | 8 ++++++++ .../.gitignore | 0 .../README.md | 2 ++ .../binary_builder.py} | 0 .../caller.py | 2 ++ .../clean.sh | 0 .../compile.sh | 0 .../gen_ir.py | 4 +++- .../test_binary_builder.py} | 2 ++ 9 files changed, 17 insertions(+), 1 deletion(-) rename tests/npu/{elementwise_dynamic_multicore => elementwise_binary_dynamic_multicore}/.gitignore (100%) rename tests/npu/{elementwise_dynamic_multicore => elementwise_binary_dynamic_multicore}/README.md (94%) rename tests/npu/{elementwise_dynamic_multicore/builder.py => elementwise_binary_dynamic_multicore/binary_builder.py} (100%) rename tests/npu/{elementwise_dynamic_multicore => elementwise_binary_dynamic_multicore}/caller.py (97%) rename tests/npu/{elementwise_dynamic_multicore => elementwise_binary_dynamic_multicore}/clean.sh (100%) rename tests/npu/{elementwise_dynamic_multicore => elementwise_binary_dynamic_multicore}/compile.sh (100%) rename tests/npu/{elementwise_dynamic_multicore => 
elementwise_binary_dynamic_multicore}/gen_ir.py (90%) rename tests/npu/{elementwise_dynamic_multicore/test_builder.py => elementwise_binary_dynamic_multicore/test_binary_builder.py} (98%) diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index fc3f67a3..f15650fe 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -27,6 +27,14 @@ def or_(lhs, rhs, out): _pto.TOrOp(lhs, rhs, out) +def min(lhs, rhs, out): + _pto.TMinOp(lhs, rhs, out) + + +def max(lhs, rhs, out): + _pto.TMaxOp(lhs, rhs, out) + + def gather(src, out, indices=None, *, mask_pattern=None): if mask_pattern is not None: mask = _pto.MaskPatternAttr.get(getattr(_pto.MaskPattern, mask_pattern)) diff --git a/tests/npu/elementwise_dynamic_multicore/.gitignore b/tests/npu/elementwise_binary_dynamic_multicore/.gitignore similarity index 100% rename from tests/npu/elementwise_dynamic_multicore/.gitignore rename to tests/npu/elementwise_binary_dynamic_multicore/.gitignore diff --git a/tests/npu/elementwise_dynamic_multicore/README.md b/tests/npu/elementwise_binary_dynamic_multicore/README.md similarity index 94% rename from tests/npu/elementwise_dynamic_multicore/README.md rename to tests/npu/elementwise_binary_dynamic_multicore/README.md index afd9c8bc..91a50697 100644 --- a/tests/npu/elementwise_dynamic_multicore/README.md +++ b/tests/npu/elementwise_binary_dynamic_multicore/README.md @@ -41,6 +41,8 @@ pytest test_builder.py -k "test_binary_2d_precision and add-float32" | sub | ✓ | ✓ | ✓ | | mul | ✓ | ✓ | ✓ | | div | ✓ | ✓ | skip | +| min | ✓ | ✓ | ✓ | +| max | ✓ | ✓ | ✓ | ## Compile a kernel manually diff --git a/tests/npu/elementwise_dynamic_multicore/builder.py b/tests/npu/elementwise_binary_dynamic_multicore/binary_builder.py similarity index 100% rename from tests/npu/elementwise_dynamic_multicore/builder.py rename to tests/npu/elementwise_binary_dynamic_multicore/binary_builder.py diff --git a/tests/npu/elementwise_dynamic_multicore/caller.py 
b/tests/npu/elementwise_binary_dynamic_multicore/caller.py similarity index 97% rename from tests/npu/elementwise_dynamic_multicore/caller.py rename to tests/npu/elementwise_binary_dynamic_multicore/caller.py index 9fa70f0b..b9d02196 100644 --- a/tests/npu/elementwise_dynamic_multicore/caller.py +++ b/tests/npu/elementwise_binary_dynamic_multicore/caller.py @@ -9,6 +9,8 @@ "sub": "float", "add": "float", "or": "int16_t", + "max": "float", + "min": "float", } diff --git a/tests/npu/elementwise_dynamic_multicore/clean.sh b/tests/npu/elementwise_binary_dynamic_multicore/clean.sh similarity index 100% rename from tests/npu/elementwise_dynamic_multicore/clean.sh rename to tests/npu/elementwise_binary_dynamic_multicore/clean.sh diff --git a/tests/npu/elementwise_dynamic_multicore/compile.sh b/tests/npu/elementwise_binary_dynamic_multicore/compile.sh similarity index 100% rename from tests/npu/elementwise_dynamic_multicore/compile.sh rename to tests/npu/elementwise_binary_dynamic_multicore/compile.sh diff --git a/tests/npu/elementwise_dynamic_multicore/gen_ir.py b/tests/npu/elementwise_binary_dynamic_multicore/gen_ir.py similarity index 90% rename from tests/npu/elementwise_dynamic_multicore/gen_ir.py rename to tests/npu/elementwise_binary_dynamic_multicore/gen_ir.py index 46c14310..d1c9517c 100644 --- a/tests/npu/elementwise_dynamic_multicore/gen_ir.py +++ b/tests/npu/elementwise_binary_dynamic_multicore/gen_ir.py @@ -9,7 +9,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from ptodsl import tile -from builder import build_binary_kernels +from binary_builder import build_binary_kernels _OPS = { "add": tile.add, @@ -17,6 +17,8 @@ "mul": tile.mul, "div": tile.div, "or": tile.or_, + "max": tile.max, + "min": tile.min, } if __name__ == "__main__": diff --git a/tests/npu/elementwise_dynamic_multicore/test_builder.py b/tests/npu/elementwise_binary_dynamic_multicore/test_binary_builder.py similarity index 98% rename from 
tests/npu/elementwise_dynamic_multicore/test_builder.py rename to tests/npu/elementwise_binary_dynamic_multicore/test_binary_builder.py index ba188107..331f169d 100644 --- a/tests/npu/elementwise_dynamic_multicore/test_builder.py +++ b/tests/npu/elementwise_binary_dynamic_multicore/test_binary_builder.py @@ -17,6 +17,8 @@ ("sub", lambda x, y: x - y), ("mul", lambda x, y: x * y), ("div", lambda x, y: x / y), + ("max", lambda x, y: torch.max(x, y)), + ("min", lambda x, y: torch.min(x, y)), #("or", lambda x, y: x | y), #TODO add back bitwise or test after fixing int16 support in the builder ] From d923ac2ed3c1a2180475c1d279699ea952022e77 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Tue, 10 Mar 2026 17:38:18 +0100 Subject: [PATCH 18/53] Step-by-step optimization guide of dynamic general-shape matmul (#79) * remove unused event id * even simpler single-buffer matmul, compare to double buf perf * fix print * more explain in comments * flatten nested loop * correct flop ratio print * correct print * also compare with non swizzle * re-structure to step-by-step optimization * output artifacts dir * refactor: split IR builder into 4 steps * draft optimization guide * add pto syntax explain * add numpy simulation code for step-1 * print swizzle grid in numpy * rename tutorial directory --------- Co-authored-by: jiawei_zhuang --- .../aot/matmul_swizzle/simple_demo/README.md | 14 - .../simple_demo/bench_matmul.py | 193 ------- .../aot/matmul_swizzle/simple_demo/compile.sh | 31 -- .../simple_demo/simple_matmul_builder.py | 355 ------------- .../.gitignore | 0 .../step_by_step_guide/README.md | 23 + .../step_by_step_guide/bench_matmul.py | 263 ++++++++++ .../caller.cpp | 0 .../step_by_step_guide/common_utils.py | 66 +++ .../step_by_step_guide/compile.sh | 55 ++ .../step_by_step_guide/optimization_guide.md | 489 ++++++++++++++++++ .../run_simple_matmul.py | 21 +- .../simple_matmul_builder.py | 26 + .../single_buffer_matmul.py | 
9 + .../step_by_step_guide/step1_baseline.py | 118 +++++ .../step_by_step_guide/step1_numpy_sim.py | 92 ++++ .../step_by_step_guide/step2_doublebuffer.py | 139 +++++ .../step_by_step_guide/step3_swizzle.py | 140 +++++ .../step3_swizzle_numpy_sim.py | 51 ++ .../step4_manual_pipelining.py | 183 +++++++ 20 files changed, 1671 insertions(+), 597 deletions(-) delete mode 100644 examples/aot/matmul_swizzle/simple_demo/README.md delete mode 100644 examples/aot/matmul_swizzle/simple_demo/bench_matmul.py delete mode 100644 examples/aot/matmul_swizzle/simple_demo/compile.sh delete mode 100644 examples/aot/matmul_swizzle/simple_demo/simple_matmul_builder.py rename examples/aot/matmul_swizzle/{simple_demo => step_by_step_guide}/.gitignore (100%) create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/README.md create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/bench_matmul.py rename examples/aot/matmul_swizzle/{simple_demo => step_by_step_guide}/caller.cpp (100%) create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/common_utils.py create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/compile.sh create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/optimization_guide.md rename examples/aot/matmul_swizzle/{simple_demo => step_by_step_guide}/run_simple_matmul.py (88%) create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/simple_matmul_builder.py create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/single_buffer_matmul.py create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/step1_baseline.py create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/step1_numpy_sim.py create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/step2_doublebuffer.py create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle.py create mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle_numpy_sim.py create mode 100644 
examples/aot/matmul_swizzle/step_by_step_guide/step4_manual_pipelining.py diff --git a/examples/aot/matmul_swizzle/simple_demo/README.md b/examples/aot/matmul_swizzle/simple_demo/README.md deleted file mode 100644 index 323bb056..00000000 --- a/examples/aot/matmul_swizzle/simple_demo/README.md +++ /dev/null @@ -1,14 +0,0 @@ -Usage: - -```bash -bash ./compile.sh -# Run both variants (default) -python ./run_simple_matmul.py - -# Or run a single variant -python ./run_simple_matmul.py --variant auto-sync -python ./run_simple_matmul.py --variant manual-sync - -# Benchmark auto-sync vs manual-sync performance. -python ./bench_matmul.py -``` diff --git a/examples/aot/matmul_swizzle/simple_demo/bench_matmul.py b/examples/aot/matmul_swizzle/simple_demo/bench_matmul.py deleted file mode 100644 index d79f3a3f..00000000 --- a/examples/aot/matmul_swizzle/simple_demo/bench_matmul.py +++ /dev/null @@ -1,193 +0,0 @@ -import argparse -import ctypes -import os -from pathlib import Path - -import torch -import torch_npu # noqa: F401 - -from ptodsl.test_util import get_test_device - - -BLOCK_DIM = 24 -M_LIST = [128 * i for i in range(1, 37, 4)] # 128, ..., 4224 -SHAPES_NK = [ - (4096, 4096), - (8192, 8192), - (16384, 16384), -] -N_WARMUP = 5 -N_REPEAT = 20 - - -def torch_to_ctypes(tensor): - return ctypes.c_void_p(tensor.data_ptr()) - - -def load_lib(lib_path): - lib = ctypes.CDLL(os.path.abspath(lib_path)) - lib.call_kernel.argtypes = [ - ctypes.c_uint32, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - ] - lib.call_kernel.restype = None - - def matmul_abt(a, b, *, block_dim=BLOCK_DIM, stream_ptr=None): - if stream_ptr is None: - stream_ptr = torch.npu.current_stream()._as_parameter_ - - m = int(a.shape[0]) - k = int(a.shape[1]) - n = int(b.shape[0]) - c = torch.empty((m, n), device=a.device, dtype=a.dtype) - lib.call_kernel( - block_dim, - stream_ptr, - torch_to_ctypes(a), - torch_to_ctypes(b), - 
torch_to_ctypes(c), - m, - n, - k, - ) - return c - - return matmul_abt - - -def _parse_int_list(raw): - parts = [p.strip() for p in raw.split(",") if p.strip()] - if not parts: - raise ValueError("List cannot be empty.") - return [int(p) for p in parts] - - -def _time_us(fn, a_list, b_list, warmup, repeat): - for a, b in zip(a_list[:warmup], b_list[:warmup]): - fn(a, b) - torch.npu.synchronize() - - start = torch.npu.Event(enable_timing=True) - end = torch.npu.Event(enable_timing=True) - start.record() - for a, b in zip(a_list[warmup : warmup + repeat], b_list[warmup : warmup + repeat]): - fn(a, b) - end.record() - torch.npu.synchronize() - return start.elapsed_time(end) * 1000.0 / repeat - - -def _parse_args(): - parser = argparse.ArgumentParser( - description="Benchmark simple matmul auto-sync vs manual-sync and report ratios." - ) - parser.add_argument( - "--auto-lib", - type=str, - default="./simple_matmul_auto_sync_kernel.so", - help="Path to auto-sync shared library.", - ) - parser.add_argument( - "--manual-lib", - type=str, - default="./simple_matmul_manual_sync_kernel.so", - help="Path to manual-sync shared library.", - ) - parser.add_argument( - "--m-list", - type=str, - default=",".join(str(m) for m in M_LIST), - help="Comma-separated M values (default: script M_LIST).", - ) - parser.add_argument( - "--warmup", - type=int, - default=N_WARMUP, - help=f"Warmup iterations (default: {N_WARMUP}).", - ) - parser.add_argument( - "--repeat", - type=int, - default=N_REPEAT, - help=f"Timed iterations (default: {N_REPEAT}).", - ) - return parser.parse_args() - - -def main(): - args = _parse_args() - if args.warmup < 1 or args.repeat < 1: - raise ValueError("--warmup and --repeat must be positive integers.") - - base_dir = Path(__file__).resolve().parent - - auto_lib = Path(args.auto_lib) - if not auto_lib.is_absolute(): - auto_lib = base_dir / auto_lib - manual_lib = Path(args.manual_lib) - if not manual_lib.is_absolute(): - manual_lib = base_dir / manual_lib - if 
not auto_lib.exists(): - raise FileNotFoundError(f"Auto-sync library not found: {auto_lib}") - if not manual_lib.exists(): - raise FileNotFoundError(f"Manual-sync library not found: {manual_lib}") - - device = get_test_device() - torch.npu.set_device(device) - torch.manual_seed(0) - - auto_mm = load_lib(str(auto_lib)) - manual_mm = load_lib(str(manual_lib)) - m_list = _parse_int_list(args.m_list) - - ratios = [] - print(f"auto-sync lib: {auto_lib}") - print(f"manual-sync lib: {manual_lib}") - print("") - - for n, k in SHAPES_NK: - print(f"=== N={n}, K={k} ===") - for m in m_list: - alloc = args.warmup + args.repeat - a_list = [torch.randn(m, k, dtype=torch.float16, device=device) for _ in range(alloc)] - b_list = [torch.randn(n, k, dtype=torch.float16, device=device) for _ in range(alloc)] - - auto_us = _time_us(auto_mm, a_list, b_list, args.warmup, args.repeat) - manual_us = _time_us(manual_mm, a_list, b_list, args.warmup, args.repeat) - - flops = 2.0 * m * n * k - auto_tflops = flops / auto_us / 1e6 - manual_tflops = flops / manual_us / 1e6 - auto_vs_manual = manual_us / auto_us - manual_vs_auto = auto_us / manual_us - ratios.append(auto_vs_manual) - - print( - f"(M,N,K)=({m},{n},{k}) " - f"auto={auto_tflops:.3f}TF, manual={manual_tflops:.3f}TF, " - f"ratio(auto/manual)={auto_vs_manual:.3f}x " - f"(manual/auto={manual_vs_auto:.3f}x)" - ) - print("") - - avg_ratio = sum(ratios) / len(ratios) - min_ratio = min(ratios) - max_ratio = max(ratios) - print("=== Summary ===") - print(f"avg ratio(auto/manual): {avg_ratio:.3f}x") - print(f"min ratio(auto/manual): {min_ratio:.3f}x") - print(f"max ratio(auto/manual): {max_ratio:.3f}x") - if avg_ratio >= 1.0: - print(f"auto-sync is faster on average by {(avg_ratio - 1.0) * 100.0:.2f}%") - else: - print(f"manual-sync is faster on average by {(1.0 - avg_ratio) * 100.0:.2f}%") - - -if __name__ == "__main__": - main() diff --git a/examples/aot/matmul_swizzle/simple_demo/compile.sh 
b/examples/aot/matmul_swizzle/simple_demo/compile.sh deleted file mode 100644 index a9343060..00000000 --- a/examples/aot/matmul_swizzle/simple_demo/compile.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -rm -f \ - simple_matmul_auto_sync.pto simple_matmul_manual_sync.pto \ - simple_matmul_auto_sync.cpp simple_matmul_manual_sync.cpp \ - simple_matmul_auto_sync_kernel.so simple_matmul_manual_sync_kernel.so - -# Manual-sync kernel variant: explicit record/wait events in PTO. -python ./simple_matmul_builder.py --manual-sync > simple_matmul_manual_sync.pto -ptoas simple_matmul_manual_sync.pto -o simple_matmul_manual_sync.cpp - -bisheng -fPIC -shared -xcce -O2 -std=c++17 \ - --npu-arch=dav-2201 -DMEMORY_BASE \ - -I"${ASCEND_TOOLKIT_HOME}/include" \ - -DKERNEL_CPP="\"simple_matmul_manual_sync.cpp\"" \ - -DKERNEL_FN=matmul_kernel_ABt \ - ./caller.cpp \ - -o ./simple_matmul_manual_sync_kernel.so - -# Auto-sync kernel variant: no explicit record/wait events in PTO. -python ./simple_matmul_builder.py > simple_matmul_auto_sync.pto -ptoas --enable-insert-sync simple_matmul_auto_sync.pto -o simple_matmul_auto_sync.cpp - -bisheng -fPIC -shared -xcce -O2 -std=c++17 \ - --npu-arch=dav-2201 -DMEMORY_BASE \ - -I"${ASCEND_TOOLKIT_HOME}/include" \ - -DKERNEL_CPP="\"simple_matmul_auto_sync.cpp\"" \ - -DKERNEL_FN=matmul_kernel_ABt_autosync \ - ./caller.cpp \ - -o ./simple_matmul_auto_sync_kernel.so diff --git a/examples/aot/matmul_swizzle/simple_demo/simple_matmul_builder.py b/examples/aot/matmul_swizzle/simple_demo/simple_matmul_builder.py deleted file mode 100644 index 4439d74e..00000000 --- a/examples/aot/matmul_swizzle/simple_demo/simple_matmul_builder.py +++ /dev/null @@ -1,355 +0,0 @@ -import argparse - -from ptodsl import pto, tile, to_ir_module -from ptodsl import scalar as s - -const = s.const - -def build(manual_sync: bool = False): - M_TILE = 128 - K_QTILE = 64 - K_TILE = 256 - K_DTILE = 512 - N_FULL = 256 - # Hard-coded build-time swizzle config 
for this simple demo. - # Direction=1 (NZ), count=5. - SWIZZLE_COUNT = 5 - - def meta_data(): - dtype = pto.float16 - acc_dtype = pto.float32 - ptr_type = pto.PtrType(dtype) - i32 = pto.int32 - tv_2d = pto.TensorType(rank=2, dtype=dtype) - - # TODO: omit shape for `SubTensorType`, can merge into one type https://github.com/zhangstevenunity/PTOAS/issues/31 - tile_view_a = pto.SubTensorType(shape=[M_TILE, K_DTILE], dtype=dtype) - tile_view_b_256 = pto.SubTensorType(shape=[K_TILE, N_FULL], dtype=dtype) - tile_view_c_256 = pto.SubTensorType(shape=[M_TILE, N_FULL], dtype=dtype) - - b_l1_cfg = pto.TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=512) - - tile_buf_a_l1 = pto.TileBufType(shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT") - tile_buf_b_l1_256 = pto.TileBufType(shape=[K_TILE, N_FULL], dtype=dtype, memory_space="MAT", config=b_l1_cfg) - tile_buf_a_l0 = pto.TileBufType(shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT") - tile_buf_b_l0_256 = pto.TileBufType(shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT") - tile_buf_c_256 = pto.TileBufType(shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC") - - return { - "ptr_type": ptr_type, - "i32": i32, - "tv_2d": tv_2d, - "tile_view_a": tile_view_a, - "tile_view_b_256": tile_view_b_256, - "tile_view_c_256": tile_view_c_256, - "tile_buf_a_l1": tile_buf_a_l1, - "tile_buf_b_l1_256": tile_buf_b_l1_256, - "tile_buf_a_l0": tile_buf_a_l0, - "tile_buf_b_l0_256": tile_buf_b_l0_256, - "tile_buf_c_256": tile_buf_c_256 - } - - def swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2): - tile_block_loop = (n_loop + cSwizzleM1) // cSwizzle - tile_block_span = cSwizzle * m_loop - tile_block_idx = li // tile_block_span - in_tile_block_idx = li % tile_block_span - is_last_block = tile_block_idx == (tile_block_loop - c1) - n_col_tail = n_loop - cSwizzle * tile_block_idx - n_col = s.select(is_last_block, n_col_tail, cSwizzle) - m_idx = in_tile_block_idx // n_col - n_idx = 
tile_block_idx * cSwizzle + (in_tile_block_idx % n_col) - odd_block = (tile_block_idx % c2) == c1 - flipped_m_idx = m_loop - m_idx - c1 - m_idx = s.select(odd_block, flipped_m_idx, m_idx) - return m_idx, n_idx - - @to_ir_module(meta_data=meta_data) - def matmul_kernel_ABt( - a_ptr: "ptr_type", - b_ptr: "ptr_type", - c_ptr: "ptr_type", - m_i32: "i32", - n_i32: "i32", - k_i32: "i32" - ) -> None: - with pto.cube_section(): - c0 = const(0) - c1 = const(1) - c2 = const(2) - c128 = const(M_TILE) - c256 = const(N_FULL) - c512 = const(K_DTILE) - - m_total = s.index_cast(m_i32) - n_total = s.index_cast(n_i32) - k_total = s.index_cast(k_i32) - num_blocks = s.index_cast(pto.get_block_num()) - bid = s.index_cast(pto.get_block_idx()) - - n_loop = (n_total + c256 - c1) // c256 - m_loop = m_total // c128 - core_loop = n_loop * m_loop - k_dtile_num = k_total // c512 - cSwizzle = const(SWIZZLE_COUNT) - cSwizzleM1 = cSwizzle - c1 - - tvA = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) - tvB = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") - tvC = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) - - a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] - b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] - a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] - b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] - c_l0 = pto.alloc_tile(tile_buf_c_256) - - pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) - pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1, 2, 3]) - - for li in pto.range(bid, core_loop, num_blocks): - # Build-time fixed swizzle configuration: direction=1 (NZ), count=5. 
- m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2) - m_offset = m_idx * c128 - n_offset = n_idx * c256 - cKT = const(K_TILE) - cKD = const(K_DTILE) - cNT = const(N_FULL) - - not_first_tile = li != bid - with pto.if_context(not_first_tile): - pto.wait_event("STORE_ACC", "MATMUL", event_id=0) - - sv_a0 = pto.slice_view( - tile_view_a, - source=tvA, - offsets=[m_offset, c0], - sizes=[const(M_TILE), cKD], - ) - pto.wait_event("MOV_M2L", "LOAD", event_id=0) - pto.load(sv_a0, a_l1[0]) - pto.record_event("LOAD", "MOV_M2L", event_id=0) - - for k_idx in pto.range(c0, k_dtile_num, c1): - k_offset = k_idx * cKD - - def run_loop_k(curr_id, next_id, a_curr, a_next): - # NOTE: here declare nested function so we can reuse for double-buffering - is_first_k_tile = k_idx == c0 - - for h in range(2): - b_evt = 2 + h - h_off = const(h * K_TILE) - sv_b = pto.slice_view( - tile_view_b_256, - source=tvB, - offsets=[k_offset + h_off, n_offset], - sizes=[cKT, cNT], - ) - - pto.wait_event("MOV_M2L", "LOAD", event_id=b_evt) - pto.load(sv_b, b_l1[h]) - pto.record_event("LOAD", "MOV_M2L", event_id=b_evt) - - for quarter in range(4): - # NOTE: here is native Python loop, treats as build-time loop unrolling - phase = h * 4 + quarter - ping = phase & 1 - a_col = const(phase * K_QTILE) - b_row = const(quarter * K_QTILE) - - pto.wait_event("MATMUL", "MOV_M2L", event_id=ping) - if phase == 0: - pto.wait_event("LOAD", "MOV_M2L", event_id=curr_id) - - tile.extract(a_curr, c0, a_col, a_l0[ping]) - if phase == 7: - pto.record_event("MOV_M2L", "LOAD", event_id=curr_id) - - if quarter == 0: - pto.wait_event("LOAD", "MOV_M2L", event_id=b_evt) - - tile.extract(b_l1[h], b_row, c0, b_l0[ping]) - pto.record_event("MOV_M2L", "MATMUL", event_id=0) - - if quarter == 3: - pto.record_event("MOV_M2L", "LOAD", event_id=b_evt) - - pto.wait_event("MOV_M2L", "MATMUL", event_id=0) - if phase == 0: - pto.cond( - is_first_k_tile, - lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), - lambda: 
tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), - ) - else: - tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) - - pto.record_event("MATMUL", "MOV_M2L", event_id=ping) - - with pto.if_context(k_idx + c1 < k_dtile_num): - sv_a_next = pto.slice_view( - tile_view_a, - source=tvA, - offsets=[m_offset, k_offset + cKD], - sizes=[const(M_TILE), cKD], - ) - pto.wait_event("MOV_M2L", "LOAD", event_id=next_id) - pto.load(sv_a_next, a_next) - pto.record_event("LOAD", "MOV_M2L", event_id=next_id) - - is_curr0 = (k_idx % c2) == c0 - with pto.if_context(is_curr0, has_else=True) as branch: - run_loop_k(0, 1, a_l1[0], a_l1[1]) - with branch.else_context(): - run_loop_k(1, 0, a_l1[1], a_l1[0]) - - sv_c = pto.slice_view( - tile_view_c_256, - source=tvC, - offsets=[m_offset, n_offset], - sizes=[const(M_TILE), cNT], - ) - pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) - pto.store(c_l0, sv_c) - - with pto.if_context(li + num_blocks < core_loop): - pto.record_event("STORE_ACC", "MATMUL", event_id=0) - - pto.wait_event("MOV_M2L", "LOAD", event_id=3) - pto.wait_event("MOV_M2L", "LOAD", event_id=2) - pto.wait_event("MOV_M2L", "LOAD", event_id=1) - pto.wait_event("MOV_M2L", "LOAD", event_id=0) - pto.wait_event("MATMUL", "MOV_M2L", event_id=0) - pto.wait_event("MATMUL", "MOV_M2L", event_id=1) - - @to_ir_module(meta_data=meta_data) - def matmul_kernel_ABt_autosync( - a_ptr: "ptr_type", - b_ptr: "ptr_type", - c_ptr: "ptr_type", - m_i32: "i32", - n_i32: "i32", - k_i32: "i32" - ) -> None: - with pto.cube_section(): - c0 = const(0) - c1 = const(1) - c2 = const(2) - c128 = const(M_TILE) - c256 = const(N_FULL) - c512 = const(K_DTILE) - - m_total = s.index_cast(m_i32) - n_total = s.index_cast(n_i32) - k_total = s.index_cast(k_i32) - num_blocks = s.index_cast(pto.get_block_num()) - bid = s.index_cast(pto.get_block_idx()) - - n_loop = (n_total + c256 - c1) // c256 - m_loop = m_total // c128 - core_loop = n_loop * m_loop - k_dtile_num = k_total // c512 - cSwizzle = const(SWIZZLE_COUNT) 
- cSwizzleM1 = cSwizzle - c1 - - tvA = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) - tvB = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") - tvC = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) - - a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] - b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] - a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] - b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] - c_l0 = pto.alloc_tile(tile_buf_c_256) - - for li in pto.range(bid, core_loop, num_blocks): - # Build-time fixed swizzle configuration: direction=1 (NZ), count=5. - m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2) - m_offset = m_idx * c128 - n_offset = n_idx * c256 - cKT = const(K_TILE) - cKD = const(K_DTILE) - cNT = const(N_FULL) - - sv_a0 = pto.slice_view( - tile_view_a, - source=tvA, - offsets=[m_offset, c0], - sizes=[const(M_TILE), cKD], - ) - pto.load(sv_a0, a_l1[0]) - - for k_idx in pto.range(c0, k_dtile_num, c1): - k_offset = k_idx * cKD - - def run_loop_k(curr_id, next_id, a_curr, a_next): - # NOTE: here declare nested function so we can reuse for double-buffering - is_first_k_tile = k_idx == c0 - - for h in range(2): - h_off = const(h * K_TILE) - sv_b = pto.slice_view( - tile_view_b_256, - source=tvB, - offsets=[k_offset + h_off, n_offset], - sizes=[cKT, cNT], - ) - pto.load(sv_b, b_l1[h]) - - for quarter in range(4): - # NOTE: here is native Python loop, treats as build-time loop unrolling - phase = h * 4 + quarter - ping = phase & 1 - a_col = const(phase * K_QTILE) - b_row = const(quarter * K_QTILE) - - tile.extract(a_curr, c0, a_col, a_l0[ping]) - tile.extract(b_l1[h], b_row, c0, b_l0[ping]) - - if phase == 0: - pto.cond( - is_first_k_tile, - lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), - lambda: 
tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), - ) - else: - tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) - - with pto.if_context(k_idx + c1 < k_dtile_num): - sv_a_next = pto.slice_view( - tile_view_a, - source=tvA, - offsets=[m_offset, k_offset + cKD], - sizes=[const(M_TILE), cKD], - ) - pto.load(sv_a_next, a_next) - - is_curr0 = (k_idx % c2) == c0 - with pto.if_context(is_curr0, has_else=True) as branch: - run_loop_k(0, 1, a_l1[0], a_l1[1]) - with branch.else_context(): - run_loop_k(1, 0, a_l1[1], a_l1[0]) - - sv_c = pto.slice_view( - tile_view_c_256, - source=tvC, - offsets=[m_offset, n_offset], - sizes=[const(M_TILE), cNT], - ) - pto.store(c_l0, sv_c) - - if manual_sync: - return matmul_kernel_ABt - return matmul_kernel_ABt_autosync - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--manual-sync", - action="store_true", - help="Emit explicit record/wait events instead of relying on auto sync insertion.", - ) - args = parser.parse_args() - print(build(manual_sync=args.manual_sync)) diff --git a/examples/aot/matmul_swizzle/simple_demo/.gitignore b/examples/aot/matmul_swizzle/step_by_step_guide/.gitignore similarity index 100% rename from examples/aot/matmul_swizzle/simple_demo/.gitignore rename to examples/aot/matmul_swizzle/step_by_step_guide/.gitignore diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/README.md b/examples/aot/matmul_swizzle/step_by_step_guide/README.md new file mode 100644 index 00000000..425fdc42 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/README.md @@ -0,0 +1,23 @@ +Usage: + +```bash +# Build all tutorial steps +bash ./compile.sh + +# Run correctness on all steps (default) +python ./run_simple_matmul.py + +# Or run one specific tutorial step +python ./run_simple_matmul.py --variant step1-baseline +python ./run_simple_matmul.py --variant step2-doublebuffer +python ./run_simple_matmul.py --variant step3-swizzle +python ./run_simple_matmul.py --variant 
step4-manual-pipelining + +# Stepwise benchmark comparisons: +# Step1: double-buffer vs single-buffer (both non-swizzle, auto-sync) +# Step2: swizzle vs non-swizzle (both double-buffer, auto-sync) +# Step3: manual-sync vs auto-sync (both double-buffer, swizzle) +python ./bench_matmul.py +``` + +See `optimization_guide.md` for full step-by-step build and benchmark commands. diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/bench_matmul.py b/examples/aot/matmul_swizzle/step_by_step_guide/bench_matmul.py new file mode 100644 index 00000000..1aa569c3 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/bench_matmul.py @@ -0,0 +1,263 @@ +import argparse +import ctypes +import os +from pathlib import Path + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +BLOCK_DIM = 24 +M_LIST = [128 * i for i in range(1, 37, 4)] # 128, ..., 4224 +SHAPES_NK = [ + (4096, 4096), + (8192, 8192), + (16384, 16384), +] +N_WARMUP = 5 +N_REPEAT = 20 + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path): + lib = ctypes.CDLL(os.path.abspath(lib_path)) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ] + lib.call_kernel.restype = None + + def matmul_abt(a, b, *, block_dim=BLOCK_DIM, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + + m = int(a.shape[0]) + k = int(a.shape[1]) + n = int(b.shape[0]) + c = torch.empty((m, n), device=a.device, dtype=a.dtype) + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + m, + n, + k, + ) + return c + + return matmul_abt + + +def _parse_int_list(raw): + parts = [p.strip() for p in raw.split(",") if p.strip()] + if not parts: + raise ValueError("List cannot be empty.") + return [int(p) for p in parts] + + 
+def _time_us(fn, a_list, b_list, warmup, repeat): + for a, b in zip(a_list[:warmup], b_list[:warmup]): + fn(a, b) + torch.npu.synchronize() + + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + start.record() + for a, b in zip(a_list[warmup : warmup + repeat], b_list[warmup : warmup + repeat]): + fn(a, b) + end.record() + torch.npu.synchronize() + return start.elapsed_time(end) * 1000.0 / repeat + + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Stepwise performance benchmark for buffering, swizzle, and manual sync." + ) + parser.add_argument( + "--double-auto-swizzle-lib", + type=str, + default="./build_artifacts/step3_swizzle_kernel.so", + help="Path to double-buffer auto-sync swizzled shared library.", + ) + parser.add_argument( + "--double-auto-noswizzle-lib", + type=str, + default="./build_artifacts/step2_doublebuffer_kernel.so", + help="Path to double-buffer auto-sync non-swizzle shared library.", + ) + parser.add_argument( + "--double-manual-swizzle-lib", + type=str, + default="./build_artifacts/step4_manual_pipelining_kernel.so", + help="Path to double-buffer manual-sync swizzled shared library.", + ) + parser.add_argument( + "--single-auto-noswizzle-lib", + type=str, + default="./build_artifacts/step1_baseline_kernel.so", + help="Path to single-buffer auto-sync non-swizzle shared library.", + ) + parser.add_argument( + "--m-list", + type=str, + default=",".join(str(m) for m in M_LIST), + help="Comma-separated M values (default: script M_LIST).", + ) + parser.add_argument( + "--warmup", + type=int, + default=N_WARMUP, + help=f"Warmup iterations (default: {N_WARMUP}).", + ) + parser.add_argument( + "--repeat", + type=int, + default=N_REPEAT, + help=f"Timed iterations (default: {N_REPEAT}).", + ) + return parser.parse_args() + + +def main(): + args = _parse_args() + if args.warmup < 1 or args.repeat < 1: + raise ValueError("--warmup and --repeat must be positive integers.") + + base_dir = 
Path(__file__).resolve().parent + + double_auto_swizzle_lib = Path(args.double_auto_swizzle_lib) + if not double_auto_swizzle_lib.is_absolute(): + double_auto_swizzle_lib = base_dir / double_auto_swizzle_lib + double_auto_noswizzle_lib = Path(args.double_auto_noswizzle_lib) + if not double_auto_noswizzle_lib.is_absolute(): + double_auto_noswizzle_lib = base_dir / double_auto_noswizzle_lib + double_manual_swizzle_lib = Path(args.double_manual_swizzle_lib) + if not double_manual_swizzle_lib.is_absolute(): + double_manual_swizzle_lib = base_dir / double_manual_swizzle_lib + single_auto_noswizzle_lib = Path(args.single_auto_noswizzle_lib) + if not single_auto_noswizzle_lib.is_absolute(): + single_auto_noswizzle_lib = base_dir / single_auto_noswizzle_lib + if not double_auto_swizzle_lib.exists(): + raise FileNotFoundError(f"Double-buffer auto-sync swizzle library not found: {double_auto_swizzle_lib}") + if not double_auto_noswizzle_lib.exists(): + raise FileNotFoundError( + f"Double-buffer auto-sync non-swizzle library not found: {double_auto_noswizzle_lib}" + ) + if not double_manual_swizzle_lib.exists(): + raise FileNotFoundError( + f"Double-buffer manual-sync swizzle library not found: {double_manual_swizzle_lib}" + ) + if not single_auto_noswizzle_lib.exists(): + raise FileNotFoundError( + f"Single-buffer auto-sync non-swizzle library not found: {single_auto_noswizzle_lib}" + ) + + device = get_test_device() + torch.npu.set_device(device) + torch.manual_seed(0) + + double_auto_swizzle_mm = load_lib(str(double_auto_swizzle_lib)) + double_auto_noswizzle_mm = load_lib(str(double_auto_noswizzle_lib)) + double_manual_swizzle_mm = load_lib(str(double_manual_swizzle_lib)) + single_auto_noswizzle_mm = load_lib(str(single_auto_noswizzle_lib)) + m_list = _parse_int_list(args.m_list) + + ratios_step1_double_vs_single_noswizzle = [] + ratios_step2_swizzle_vs_noswizzle = [] + ratios_step3_manual_vs_auto_swizzle = [] + print(f"double-buffer auto-sync swizzle lib: 
{double_auto_swizzle_lib}") + print(f"double-buffer auto-sync non-swizzle lib: {double_auto_noswizzle_lib}") + print(f"double-buffer manual-sync swizzle lib: {double_manual_swizzle_lib}") + print(f"single-buffer auto-sync non-swizzle lib: {single_auto_noswizzle_lib}") + print("") + + for n, k in SHAPES_NK: + print(f"=== N={n}, K={k} ===") + for m in m_list: + alloc = args.warmup + args.repeat + a_list = [torch.randn(m, k, dtype=torch.float16, device=device) for _ in range(alloc)] + b_list = [torch.randn(n, k, dtype=torch.float16, device=device) for _ in range(alloc)] + + double_auto_swizzle_us = _time_us( + double_auto_swizzle_mm, a_list, b_list, args.warmup, args.repeat + ) + double_auto_noswizzle_us = _time_us( + double_auto_noswizzle_mm, a_list, b_list, args.warmup, args.repeat + ) + double_manual_swizzle_us = _time_us( + double_manual_swizzle_mm, a_list, b_list, args.warmup, args.repeat + ) + single_auto_noswizzle_us = _time_us( + single_auto_noswizzle_mm, a_list, b_list, args.warmup, args.repeat + ) + + flops = 2.0 * m * n * k + double_auto_swizzle_tflops = flops / double_auto_swizzle_us / 1e6 + double_auto_noswizzle_tflops = flops / double_auto_noswizzle_us / 1e6 + double_manual_swizzle_tflops = flops / double_manual_swizzle_us / 1e6 + single_auto_noswizzle_tflops = flops / single_auto_noswizzle_us / 1e6 + + # Step 1: buffering effect (double-buffer vs single-buffer, both non-swizzle auto-sync). + step1_double_vs_single = double_auto_noswizzle_tflops / single_auto_noswizzle_tflops + # Step 2: swizzle effect (double-buffer auto-sync swizzle vs non-swizzle). + step2_swizzle_vs_noswizzle = double_auto_swizzle_tflops / double_auto_noswizzle_tflops + # Step 3: manual-sync effect (double-buffer swizzle manual-sync vs auto-sync). 
+ step3_manual_vs_auto = double_manual_swizzle_tflops / double_auto_swizzle_tflops + + ratios_step1_double_vs_single_noswizzle.append(step1_double_vs_single) + ratios_step2_swizzle_vs_noswizzle.append(step2_swizzle_vs_noswizzle) + ratios_step3_manual_vs_auto_swizzle.append(step3_manual_vs_auto) + + print( + f"(M,N,K)=({m},{n},{k}) " + f"single_noswizzle={single_auto_noswizzle_tflops:.3f}TF, " + f"double_noswizzle_auto={double_auto_noswizzle_tflops:.3f}TF, " + f"double_swizzle_auto={double_auto_swizzle_tflops:.3f}TF, " + f"double_swizzle_manual={double_manual_swizzle_tflops:.3f}TF, " + f"step1_ratio(double_noswizzle_auto/single_noswizzle)={step1_double_vs_single:.3f}x, " + f"step2_ratio(double_swizzle_auto/double_noswizzle_auto)={step2_swizzle_vs_noswizzle:.3f}x, " + f"step3_ratio(double_swizzle_manual/double_swizzle_auto)={step3_manual_vs_auto:.3f}x" + ) + print("") + + avg_step1 = sum(ratios_step1_double_vs_single_noswizzle) / len(ratios_step1_double_vs_single_noswizzle) + min_step1 = min(ratios_step1_double_vs_single_noswizzle) + max_step1 = max(ratios_step1_double_vs_single_noswizzle) + avg_step2 = sum(ratios_step2_swizzle_vs_noswizzle) / len(ratios_step2_swizzle_vs_noswizzle) + min_step2 = min(ratios_step2_swizzle_vs_noswizzle) + max_step2 = max(ratios_step2_swizzle_vs_noswizzle) + avg_step3 = sum(ratios_step3_manual_vs_auto_swizzle) / len(ratios_step3_manual_vs_auto_swizzle) + min_step3 = min(ratios_step3_manual_vs_auto_swizzle) + max_step3 = max(ratios_step3_manual_vs_auto_swizzle) + + print("=== Summary ===") + print("Step1 (double-buffer speedup, both non-swizzle auto-sync):") + print(f"avg FLOP ratio(double_noswizzle_auto/single_noswizzle): {avg_step1:.3f}x") + print(f"min FLOP ratio(double_noswizzle_auto/single_noswizzle): {min_step1:.3f}x") + print(f"max FLOP ratio(double_noswizzle_auto/single_noswizzle): {max_step1:.3f}x") + print("Step2 (swizzle speedup, both double-buffer auto-sync):") + print(f"avg FLOP 
ratio(double_swizzle_auto/double_noswizzle_auto): {avg_step2:.3f}x") + print(f"min FLOP ratio(double_swizzle_auto/double_noswizzle_auto): {min_step2:.3f}x") + print(f"max FLOP ratio(double_swizzle_auto/double_noswizzle_auto): {max_step2:.3f}x") + print("Step3 (manual-sync speedup, both double-buffer swizzle):") + print(f"avg FLOP ratio(double_swizzle_manual/double_swizzle_auto): {avg_step3:.3f}x") + print(f"min FLOP ratio(double_swizzle_manual/double_swizzle_auto): {min_step3:.3f}x") + print(f"max FLOP ratio(double_swizzle_manual/double_swizzle_auto): {max_step3:.3f}x") + + +if __name__ == "__main__": + main() diff --git a/examples/aot/matmul_swizzle/simple_demo/caller.cpp b/examples/aot/matmul_swizzle/step_by_step_guide/caller.cpp similarity index 100% rename from examples/aot/matmul_swizzle/simple_demo/caller.cpp rename to examples/aot/matmul_swizzle/step_by_step_guide/caller.cpp diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/common_utils.py b/examples/aot/matmul_swizzle/step_by_step_guide/common_utils.py new file mode 100644 index 00000000..d1c2d6b1 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/common_utils.py @@ -0,0 +1,66 @@ +from ptodsl import pto +from ptodsl import scalar as s + +const = s.const + +M_TILE = 128 +K_QTILE = 64 +K_TILE = 256 +K_DTILE = 512 +N_FULL = 256 +SWIZZLE_COUNT = 5 + + +def build_meta_data(): + def meta_data(): + dtype = pto.float16 + acc_dtype = pto.float32 + ptr_type = pto.PtrType(dtype) + i32 = pto.int32 + tv_2d = pto.TensorType(rank=2, dtype=dtype) + + tile_view_a = pto.SubTensorType(shape=[M_TILE, K_DTILE], dtype=dtype) + tile_view_b_256 = pto.SubTensorType(shape=[K_TILE, N_FULL], dtype=dtype) + tile_view_c_256 = pto.SubTensorType(shape=[M_TILE, N_FULL], dtype=dtype) + + b_l1_cfg = pto.TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=512) + + tile_buf_a_l1 = pto.TileBufType(shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT") + tile_buf_b_l1_256 = pto.TileBufType( + 
shape=[K_TILE, N_FULL], dtype=dtype, memory_space="MAT", config=b_l1_cfg + ) + tile_buf_a_l0 = pto.TileBufType(shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT") + tile_buf_b_l0_256 = pto.TileBufType(shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT") + tile_buf_c_256 = pto.TileBufType(shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC") + + return { + "ptr_type": ptr_type, + "i32": i32, + "tv_2d": tv_2d, + "tile_view_a": tile_view_a, + "tile_view_b_256": tile_view_b_256, + "tile_view_c_256": tile_view_c_256, + "tile_buf_a_l1": tile_buf_a_l1, + "tile_buf_b_l1_256": tile_buf_b_l1_256, + "tile_buf_a_l0": tile_buf_a_l0, + "tile_buf_b_l0_256": tile_buf_b_l0_256, + "tile_buf_c_256": tile_buf_c_256, + } + + return meta_data + + +def swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2): + tile_block_loop = (n_loop + c_swizzle_m1) // c_swizzle + tile_block_span = c_swizzle * m_loop + tile_block_idx = li // tile_block_span + in_tile_block_idx = li % tile_block_span + is_last_block = tile_block_idx == (tile_block_loop - c1) + n_col_tail = n_loop - c_swizzle * tile_block_idx + n_col = s.select(is_last_block, n_col_tail, c_swizzle) + m_idx = in_tile_block_idx // n_col + n_idx = tile_block_idx * c_swizzle + (in_tile_block_idx % n_col) + odd_block = (tile_block_idx % c2) == c1 + flipped_m_idx = m_loop - m_idx - c1 + m_idx = s.select(odd_block, flipped_m_idx, m_idx) + return m_idx, n_idx diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/compile.sh b/examples/aot/matmul_swizzle/step_by_step_guide/compile.sh new file mode 100644 index 00000000..d1a60813 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/compile.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +set -euo pipefail + +ARTIFACT_DIR="./build_artifacts" +mkdir -p "${ARTIFACT_DIR}" + +rm -f "${ARTIFACT_DIR}"/*.pto "${ARTIFACT_DIR}"/*.cpp "${ARTIFACT_DIR}"/*.so + +# Step1 baseline: functionally correct dynamic-shape matmul without optimizations. 
+python ./step1_baseline.py > "${ARTIFACT_DIR}/step1_baseline.pto" +ptoas --enable-insert-sync "${ARTIFACT_DIR}/step1_baseline.pto" -o "${ARTIFACT_DIR}/step1_baseline.cpp" + +bisheng -fPIC -shared -xcce -O2 -std=c++17 \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -I"${ASCEND_TOOLKIT_HOME}/include" \ + -DKERNEL_CPP="\"${ARTIFACT_DIR}/step1_baseline.cpp\"" \ + -DKERNEL_FN=matmul_kernel_step1_baseline \ + ./caller.cpp \ + -o "${ARTIFACT_DIR}/step1_baseline_kernel.so" + +# Step2: double-buffer only (no swizzle, auto-sync). +python ./step2_doublebuffer.py > "${ARTIFACT_DIR}/step2_doublebuffer.pto" +ptoas --enable-insert-sync "${ARTIFACT_DIR}/step2_doublebuffer.pto" -o "${ARTIFACT_DIR}/step2_doublebuffer.cpp" + +bisheng -fPIC -shared -xcce -O2 -std=c++17 \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -I"${ASCEND_TOOLKIT_HOME}/include" \ + -DKERNEL_CPP="\"${ARTIFACT_DIR}/step2_doublebuffer.cpp\"" \ + -DKERNEL_FN=matmul_kernel_ABt_autosync \ + ./caller.cpp \ + -o "${ARTIFACT_DIR}/step2_doublebuffer_kernel.so" + +# Step3: swizzle + double-buffer (auto-sync). +python ./step3_swizzle.py > "${ARTIFACT_DIR}/step3_swizzle.pto" +ptoas --enable-insert-sync "${ARTIFACT_DIR}/step3_swizzle.pto" -o "${ARTIFACT_DIR}/step3_swizzle.cpp" + +bisheng -fPIC -shared -xcce -O2 -std=c++17 \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -I"${ASCEND_TOOLKIT_HOME}/include" \ + -DKERNEL_CPP="\"${ARTIFACT_DIR}/step3_swizzle.cpp\"" \ + -DKERNEL_FN=matmul_kernel_ABt_autosync \ + ./caller.cpp \ + -o "${ARTIFACT_DIR}/step3_swizzle_kernel.so" + +# Step4: swizzle + double-buffer + manual software pipelining. 
+python ./step4_manual_pipelining.py > "${ARTIFACT_DIR}/step4_manual_pipelining.pto" +ptoas "${ARTIFACT_DIR}/step4_manual_pipelining.pto" -o "${ARTIFACT_DIR}/step4_manual_pipelining.cpp" + +bisheng -fPIC -shared -xcce -O2 -std=c++17 \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -I"${ASCEND_TOOLKIT_HOME}/include" \ + -DKERNEL_CPP="\"${ARTIFACT_DIR}/step4_manual_pipelining.cpp\"" \ + -DKERNEL_FN=matmul_kernel_ABt \ + ./caller.cpp \ + -o "${ARTIFACT_DIR}/step4_manual_pipelining_kernel.so" diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/optimization_guide.md b/examples/aot/matmul_swizzle/step_by_step_guide/optimization_guide.md new file mode 100644 index 00000000..34ba6996 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/optimization_guide.md @@ -0,0 +1,489 @@ +# PTO DSL Matmul Optimization Guide (4 Steps) + +This tutorial walks through a practical optimization path for dynamic-shape matmul on NPU using the ptodsl framework. + +The key idea: **keep correctness fixed**, then change only one optimization dimension at a time so each speedup is easy to understand and measure. + +--- + +## 1) Mental Model: What each step changes + +- **Step1 (`step1_baseline.py`)**: functionally correct baseline; simple tile order; single L1 buffers. +- **Step2 (`step2_doublebuffer.py`)**: add double-buffering for A/B tiles (overlap data movement with compute), still linear tile order. +- **Step3 (`step3_swizzle.py`)**: keep double-buffering and add swizzled tile traversal to improve access/balance patterns. +- **Step4 (`step4_manual_pipelining.py`)**: keep step3 algorithm but replace compiler auto-sync with explicit event-driven software pipeline. + +--- + +## 2) Shared Building Blocks (`common_utils.py`) + +All steps share the same tile sizes and metadata; the swizzle helper is used only by the swizzled steps (Step3 and Step4). + +### Why shared utilities matter + +- Keeps step diffs focused on optimization logic. +- Reduces accidental config drift across kernels. +- Makes benchmarking comparisons fair.
+ +### Key shared code + +```python +M_TILE = 128 +K_QTILE = 64 +K_TILE = 256 +K_DTILE = 512 +N_FULL = 256 +SWIZZLE_COUNT = 5 +``` + +```python +def build_meta_data(): + def meta_data(): + dtype = pto.float16 + acc_dtype = pto.float32 + ptr_type = pto.PtrType(dtype) + i32 = pto.int32 + tv_2d = pto.TensorType(rank=2, dtype=dtype) + ... +``` + +```python +def swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2): + tile_block_loop = (n_loop + c_swizzle_m1) // c_swizzle + tile_block_span = c_swizzle * m_loop + tile_block_idx = li // tile_block_span + ... + m_idx = s.select(odd_block, flipped_m_idx, m_idx) + return m_idx, n_idx +``` + +If `swizzle_nz` looks confusing: think of it as remapping the linear tile index `li` into a 2D `(m_idx, n_idx)` traversal order: tiles are grouped into bands of `SWIZZLE_COUNT` adjacent N-tiles (the last band may be narrower), each band is walked row by row across all M-tiles, and odd-numbered bands walk M in reverse, instead of pure row-major tile walking. + +--- + +## 3) Step1 Baseline: Correctness-first kernel + +File: `step1_baseline.py` + +### Algorithm behavior + +- Dynamic shape support from runtime `(m, n, k)` parameters. +- Tiles are visited in plain linear order: + - `m_idx = li // n_loop` + - `n_idx = li % n_loop` +- One L1 tile for A and one L1 tile for B (no ping-pong buffers). +- No explicit pipeline/event synchronization. + +### Important code + +```python +for li in pto.range(bid, core_loop, num_blocks): + m_idx = li // n_loop + n_idx = li % n_loop + m_offset = m_idx * c128 + n_offset = n_idx * c256 +``` + +```python +a_l1 = pto.alloc_tile(tile_buf_a_l1) +b_l1 = pto.alloc_tile(tile_buf_b_l1_256) +... +pto.load(sv_a0, a_l1) +... +pto.load(sv_b, b_l1) +``` + +```python +if phase == 0: + with pto.if_context(is_first_k_tile, has_else=True) as branch: + tile.matmul(a_l0, b_l0, c_l0) + with branch.else_context(): + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) +else: + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) +``` + +### Why this is the baseline + +It is easy to reason about and debug. Every later step should preserve this numerical result.
+ +### NumPy simulation of Step1 (algorithm teaching version) + +The full code is in `step1_numpy_sim.py`. + +Run it directly: + +```bash +python ./step1_numpy_sim.py +``` + +### Line-by-line mapping to `step1_baseline.py` + +- **Loop space construction** + - NumPy: `n_loop`, `m_loop`, `core_loop`, `k_dtile_num` + - ptodsl: same scalar setup in `step1_baseline.py` +- **Core tile traversal** + - NumPy: `for li in range(core_loop)` + - ptodsl: `for li in pto.range(bid, core_loop, num_blocks)` +- **Tile index mapping** + - NumPy: `m_idx = li // n_loop`, `n_idx = li % n_loop` + - ptodsl: same formulas +- **K loop** + - NumPy: `for k_idx in range(k_dtile_num)` + - ptodsl: `for k_idx in pto.range(c0, k_dtile_num, c1)` +- **Phase loop (build-time unrolled in ptodsl)** + - NumPy: `for phase in range(8)` + - ptodsl: same Python loop, used for static unrolling in IR build +- **First-accumulate logic** + - NumPy: `if phase == 0 and is_first_k_tile: c_tile = prod else: c_tile += prod` + - ptodsl: `if phase == 0` + `pto.if_context(is_first_k_tile, has_else=True)` with `matmul` / `matmul_acc` + +### Why `b_l0.T` is needed (and how it maps to ptodsl) + +In this tutorial, `b` is stored as shape `[n, k]`, while `a` is `[m, k]`. + +- NumPy quarter tile: + - `a_l0` shape is `[M_TILE, K_QTILE]` + - `b_l0` shape is `[N_FULL, K_QTILE]` +- To compute output tile `[M_TILE, N_FULL]`, we need: + - `[M_TILE, K_QTILE] @ [K_QTILE, N_FULL]` + - therefore NumPy uses `a_l0 @ b_l0.T` + +In ptodsl, this transpose handling is embedded by the tensor/view layout settings and tile ops: +- `tv_b` is created with `layout="DN"` in `step1_baseline.py` +- `tile.extract(...)` and `tile.matmul(...)` then consume B in the expected orientation for GEMM + +So `b_l0.T` in NumPy is the explicit equivalent of what ptodsl layout + tile pipeline already encodes implicitly. 
+ +### Why accumulate in `float32` + +The original kernel metadata sets: +- input dtype: `float16` +- accumulator dtype: `float32` + +That is why the NumPy simulation casts tile loads to `float32` and keeps `c_tile`/`c` as `float32`. This mirrors: +- `acc_dtype = pto.float32` +- `tile_buf_c_256` using `acc_dtype` + +Using float32 accumulation is important for numerical stability across many partial products (especially large K). + +--- + +## 4) Step2 Double-buffer: overlap movement and compute + +File: `step2_doublebuffer.py` + +### Algorithm delta from Step1 + +- Change single buffers into ping-pong buffers: + - `a_l1 = [buf0, buf1]` + - `b_l1 = [buf0, buf1]` +- Keep tile traversal **non-swizzled** (same simple `m_idx/n_idx` as baseline). +- Keep autosync flow (no explicit manual event schedule in source). + +### Important code + +```python +a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] +b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] +... +is_curr0 = (k_idx % c2) == c0 +with pto.if_context(is_curr0, has_else=True) as branch: + run_loop_k(a_l1[0], a_l1[1]) +with branch.else_context(): + run_loop_k(a_l1[1], a_l1[0]) +``` + +```python +def run_loop_k(a_curr, a_next): + ... + tile.extract(a_curr, c0, a_col, a_l0[ping]) + ... + with pto.if_context(k_idx + c1 < k_dtile_num): + ... + pto.load(sv_a_next, a_next) +``` + +### Why this tends to speed up + +While compute is consuming `a_curr`, the next tile can be prepared into `a_next`, reducing pipeline bubbles. + +Tiny timeline sketch (conceptual): + +```text +Step2 (double-buffer, auto-sync) +time ----> +Load A/B buf0: [====] +Compute buf0: [========] +Load A/B buf1: [====] +Compute buf1: [========] +``` + +--- + +## 5) Step3 Swizzle: improve tile traversal pattern + +File: `step3_swizzle.py` + +### Algorithm delta from Step2 + +- Keep same double-buffer kernel structure. 
+- Only change the mapping from linear loop index `li` to tile coordinates `(m_idx, n_idx)`: + - from linear mapping + - to `swizzle_nz(...)` mapping + +### Important code + +```python +c_swizzle = const(SWIZZLE_COUNT) +c_swizzle_m1 = c_swizzle - c1 +... +m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2) +``` + +Everything else (double-buffer loop body) stays essentially the same as Step2, which makes the Step2 -> Step3 comparison clean. + +### Intuition for new users + +Swizzling does **not change the math**, only the **order in which work is scheduled**. On NPUs, scheduling order can strongly affect memory traffic and utilization. + +### NumPy swizzle mapping demo + +To make swizzle behavior concrete, use `step3_swizzle_numpy_sim.py`. +It prints tile index mapping before/after swizzle for several swizzle factors. + +```bash +python ./step3_swizzle_numpy_sim.py +``` + +Example output format: + +```text +=== swizzle=5, m_loop=4, n_loop=7, core_loop=28 === +li | linear(m,n) -> swizzle(m,n) + 0 | ( 0, 0) -> ( 0, 0) + 1 | ( 0, 1) -> ( 0, 1) + 2 | ( 0, 2) -> ( 0, 2) + ... +``` + +Interpretation: +- `linear(m,n)` is the baseline order (`m_idx = li // n_loop`, `n_idx = li % n_loop`). +- `swizzle(m,n)` is the remapped order used by `swizzle_nz(...)`. +- As you vary `c_swizzle` (2, 3, 5), you can see how traversal shape and direction change, especially near N-tail blocks. +- The script also prints 2D order grids: + - `linear_order_grid[m, n] = li` in baseline traversal + - `swizzle_order_grid[m, n] = li` in swizzled traversal + This gives an intuitive “heatmap-like” view of when each tile is visited. + +--- + +## 6) Step4 Manual Pipelining: explicit software schedule + +File: `step4_manual_pipelining.py` + +### Algorithm delta from Step3 + +- Keep swizzled traversal and double-buffer dataflow.
+- Switch from autosync-style source to explicit event orchestration: + - `record_event(...)` + - `wait_event(...)` + - `record_wait_pair(...)` + +### Important code + +```python +pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) +pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1, 2, 3]) +``` + +```python +pto.wait_event("MOV_M2L", "LOAD", event_id=b_evt) +pto.load(sv_b, b_l1[h]) +pto.record_event("LOAD", "MOV_M2L", event_id=b_evt) +``` + +```python +pto.wait_event("MOV_M2L", "MATMUL", event_id=0) +... +pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) +pto.store(c_l0, sv_c) +``` + +### Why this can help + +Manual scheduling gives tighter control over producer-consumer ordering and overlap. It often improves tail behavior and removes conservative compiler sync points. + +Tiny timeline sketch (conceptual): + +```text +Step4 (manual pipeline with explicit events) +time ----> +LOAD ----record----> MOV_M2L ----record----> MATMUL ----record----> STORE + ^ | | | + |------ wait ----------+------ wait -----------+------ wait --------+ +``` + +--- + +## 7) Build and Run + +### Build all 4 steps + +```bash +bash ./compile.sh +``` + +Artifacts are generated in `build_artifacts/`: +- `step1_baseline_kernel.so` +- `step2_doublebuffer_kernel.so` +- `step3_swizzle_kernel.so` +- `step4_manual_pipelining_kernel.so` + +### Validate correctness + +```bash +python ./run_simple_matmul.py +``` + +Run one step only: + +```bash +python ./run_simple_matmul.py --variant step1-baseline +python ./run_simple_matmul.py --variant step2-doublebuffer +python ./run_simple_matmul.py --variant step3-swizzle +python ./run_simple_matmul.py --variant step4-manual-pipelining +``` + +### Run stepwise benchmark + +```bash +python ./bench_matmul.py +``` + +--- + +## 8) Interpreting benchmark ratios + +The benchmark prints three ratio groups, each labelled by the step it starts from (so the "Step1" group measures step2 relative to step1); a ratio above 1.0x means the later step is faster: + +1. **Step1 ratio**: `step2 / step1` + - isolates gain from double-buffering. +2. **Step2 ratio**: `step3 / step2` + - isolates gain from swizzle. +3.
**Step3 ratio**: `step4 / step3` + - isolates gain from manual software pipelining. + +Reference result: + +```text +=== Summary === +Step1 (double-buffer speedup, both non-swizzle auto-sync): +avg FLOP ratio(double_noswizzle_auto/single_noswizzle): 1.607x +min FLOP ratio(double_noswizzle_auto/single_noswizzle): 0.943x +max FLOP ratio(double_noswizzle_auto/single_noswizzle): 1.826x +Step2 (swizzle speedup, both double-buffer auto-sync): +avg FLOP ratio(double_swizzle_auto/double_noswizzle_auto): 1.227x +min FLOP ratio(double_swizzle_auto/double_noswizzle_auto): 0.863x +max FLOP ratio(double_swizzle_auto/double_noswizzle_auto): 1.871x +Step3 (manual-sync speedup, both double-buffer swizzle): +avg FLOP ratio(double_swizzle_manual/double_swizzle_auto): 1.100x +min FLOP ratio(double_swizzle_manual/double_swizzle_auto): 1.001x +max FLOP ratio(double_swizzle_manual/double_swizzle_auto): 1.173x +``` + +--- + +## 9) Suggested learning path + +- First, run `step1` only and inspect correctness outputs. +- Next, compare `step1` vs `step2` source side by side, focusing on buffer allocation and `run_loop_k`. +- Then inspect only the index mapping change from `step2` to `step3`. +- Finally, study `step4` event dependencies as a timeline (LOAD -> MOV_M2L -> MATMUL -> STORE). + +If you keep this one-change-per-step mindset, it becomes much easier to learn NPU kernel optimization systematically. + +--- + +## Appendix A) ptodsl Syntax for Python Users + +If you are new to ptodsl, the biggest source of confusion is: +- some syntax is **Python control flow** +- some syntax is **IR-builder control flow** + +They look similar, but they execute at different times. + +### Build-time vs run-time cheat sheet + +- **Python `for ... in range(...)`** + - runs when generating the IR (build-time) + - usually acts like compile-time metaprogramming/unrolling +- **`for ... 
in pto.range(...)`** + - emits an MLIR `scf.for` loop + - executes dynamically at kernel run-time +- **Python `if condition:`** + - condition evaluated at build-time by Python + - branch is selected while generating IR +- **`with pto.if_context(cond):` / `pto.cond(...)`** + - emits runtime `scf.if` + - condition is evaluated when kernel runs + +### Example 1: `pto.range` (runtime loop in IR) + +From `step1_baseline.py`: + +```python +for li in pto.range(bid, core_loop, num_blocks): + ... +``` + +This is **not** Python iteration over integers. In ptodsl, `pto.range` is an IR-builder primitive (see `control_flow.py`) that constructs `scf.ForOp` and yields an induction variable value. + +Practical effect: +- loop trip count depends on runtime values like `bid`, `core_loop`, `num_blocks` +- loop stays as a loop in generated IR (not unrolled by Python) + +### Example 2: Python `range` (build-time unrolling) + +From `step1_baseline.py`: + +```python +for phase in range(8): + ... +``` + +This loop is executed by Python while building IR, so it typically creates 8 repeated code regions in IR. + +For readers with C++ background: +- this is conceptually similar to compile-time code generation / metaprogramming +- useful when loop bounds are small constants + +### Example 3: Python `if` vs `pto.if_context` + +From `step1_baseline.py`: + +```python +if phase == 0: + with pto.if_context(is_first_k_tile, has_else=True) as branch: + tile.matmul(a_l0, b_l0, c_l0) + with branch.else_context(): + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) +else: + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) +``` + +How to read this correctly: +- `if phase == 0` is a **Python** branch (build-time), because `phase` is a Python integer from `range(8)`. +- `pto.if_context(is_first_k_tile, ...)` emits a **runtime** branch in IR, because `is_first_k_tile` is a kernel scalar value. 
+ +In plain words: +- first, Python decides which code shape to generate for each unrolled `phase` +- inside that shape, ptodsl inserts dynamic control flow for runtime conditions + +### Rule of thumb + +When in doubt, ask: +1. Is this condition/index a Python value (`int`, `bool`)? + - then it is build-time. +2. Is this a ptodsl scalar/value (`s.*`, kernel arg-derived)? + - then use ptodsl control flow (`pto.range`, `pto.if_context`, `pto.cond`) for runtime behavior. diff --git a/examples/aot/matmul_swizzle/simple_demo/run_simple_matmul.py b/examples/aot/matmul_swizzle/step_by_step_guide/run_simple_matmul.py similarity index 88% rename from examples/aot/matmul_swizzle/simple_demo/run_simple_matmul.py rename to examples/aot/matmul_swizzle/step_by_step_guide/run_simple_matmul.py index 16940b83..b5a7497c 100644 --- a/examples/aot/matmul_swizzle/simple_demo/run_simple_matmul.py +++ b/examples/aot/matmul_swizzle/step_by_step_guide/run_simple_matmul.py @@ -109,7 +109,13 @@ def test_matmul(): parser = argparse.ArgumentParser() parser.add_argument( "--variant", - choices=["auto-sync", "manual-sync", "all"], + choices=[ + "step1-baseline", + "step2-doublebuffer", + "step3-swizzle", + "step4-manual-pipelining", + "all", + ], default="all", help="Which kernel variant to run.", ) @@ -119,11 +125,18 @@ def test_matmul(): torch.npu.set_device(device) variants = { - "auto-sync": "./simple_matmul_auto_sync_kernel.so", - "manual-sync": "./simple_matmul_manual_sync_kernel.so", + "step1-baseline": "./build_artifacts/step1_baseline_kernel.so", + "step2-doublebuffer": "./build_artifacts/step2_doublebuffer_kernel.so", + "step3-swizzle": "./build_artifacts/step3_swizzle_kernel.so", + "step4-manual-pipelining": "./build_artifacts/step4_manual_pipelining_kernel.so", } if args.variant == "all": - selected = [("auto-sync", variants["auto-sync"]), ("manual-sync", variants["manual-sync"])] + selected = [ + ("step1-baseline", variants["step1-baseline"]), + ("step2-doublebuffer", 
variants["step2-doublebuffer"]), + ("step3-swizzle", variants["step3-swizzle"]), + ("step4-manual-pipelining", variants["step4-manual-pipelining"]), + ] else: selected = [(args.variant, variants[args.variant])] diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/simple_matmul_builder.py b/examples/aot/matmul_swizzle/step_by_step_guide/simple_matmul_builder.py new file mode 100644 index 00000000..f3f2fe97 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/simple_matmul_builder.py @@ -0,0 +1,26 @@ +import argparse + +from step2_doublebuffer import build as build_step2 +from step3_swizzle import build as build_step3 +from step4_manual_pipelining import build as build_step4 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--manual-sync", + action="store_true", + help="Emit explicit record/wait events instead of relying on auto sync insertion.", + ) + parser.add_argument( + "--disable-swizzle", + action="store_true", + help="Emit step2 (double-buffer only) instead of swizzled versions.", + ) + args = parser.parse_args() + if args.manual_sync: + print(build_step4()) + elif args.disable_swizzle: + print(build_step2()) + else: + print(build_step3()) diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/single_buffer_matmul.py b/examples/aot/matmul_swizzle/step_by_step_guide/single_buffer_matmul.py new file mode 100644 index 00000000..ef496abb --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/single_buffer_matmul.py @@ -0,0 +1,9 @@ +import argparse + +from step1_baseline import build + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + _ = parser.parse_args() + print(build()) diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step1_baseline.py b/examples/aot/matmul_swizzle/step_by_step_guide/step1_baseline.py new file mode 100644 index 00000000..e16bb91d --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/step1_baseline.py @@ -0,0 +1,118 @@ 
+import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +from common_utils import K_DTILE, K_QTILE, K_TILE, M_TILE, N_FULL, build_meta_data, const + + +def build(): + meta_data = build_meta_data() + + @to_ir_module(meta_data=meta_data) + def matmul_kernel_step1_baseline( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + m_i32: "i32", + n_i32: "i32", + k_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c128 = const(M_TILE) + c256 = const(N_FULL) + c512 = const(K_DTILE) + + m_total = s.index_cast(m_i32) + n_total = s.index_cast(n_i32) + k_total = s.index_cast(k_i32) + num_blocks = s.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) + + n_loop = (n_total + c256 - c1) // c256 + m_loop = m_total // c128 + core_loop = n_loop * m_loop + k_dtile_num = k_total // c512 + + tv_a = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) + tv_b = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") + tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + + a_l1 = pto.alloc_tile(tile_buf_a_l1) + b_l1 = pto.alloc_tile(tile_buf_b_l1_256) + a_l0 = pto.alloc_tile(tile_buf_a_l0) + b_l0 = pto.alloc_tile(tile_buf_b_l0_256) + c_l0 = pto.alloc_tile(tile_buf_c_256) + + for li in pto.range(bid, core_loop, num_blocks): + m_idx = li // n_loop + n_idx = li % n_loop + m_offset = m_idx * c128 + n_offset = n_idx * c256 + c_kt = const(K_TILE) + c_kd = const(K_DTILE) + c_nt = const(N_FULL) + + sv_a0 = pto.slice_view( + tile_view_a, + source=tv_a, + offsets=[m_offset, c0], + sizes=[const(M_TILE), c_kd], + ) + pto.load(sv_a0, a_l1) + + for k_idx in pto.range(c0, k_dtile_num, c1): + k_offset = k_idx * c_kd + is_first_k_tile = k_idx == c0 + + for phase in range(8): + if phase % 4 == 0: + b_half = phase // 4 + h_off = const(b_half * K_TILE) + sv_b = pto.slice_view( + tile_view_b_256, + 
source=tv_b, + offsets=[k_offset + h_off, n_offset], + sizes=[c_kt, c_nt], + ) + pto.load(sv_b, b_l1) + + a_col = const(phase * K_QTILE) + b_row = const((phase % 4) * K_QTILE) + tile.extract(a_l1, c0, a_col, a_l0) + tile.extract(b_l1, b_row, c0, b_l0) + + if phase == 0: + with pto.if_context(is_first_k_tile, has_else=True) as branch: + tile.matmul(a_l0, b_l0, c_l0) + with branch.else_context(): + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) + else: + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) + + with pto.if_context(k_idx + c1 < k_dtile_num): + sv_a_next = pto.slice_view( + tile_view_a, + source=tv_a, + offsets=[m_offset, k_offset + c_kd], + sizes=[const(M_TILE), c_kd], + ) + pto.load(sv_a_next, a_l1) + + sv_c = pto.slice_view( + tile_view_c_256, + source=tv_c, + offsets=[m_offset, n_offset], + sizes=[const(M_TILE), c_nt], + ) + pto.store(c_l0, sv_c) + + return matmul_kernel_step1_baseline + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + _ = parser.parse_args() + print(build()) diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step1_numpy_sim.py b/examples/aot/matmul_swizzle/step_by_step_guide/step1_numpy_sim.py new file mode 100644 index 00000000..2b5d02ab --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/step1_numpy_sim.py @@ -0,0 +1,92 @@ +import numpy as np + +M_TILE = 128 +K_QTILE = 64 +K_TILE = 256 +K_DTILE = 512 +N_FULL = 256 + + +def step1_numpy_sim(a, b): + """ + a: [m, k] float16/float32 + b: [n, k] float16/float32 + returns c: [m, n], equivalent to a @ b.T + """ + m_total, k_total = a.shape + n_total, k_total_b = b.shape + assert k_total == k_total_b + assert m_total % M_TILE == 0, "Step1 kernel uses full M tiles in this demo." + assert k_total % K_DTILE == 0, "Step1 kernel uses full K_DTILE tiles." + assert n_total % N_FULL == 0, "Tutorial simulation assumes full N tiles." 
+ + # Corresponds to: n_loop, m_loop, core_loop, k_dtile_num + n_loop = (n_total + N_FULL - 1) // N_FULL + m_loop = m_total // M_TILE + core_loop = n_loop * m_loop + k_dtile_num = k_total // K_DTILE + + c = np.zeros((m_total, n_total), dtype=np.float32) + + # Corresponds to: for li in pto.range(...) + for li in range(core_loop): + # Corresponds to: m_idx = li // n_loop; n_idx = li % n_loop + m_idx = li // n_loop + n_idx = li % n_loop + m_offset = m_idx * M_TILE + n_offset = n_idx * N_FULL + + # Corresponds to tile accumulator c_l0 + c_tile = np.zeros((M_TILE, N_FULL), dtype=np.float32) + + # Corresponds to: load A tile once before k_idx loop + a_l1 = a[m_offset : m_offset + M_TILE, 0:K_DTILE].astype(np.float32) + + for k_idx in range(k_dtile_num): + k_offset = k_idx * K_DTILE + is_first_k_tile = k_idx == 0 + + # prefetch A tile for current k chunk (equivalent to pto.load) + a_l1 = a[m_offset : m_offset + M_TILE, k_offset : k_offset + K_DTILE].astype(np.float32) + + # Corresponds to: for phase in range(8) + for phase in range(8): + # Corresponds to loading one B half tile every 4 phases + if phase % 4 == 0: + b_half = phase // 4 + h_off = b_half * K_TILE + b_l1 = b[n_offset : n_offset + N_FULL, k_offset + h_off : k_offset + h_off + K_TILE].astype( + np.float32 + ) + + # Corresponds to extract A/B quarter tiles + a_col = phase * K_QTILE + b_row = (phase % 4) * K_QTILE + a_l0 = a_l1[:, a_col : a_col + K_QTILE] + b_l0 = b_l1[:, b_row : b_row + K_QTILE] # [N_FULL, K_QTILE] + + # Corresponds to matmul vs matmul_acc + prod = a_l0 @ b_l0.T + if phase == 0 and is_first_k_tile: + c_tile = prod + else: + c_tile += prod + + c[m_offset : m_offset + M_TILE, n_offset : n_offset + N_FULL] = c_tile + + return c + + +def test_step1_numpy_sim(): + np.random.seed(0) + for m, n, k in [(256, 512, 512), (384, 768, 1024)]: + a = np.random.randn(m, k).astype(np.float16) + b = np.random.randn(n, k).astype(np.float16) + c_ref = a.astype(np.float32) @ b.astype(np.float32).T + c_sim = 
step1_numpy_sim(a, b) + np.testing.assert_allclose(c_sim, c_ref, rtol=1e-4, atol=1e-3) + print("step1_numpy_sim unit test passed") + + +if __name__ == "__main__": + test_step1_numpy_sim() diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step2_doublebuffer.py b/examples/aot/matmul_swizzle/step_by_step_guide/step2_doublebuffer.py new file mode 100644 index 00000000..d7b9ba66 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/step2_doublebuffer.py @@ -0,0 +1,139 @@ +import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +from common_utils import ( + K_DTILE, + K_QTILE, + K_TILE, + M_TILE, + N_FULL, + build_meta_data, + const, +) + + +def build(): + meta_data = build_meta_data() + + @to_ir_module(meta_data=meta_data) + def matmul_kernel_ABt_autosync( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + m_i32: "i32", + n_i32: "i32", + k_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + c128 = const(M_TILE) + c256 = const(N_FULL) + c512 = const(K_DTILE) + + m_total = s.index_cast(m_i32) + n_total = s.index_cast(n_i32) + k_total = s.index_cast(k_i32) + num_blocks = s.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) + + n_loop = (n_total + c256 - c1) // c256 + m_loop = m_total // c128 + core_loop = n_loop * m_loop + k_dtile_num = k_total // c512 + + tv_a = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) + tv_b = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") + tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + + a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] + b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] + a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] + b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), 
pto.alloc_tile(tile_buf_b_l0_256)] + c_l0 = pto.alloc_tile(tile_buf_c_256) + + for li in pto.range(bid, core_loop, num_blocks): + m_idx = li // n_loop + n_idx = li % n_loop + + m_offset = m_idx * c128 + n_offset = n_idx * c256 + c_kt = const(K_TILE) + c_kd = const(K_DTILE) + c_nt = const(N_FULL) + + sv_a0 = pto.slice_view( + tile_view_a, + source=tv_a, + offsets=[m_offset, c0], + sizes=[const(M_TILE), c_kd], + ) + pto.load(sv_a0, a_l1[0]) + + for k_idx in pto.range(c0, k_dtile_num, c1): + k_offset = k_idx * c_kd + + def run_loop_k(a_curr, a_next): + is_first_k_tile = k_idx == c0 + + for h in range(2): + h_off = const(h * K_TILE) + sv_b = pto.slice_view( + tile_view_b_256, + source=tv_b, + offsets=[k_offset + h_off, n_offset], + sizes=[c_kt, c_nt], + ) + pto.load(sv_b, b_l1[h]) + + for quarter in range(4): + phase = h * 4 + quarter + ping = phase & 1 + a_col = const(phase * K_QTILE) + b_row = const(quarter * K_QTILE) + + tile.extract(a_curr, c0, a_col, a_l0[ping]) + tile.extract(b_l1[h], b_row, c0, b_l0[ping]) + + if phase == 0: + pto.cond( + is_first_k_tile, + lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), + lambda: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + ) + else: + tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) + + with pto.if_context(k_idx + c1 < k_dtile_num): + sv_a_next = pto.slice_view( + tile_view_a, + source=tv_a, + offsets=[m_offset, k_offset + c_kd], + sizes=[const(M_TILE), c_kd], + ) + pto.load(sv_a_next, a_next) + + is_curr0 = (k_idx % c2) == c0 + with pto.if_context(is_curr0, has_else=True) as branch: + run_loop_k(a_l1[0], a_l1[1]) + with branch.else_context(): + run_loop_k(a_l1[1], a_l1[0]) + + sv_c = pto.slice_view( + tile_view_c_256, + source=tv_c, + offsets=[m_offset, n_offset], + sizes=[const(M_TILE), c_nt], + ) + pto.store(c_l0, sv_c) + + return matmul_kernel_ABt_autosync + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + _ = parser.parse_args() + print(build()) diff --git 
a/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle.py b/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle.py new file mode 100644 index 00000000..c8f9e102 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle.py @@ -0,0 +1,140 @@ +import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +from common_utils import ( + K_DTILE, + K_QTILE, + K_TILE, + M_TILE, + N_FULL, + SWIZZLE_COUNT, + build_meta_data, + const, + swizzle_nz, +) + +def build(): + meta_data = build_meta_data() + + @to_ir_module(meta_data=meta_data) + def matmul_kernel_ABt_autosync( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + m_i32: "i32", + n_i32: "i32", + k_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + c128 = const(M_TILE) + c256 = const(N_FULL) + c512 = const(K_DTILE) + + m_total = s.index_cast(m_i32) + n_total = s.index_cast(n_i32) + k_total = s.index_cast(k_i32) + num_blocks = s.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) + + n_loop = (n_total + c256 - c1) // c256 + m_loop = m_total // c128 + core_loop = n_loop * m_loop + k_dtile_num = k_total // c512 + c_swizzle = const(SWIZZLE_COUNT) + c_swizzle_m1 = c_swizzle - c1 + + tv_a = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) + tv_b = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") + tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + + a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] + b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] + a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] + b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] + c_l0 = pto.alloc_tile(tile_buf_c_256) + + for li in pto.range(bid, core_loop, num_blocks): + m_idx, n_idx = 
swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2) + m_offset = m_idx * c128 + n_offset = n_idx * c256 + c_kt = const(K_TILE) + c_kd = const(K_DTILE) + c_nt = const(N_FULL) + + sv_a0 = pto.slice_view( + tile_view_a, + source=tv_a, + offsets=[m_offset, c0], + sizes=[const(M_TILE), c_kd], + ) + pto.load(sv_a0, a_l1[0]) + + for k_idx in pto.range(c0, k_dtile_num, c1): + k_offset = k_idx * c_kd + + def run_loop_k(a_curr, a_next): + is_first_k_tile = k_idx == c0 + + for h in range(2): + h_off = const(h * K_TILE) + sv_b = pto.slice_view( + tile_view_b_256, + source=tv_b, + offsets=[k_offset + h_off, n_offset], + sizes=[c_kt, c_nt], + ) + pto.load(sv_b, b_l1[h]) + + for quarter in range(4): + phase = h * 4 + quarter + ping = phase & 1 + a_col = const(phase * K_QTILE) + b_row = const(quarter * K_QTILE) + + tile.extract(a_curr, c0, a_col, a_l0[ping]) + tile.extract(b_l1[h], b_row, c0, b_l0[ping]) + + if phase == 0: + pto.cond( + is_first_k_tile, + lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), + lambda: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + ) + else: + tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) + + with pto.if_context(k_idx + c1 < k_dtile_num): + sv_a_next = pto.slice_view( + tile_view_a, + source=tv_a, + offsets=[m_offset, k_offset + c_kd], + sizes=[const(M_TILE), c_kd], + ) + pto.load(sv_a_next, a_next) + + is_curr0 = (k_idx % c2) == c0 + with pto.if_context(is_curr0, has_else=True) as branch: + run_loop_k(a_l1[0], a_l1[1]) + with branch.else_context(): + run_loop_k(a_l1[1], a_l1[0]) + + sv_c = pto.slice_view( + tile_view_c_256, + source=tv_c, + offsets=[m_offset, n_offset], + sizes=[const(M_TILE), c_nt], + ) + pto.store(c_l0, sv_c) + + return matmul_kernel_ABt_autosync + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + _ = parser.parse_args() + print(build()) diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle_numpy_sim.py 
b/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle_numpy_sim.py new file mode 100644 index 00000000..82b19ff3 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle_numpy_sim.py @@ -0,0 +1,51 @@ +import numpy as np + + +def swizzle_nz_py(li, m_loop, n_loop, c_swizzle, c1=1, c2=2): + c_swizzle_m1 = c_swizzle - c1 + tile_block_loop = (n_loop + c_swizzle_m1) // c_swizzle + tile_block_span = c_swizzle * m_loop + tile_block_idx = li // tile_block_span + in_tile_block_idx = li % tile_block_span + is_last_block = tile_block_idx == (tile_block_loop - c1) + n_col_tail = n_loop - c_swizzle * tile_block_idx + n_col = n_col_tail if is_last_block else c_swizzle + m_idx = in_tile_block_idx // n_col + n_idx = tile_block_idx * c_swizzle + (in_tile_block_idx % n_col) + odd_block = (tile_block_idx % c2) == c1 + if odd_block: + m_idx = m_loop - m_idx - c1 + return m_idx, n_idx + + +def show_mapping(m_loop, n_loop, c_swizzle, preview=24): + core_loop = m_loop * n_loop + rows = [] + linear_order_grid = np.full((m_loop, n_loop), -1, dtype=np.int32) + swizzle_order_grid = np.full((m_loop, n_loop), -1, dtype=np.int32) + for li in range(min(core_loop, preview)): + m_linear = li // n_loop + n_linear = li % n_loop + m_swz, n_swz = swizzle_nz_py(li, m_loop, n_loop, c_swizzle) + linear_order_grid[m_linear, n_linear] = li + swizzle_order_grid[m_swz, n_swz] = li + rows.append((li, m_linear, n_linear, m_swz, n_swz)) + + arr = np.array(rows, dtype=np.int32) + print(f"\n=== swizzle={c_swizzle}, m_loop={m_loop}, n_loop={n_loop}, core_loop={core_loop} ===") + print("li | linear(m,n) -> swizzle(m,n)") + for li, ml, nl, ms, ns in arr: + print(f"{li:2d} | ({ml:2d},{nl:2d}) -> ({ms:2d},{ns:2d})") + + print("\nLinear traversal order grid (value = li):") + print(linear_order_grid) + print("\nSwizzled traversal order grid (value = li):") + print(swizzle_order_grid) + + +if __name__ == "__main__": + # Use a non-multiple n_loop to demonstrate tail handling. 
+ m_loop = 4 + n_loop = 7 + for c_swizzle in [2, 3, 5]: + show_mapping(m_loop=m_loop, n_loop=n_loop, c_swizzle=c_swizzle, preview=28) diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step4_manual_pipelining.py b/examples/aot/matmul_swizzle/step_by_step_guide/step4_manual_pipelining.py new file mode 100644 index 00000000..38033523 --- /dev/null +++ b/examples/aot/matmul_swizzle/step_by_step_guide/step4_manual_pipelining.py @@ -0,0 +1,183 @@ +import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +from common_utils import ( + K_DTILE, + K_QTILE, + K_TILE, + M_TILE, + N_FULL, + SWIZZLE_COUNT, + build_meta_data, + const, + swizzle_nz, +) + +def build(): + meta_data = build_meta_data() + + @to_ir_module(meta_data=meta_data) + def matmul_kernel_ABt( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + m_i32: "i32", + n_i32: "i32", + k_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + c128 = const(M_TILE) + c256 = const(N_FULL) + c512 = const(K_DTILE) + + m_total = s.index_cast(m_i32) + n_total = s.index_cast(n_i32) + k_total = s.index_cast(k_i32) + num_blocks = s.index_cast(pto.get_block_num()) + bid = s.index_cast(pto.get_block_idx()) + + n_loop = (n_total + c256 - c1) // c256 + m_loop = m_total // c128 + core_loop = n_loop * m_loop + k_dtile_num = k_total // c512 + c_swizzle = const(SWIZZLE_COUNT) + c_swizzle_m1 = c_swizzle - c1 + + tv_a = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) + tv_b = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") + tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + + a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] + b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] + a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] + b_l0 = 
[pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] + c_l0 = pto.alloc_tile(tile_buf_c_256) + + pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) + pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1, 2, 3]) + + for li in pto.range(bid, core_loop, num_blocks): + m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2) + m_offset = m_idx * c128 + n_offset = n_idx * c256 + c_kt = const(K_TILE) + c_kd = const(K_DTILE) + c_nt = const(N_FULL) + + not_first_tile = li != bid + with pto.if_context(not_first_tile): + pto.wait_event("STORE_ACC", "MATMUL", event_id=0) + + sv_a0 = pto.slice_view( + tile_view_a, + source=tv_a, + offsets=[m_offset, c0], + sizes=[const(M_TILE), c_kd], + ) + pto.wait_event("MOV_M2L", "LOAD", event_id=0) + pto.load(sv_a0, a_l1[0]) + pto.record_event("LOAD", "MOV_M2L", event_id=0) + + for k_idx in pto.range(c0, k_dtile_num, c1): + k_offset = k_idx * c_kd + + def run_loop_k(curr_id, next_id, a_curr, a_next): + is_first_k_tile = k_idx == c0 + + for h in range(2): + b_evt = 2 + h + h_off = const(h * K_TILE) + sv_b = pto.slice_view( + tile_view_b_256, + source=tv_b, + offsets=[k_offset + h_off, n_offset], + sizes=[c_kt, c_nt], + ) + + pto.wait_event("MOV_M2L", "LOAD", event_id=b_evt) + pto.load(sv_b, b_l1[h]) + pto.record_event("LOAD", "MOV_M2L", event_id=b_evt) + + for quarter in range(4): + phase = h * 4 + quarter + ping = phase & 1 + a_col = const(phase * K_QTILE) + b_row = const(quarter * K_QTILE) + + pto.wait_event("MATMUL", "MOV_M2L", event_id=ping) + if phase == 0: + pto.wait_event("LOAD", "MOV_M2L", event_id=curr_id) + + tile.extract(a_curr, c0, a_col, a_l0[ping]) + if phase == 7: + pto.record_event("MOV_M2L", "LOAD", event_id=curr_id) + + if quarter == 0: + pto.wait_event("LOAD", "MOV_M2L", event_id=b_evt) + + tile.extract(b_l1[h], b_row, c0, b_l0[ping]) + pto.record_event("MOV_M2L", "MATMUL", event_id=0) + + if quarter == 3: + pto.record_event("MOV_M2L", "LOAD", event_id=b_evt) + + 
pto.wait_event("MOV_M2L", "MATMUL", event_id=0) + if phase == 0: + pto.cond( + is_first_k_tile, + lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), + lambda: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + ) + else: + tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) + + pto.record_event("MATMUL", "MOV_M2L", event_id=ping) + + with pto.if_context(k_idx + c1 < k_dtile_num): + sv_a_next = pto.slice_view( + tile_view_a, + source=tv_a, + offsets=[m_offset, k_offset + c_kd], + sizes=[const(M_TILE), c_kd], + ) + pto.wait_event("MOV_M2L", "LOAD", event_id=next_id) + pto.load(sv_a_next, a_next) + pto.record_event("LOAD", "MOV_M2L", event_id=next_id) + + is_curr0 = (k_idx % c2) == c0 + with pto.if_context(is_curr0, has_else=True) as branch: + run_loop_k(0, 1, a_l1[0], a_l1[1]) + with branch.else_context(): + run_loop_k(1, 0, a_l1[1], a_l1[0]) + + sv_c = pto.slice_view( + tile_view_c_256, + source=tv_c, + offsets=[m_offset, n_offset], + sizes=[const(M_TILE), c_nt], + ) + pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) + pto.store(c_l0, sv_c) + + with pto.if_context(li + num_blocks < core_loop): + pto.record_event("STORE_ACC", "MATMUL", event_id=0) + + pto.wait_event("MOV_M2L", "LOAD", event_id=3) + pto.wait_event("MOV_M2L", "LOAD", event_id=2) + pto.wait_event("MOV_M2L", "LOAD", event_id=1) + pto.wait_event("MOV_M2L", "LOAD", event_id=0) + pto.wait_event("MATMUL", "MOV_M2L", event_id=0) + pto.wait_event("MATMUL", "MOV_M2L", event_id=1) + + return matmul_kernel_ABt + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + _ = parser.parse_args() + print(build()) From 9a41ef47376d497e2a8f16ddabc860f6dd688a33 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Tue, 10 Mar 2026 17:32:35 +0000 Subject: [PATCH 19/53] added colsum dynamic multicore test and merged to rowsum --- ptodsl/api/tile.py | 6 + tests/npu/rowsum_dynamic_multicore/caller.py | 31 -- tests/npu/rowsum_dynamic_multicore/compile.sh | 29 -- tests/npu/rowsum_dynamic_multicore/gen_ir.py | 
14 - .../rowsum_builder.py | 129 -------- .../rowsum_dynamic_multicore/test_rowsum.py | 74 ----- tests/npu/sum_dynamic_multicore/caller.py | 33 ++ tests/npu/sum_dynamic_multicore/compile.sh | 34 +++ tests/npu/sum_dynamic_multicore/gen_ir.py | 20 ++ .../npu/sum_dynamic_multicore/sum_builder.py | 285 ++++++++++++++++++ tests/npu/sum_dynamic_multicore/test_sum.py | 135 +++++++++ 11 files changed, 513 insertions(+), 277 deletions(-) delete mode 100644 tests/npu/rowsum_dynamic_multicore/caller.py delete mode 100755 tests/npu/rowsum_dynamic_multicore/compile.sh delete mode 100644 tests/npu/rowsum_dynamic_multicore/gen_ir.py delete mode 100644 tests/npu/rowsum_dynamic_multicore/rowsum_builder.py delete mode 100644 tests/npu/rowsum_dynamic_multicore/test_rowsum.py create mode 100644 tests/npu/sum_dynamic_multicore/caller.py create mode 100755 tests/npu/sum_dynamic_multicore/compile.sh create mode 100644 tests/npu/sum_dynamic_multicore/gen_ir.py create mode 100644 tests/npu/sum_dynamic_multicore/sum_builder.py create mode 100644 tests/npu/sum_dynamic_multicore/test_sum.py diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index f15650fe..d1152eb0 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -1,4 +1,5 @@ from mlir.dialects import pto as _pto +from mlir.ir import BoolAttr from .scalar import _unwrap @@ -91,6 +92,10 @@ def row_sum(src, tmp, dst): _pto.TRowSumOp(src=src, tmp=tmp, dst=dst) +def col_sum(src, tmp, dst, is_binary=True): + _pto.TColSumOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) + + def subset(source, offsets, sizes): offset_vals = [_unwrap(v) for v in offsets] return _pto.subset(source, offset_vals, sizes) @@ -120,5 +125,6 @@ def print(source): "matmul_acc", "extract", "row_sum", + "col_sum", "subset", ] \ No newline at end of file diff --git a/tests/npu/rowsum_dynamic_multicore/caller.py b/tests/npu/rowsum_dynamic_multicore/caller.py deleted file mode 100644 index d62967ea..00000000 --- 
a/tests/npu/rowsum_dynamic_multicore/caller.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Generate caller.cpp for the dynamic multicore rowsum kernel (fp32). - -Usage: python caller.py -""" - -_BLOCK_DIM = 24 - - -def generate_caller(): - return f"""\ -#include "rowsum.cpp" - -extern "C" void call_kernel( - uint32_t blockDim, - void *stream, - uint8_t *x, - uint8_t *y, - uint32_t batch, - uint32_t n_cols) -{{ - _kernel<<>>( - reinterpret_cast(x), - reinterpret_cast(y), - static_cast(batch), - static_cast(n_cols)); -}} -""" - - -if __name__ == "__main__": - print(generate_caller()) diff --git a/tests/npu/rowsum_dynamic_multicore/compile.sh b/tests/npu/rowsum_dynamic_multicore/compile.sh deleted file mode 100755 index 4b20495c..00000000 --- a/tests/npu/rowsum_dynamic_multicore/compile.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -TMP=$(mktemp -d) -trap "rm -rf $TMP" EXIT - -python "$SCRIPT_DIR/gen_ir.py" > "$TMP/rowsum.pto" -ptoas --enable-insert-sync "$TMP/rowsum.pto" -o "$TMP/rowsum.cpp" - -python "$SCRIPT_DIR/caller.py" > "$TMP/caller.cpp" - -bisheng \ - -I${ASCEND_TOOLKIT_HOME}/include \ - -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ - -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ - -xcce -Xhost-start -Xhost-end \ - -mllvm -cce-aicore-stack-size=0x8000 \ - -mllvm -cce-aicore-function-stack-size=0x8000 \ - -mllvm -cce-aicore-record-overflow=true \ - -mllvm -cce-aicore-addr-transform \ - -mllvm -cce-aicore-dcci-insert-for-scalar=false \ - --npu-arch=dav-2201 -DMEMORY_BASE \ - -std=gnu++17 \ - "$TMP/caller.cpp" \ - -o "$SCRIPT_DIR/rowsum_lib.so" - -echo "Built rowsum_lib.so successfully." 
diff --git a/tests/npu/rowsum_dynamic_multicore/gen_ir.py b/tests/npu/rowsum_dynamic_multicore/gen_ir.py deleted file mode 100644 index 5e903352..00000000 --- a/tests/npu/rowsum_dynamic_multicore/gen_ir.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Print MLIR IR for the dynamic multicore rowsum kernel (fp32). - -Usage: python gen_ir.py -""" - -import os -import sys - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -from rowsum_builder import build_rowsum - -if __name__ == "__main__": - print(build_rowsum(dtype="fp32")) diff --git a/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py b/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py deleted file mode 100644 index 5ea4338f..00000000 --- a/tests/npu/rowsum_dynamic_multicore/rowsum_builder.py +++ /dev/null @@ -1,129 +0,0 @@ -from ptodsl import pto, tile, to_ir_module -from ptodsl import scalar as s - -const = s.const - -# 32 KB of UB -_TILE_SIZE_BYTES = 32 * 1024 -_DTYPE_BYTES = {"fp16": 2, "fp32": 4} - - -def meta_data(dtype="fp32"): - pto_dtype = {"fp16": pto.float16, "fp32": pto.float32}[dtype] - elements_per_tile = _TILE_SIZE_BYTES // _DTYPE_BYTES[dtype] - ptr_type = pto.PtrType(pto_dtype) - index_dtype = pto.int32 - - tensor_type = pto.TensorType(rank=1, dtype=pto_dtype) - subtensor_in = pto.SubTensorType(shape=[1, elements_per_tile], dtype=pto_dtype) - - tile_cfg = pto.TileBufConfig() - tile_type = pto.TileBufType( - shape=[1, elements_per_tile], - valid_shape=[1, -1], - dtype=pto_dtype, - memory_space="VEC", - config=tile_cfg, - ) - - return { - "ptr_type": ptr_type, - "pto_dtype": pto_dtype, - "elements_per_tile": elements_per_tile, - "index_dtype": index_dtype, - "tensor_type": tensor_type, - "subtensor_in": subtensor_in, - "tile_type": tile_type, - } - - -def build_rowsum(fn_name="rowsum_fp32", dtype="fp32"): - """ - Computes per-row sum across columns using PTO TROWSUM (`tile.row_sum` wrapper). 
- - Args: - x_ptr : dtype[batch * n_cols] input matrix flattened row-major - y_ptr : dtype[batch] output vector, one sum per row - batch : int32 - n_cols: int32 (<= elements_per_tile) - - Semantics: - y[row] = sum_{j=0..n_cols-1} x[row, j] - """ - _meta_data = lambda: meta_data(dtype=dtype) - - @to_ir_module(meta_data=_meta_data) - def _kernel( - x_ptr: "ptr_type", - y_ptr: "ptr_type", - batch_i32: "index_dtype", - n_cols_i32: "index_dtype", - ) -> None: - c0 = const(0) - c1 = const(1) - - batch = s.index_cast(batch_i32) - n_cols = s.index_cast(n_cols_i32) - - with pto.vector_section(): - bid = s.index_cast(pto.get_block_idx()) - num_cores = s.index_cast(pto.get_block_num()) - - rows_per_core = s.ceil_div(batch, num_cores) - row_start = bid * rows_per_core - row_end = s.min_u(row_start + rows_per_core, batch) - num_rows = row_end - row_start - - total_elems = batch * n_cols - tv_x = pto.as_tensor( - tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] - ) - tv_y = pto.as_tensor( - tensor_type, ptr=y_ptr, shape=[batch], strides=[c1] - ) - - with pto.if_context(num_rows > c0): - tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) - tb_sum = pto.alloc_tile( - tile_type, valid_col=c1 - ) # scalar output - tb_tmp = pto.alloc_tile( - tile_type, valid_col=n_cols - ) # scratch - - for r in pto.range(c0, num_rows, c1): - gm_offset = (row_start + r) * n_cols - - sv_x = pto.slice_view( - subtensor_in, - source=tv_x, - offsets=[gm_offset], - sizes=[n_cols], - ) - - # y is a vector of length batch; write one element per row - sv_y = pto.slice_view( - subtensor_in, - source=tv_y, - offsets=[row_start + r], - sizes=[c1], - ) - - pto.load(sv_x, tb_x) - tile.row_sum(tb_x, tb_tmp, tb_sum) - - # Store the 1-element tile to y[row] - pto.store(tb_sum, sv_y) - - _ = fn_name - return _kernel - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("--fn-name", default="rowsum_fp32") - parser.add_argument("--dtype", 
choices=["fp16", "fp32"], default="fp32") - args = parser.parse_args() - print(build_rowsum(fn_name=args.fn_name, dtype=args.dtype)) diff --git a/tests/npu/rowsum_dynamic_multicore/test_rowsum.py b/tests/npu/rowsum_dynamic_multicore/test_rowsum.py deleted file mode 100644 index e8f3c2fc..00000000 --- a/tests/npu/rowsum_dynamic_multicore/test_rowsum.py +++ /dev/null @@ -1,74 +0,0 @@ -import ctypes -import os -import subprocess - -import pytest -import torch - -from ptodsl.test_util import get_test_device - -torch.manual_seed(0) - -_DIR = os.path.dirname(os.path.abspath(__file__)) -_DEVICE = get_test_device() -_LIB_PATH = os.path.join(_DIR, "rowsum_lib.so") -_BLOCK_DIM = 24 - -_BATCH_LIST = [1, 7, 29, 32, 65, 200] -_N_COLS_LIST = [128, 256, 512, 1024, 2048, 4096, 8192] - -_SHAPE_PARAMS = [ - pytest.param(batch, n_cols, id=f"batch{batch}-cols{n_cols}") - for batch in _BATCH_LIST - for n_cols in _N_COLS_LIST -] - - -@pytest.fixture(scope="session") -def compiled_rowsum(): - subprocess.check_call(["bash", os.path.join(_DIR, "compile.sh")], cwd=_DIR) - yield - os.remove(_LIB_PATH) - - -def test_build_rowsum(compiled_rowsum): - assert os.path.exists(_LIB_PATH) - - -@pytest.mark.require_npu -@pytest.mark.parametrize("batch, n_cols", _SHAPE_PARAMS) -def test_rowsum_precision(compiled_rowsum, batch, n_cols): - import torch_npu # noqa: F401 - - lib = ctypes.CDLL(_LIB_PATH) - lib.call_kernel.argtypes = [ - ctypes.c_uint32, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_uint32, - ctypes.c_uint32, - ] - lib.call_kernel.restype = None - - torch.npu.set_device(_DEVICE) - x = torch.randn(batch, n_cols, device=_DEVICE, dtype=torch.float32) - y = torch.empty(batch, device=_DEVICE, dtype=torch.float32) - - y_ref = x.float().sum(dim=-1) - stream_ptr = torch.npu.current_stream()._as_parameter_ - lib.call_kernel( - ctypes.c_uint32(_BLOCK_DIM), - stream_ptr, - ctypes.c_void_p(x.data_ptr()), - ctypes.c_void_p(y.data_ptr()), - ctypes.c_uint32(batch), - 
ctypes.c_uint32(n_cols), - ) - torch.npu.synchronize() - - torch.testing.assert_close(y, y_ref, atol=1e-4, rtol=0) - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) diff --git a/tests/npu/sum_dynamic_multicore/caller.py b/tests/npu/sum_dynamic_multicore/caller.py new file mode 100644 index 00000000..180fe150 --- /dev/null +++ b/tests/npu/sum_dynamic_multicore/caller.py @@ -0,0 +1,33 @@ +"""Generate caller.cpp for a dynamic multicore rowsum or colsum kernel (fp32). + +Usage: python caller.py --mode row|col +""" + +def generate_caller(mode): + return f"""\ +#include "{mode}sum.cpp" + +extern "C" void call_{mode}sum( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint32_t batch, + uint32_t n_cols) +{{ + _kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(y), + static_cast(batch), + static_cast(n_cols)); +}} +""" + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["row", "col"], required=True) + args = parser.parse_args() + print(generate_caller(args.mode)) diff --git a/tests/npu/sum_dynamic_multicore/compile.sh b/tests/npu/sum_dynamic_multicore/compile.sh new file mode 100755 index 00000000..8cd4014e --- /dev/null +++ b/tests/npu/sum_dynamic_multicore/compile.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +TMP=$(mktemp -d) +trap "rm -rf \"$TMP\"" EXIT + +BISHENG_FLAGS=( + -I${ASCEND_TOOLKIT_HOME}/include + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong + -xcce -Xhost-start -Xhost-end + -mllvm -cce-aicore-stack-size=0x8000 + -mllvm -cce-aicore-function-stack-size=0x8000 + -mllvm -cce-aicore-record-overflow=true + -mllvm -cce-aicore-addr-transform + -mllvm -cce-aicore-dcci-insert-for-scalar=false + --npu-arch=dav-2201 -DMEMORY_BASE + -std=gnu++17 +) + +for MODE in row col; do + python "$SCRIPT_DIR/gen_ir.py" --mode 
"$MODE" > "$TMP/${MODE}sum.pto" + ptoas --enable-insert-sync "$TMP/${MODE}sum.pto" -o "$TMP/${MODE}sum.cpp" + + python "$SCRIPT_DIR/caller.py" --mode "$MODE" > "$TMP/${MODE}sum_caller.cpp" + + bisheng "${BISHENG_FLAGS[@]}" \ + "$TMP/${MODE}sum_caller.cpp" \ + -o "$SCRIPT_DIR/${MODE}sum_lib.so" + + echo "Built ${MODE}sum_lib.so successfully." +done diff --git a/tests/npu/sum_dynamic_multicore/gen_ir.py b/tests/npu/sum_dynamic_multicore/gen_ir.py new file mode 100644 index 00000000..f9b0e5cd --- /dev/null +++ b/tests/npu/sum_dynamic_multicore/gen_ir.py @@ -0,0 +1,20 @@ +"""Print MLIR IR for the dynamic multicore rowsum or colsum kernel (fp32). + +Usage: python gen_ir.py --mode row|col +""" + +import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from sum_builder import build_colsum, build_rowsum + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["row", "col"], default="row") + parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") + args = parser.parse_args() + builder = build_rowsum if args.mode == "row" else build_colsum + print(builder(dtype=args.dtype)) diff --git a/tests/npu/sum_dynamic_multicore/sum_builder.py b/tests/npu/sum_dynamic_multicore/sum_builder.py new file mode 100644 index 00000000..f751f210 --- /dev/null +++ b/tests/npu/sum_dynamic_multicore/sum_builder.py @@ -0,0 +1,285 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +# 32 KB of UB +_TILE_SIZE_BYTES = 32 * 1024 +_DTYPE_BYTES = {"fp16": 2, "fp32": 4} + + +def meta_data_row(dtype="fp32"): + pto_dtype = {"fp16": pto.float16, "fp32": pto.float32}[dtype] + elements_per_tile = _TILE_SIZE_BYTES // _DTYPE_BYTES[dtype] + ptr_type = pto.PtrType(pto_dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=pto_dtype) + subtensor_in = pto.SubTensorType(shape=[1, elements_per_tile], dtype=pto_dtype) + + 
tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, elements_per_tile], + valid_shape=[1, -1], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "pto_dtype": pto_dtype, + "elements_per_tile": elements_per_tile, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_in": subtensor_in, + "tile_type": tile_type, + } + + +def build_rowsum(dtype="fp32"): + """ + Computes per-row sum across columns using PTO TROWSUM (`tile.row_sum` wrapper). + + Args: + x_ptr : dtype[batch * n_cols] input matrix flattened row-major + y_ptr : dtype[batch] output vector, one sum per row + batch : int32 + n_cols: int32 (<= elements_per_tile) + + Semantics: + y[row] = sum_{j=0..n_cols-1} x[row, j] + """ + _meta_data = lambda: meta_data_row(dtype=dtype) + + @to_ir_module(meta_data=_meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + bid = s.index_cast(pto.get_block_idx()) + num_cores = s.index_cast(pto.get_block_num()) + + rows_per_core = s.ceil_div(batch, num_cores) + row_start = bid * rows_per_core + row_end = s.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] + ) + tv_y = pto.as_tensor( + tensor_type, ptr=y_ptr, shape=[batch], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_sum = pto.alloc_tile( + tile_type, valid_col=c1 + ) # scalar output + tb_tmp = pto.alloc_tile( + tile_type, valid_col=n_cols + ) # scratch + + for r in pto.range(c0, num_rows, c1): + gm_offset = (row_start + r) * n_cols + + sv_x = pto.slice_view( + subtensor_in, + source=tv_x, + offsets=[gm_offset], + 
sizes=[n_cols], + ) + + # y is a vector of length batch; write one element per row + sv_y = pto.slice_view( + subtensor_in, + source=tv_y, + offsets=[row_start + r], + sizes=[c1], + ) + + pto.load(sv_x, tb_x) + tile.row_sum(tb_x, tb_tmp, tb_sum) + + # Store the 1-element tile to y[row] + pto.store(tb_sum, sv_y) + + return _kernel + + +def meta_data_col(dtype="fp32"): + pto_dtype = {"fp16": pto.float16, "fp32": pto.float32}[dtype] + ptr_type = pto.PtrType(pto_dtype) + index_dtype = pto.int32 + + tile_rows = 32 + tile_cols = 32 + + tensor2d_type = pto.TensorType(rank=2, dtype=pto_dtype) + subtensor_in = pto.SubTensorType(shape=[tile_rows, tile_cols], dtype=pto_dtype) + subtensor_out = pto.SubTensorType(shape=[1, tile_cols], dtype=pto_dtype) + + tile_cfg = pto.TileBufConfig() + + # Dynamic valid rows + cols + tile_type = pto.TileBufType( + shape=[tile_rows, tile_cols], + valid_shape=[-1, -1], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ) + + # Dynamic valid cols + tile_sum_type = pto.TileBufType( + shape=[1, tile_cols], + valid_shape=[1, -1], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "pto_dtype": pto_dtype, + "index_dtype": index_dtype, + "tensor2d_type": tensor2d_type, + "subtensor_in": subtensor_in, + "subtensor_out": subtensor_out, + "tile_type": tile_type, + "tile_sum_type": tile_sum_type, + "tile_rows": tile_rows, + "tile_cols": tile_cols, + } + + +def build_colsum(dtype="fp32"): + _meta_data = lambda: meta_data_col(dtype=dtype) + + @to_ir_module(meta_data=_meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile_rows = const(tile_rows) + c_tile_cols = const(tile_cols) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + bid = s.index_cast(pto.get_block_idx()) + num_cores = s.index_cast(pto.get_block_num()) + + 
cols_per_core = s.ceil_div(n_cols, num_cores) + col_start = bid * cols_per_core + col_end = s.min_u(col_start + cols_per_core, n_cols) + num_cols = col_end - col_start + + tv_x = pto.as_tensor( + tensor2d_type, + ptr=x_ptr, + shape=[batch, n_cols], + strides=[n_cols, c1], + ) + tv_y = pto.as_tensor( + tensor2d_type, + ptr=y_ptr, + shape=[c1, n_cols], + strides=[n_cols, c1], + ) + + with pto.if_context(num_cols > c0): + for col in pto.range(col_start, col_end, c_tile_cols): + cols_this = s.min_u(c_tile_cols, col_end - col) + + with pto.if_context(batch > c0): + first_rows = s.min_u(c_tile_rows, batch) + + tb_x = pto.alloc_tile( + tile_type, + valid_row=first_rows, + valid_col=cols_this, + ) + tb_tmp = pto.alloc_tile( + tile_type, + valid_row=first_rows, + valid_col=cols_this, + ) + tb_acc = pto.alloc_tile( + tile_sum_type, + valid_col=cols_this, + ) + + sv_x0 = pto.slice_view( + subtensor_in, + source=tv_x, + offsets=[c0, col], + sizes=[first_rows, cols_this], + ) + + pto.load(sv_x0, tb_x) + tile.col_sum(tb_x, tb_tmp, tb_acc) + + for row in pto.range(c_tile_rows, batch, c_tile_rows): + rows_this = s.min_u(c_tile_rows, batch - row) + + tb_x_k = pto.alloc_tile( + tile_type, + valid_row=rows_this, + valid_col=cols_this, + ) + tb_tmp_k = pto.alloc_tile( + tile_type, + valid_row=rows_this, + valid_col=cols_this, + ) + tb_part = pto.alloc_tile( + tile_sum_type, + valid_col=cols_this, + ) + + sv_xk = pto.slice_view( + subtensor_in, + source=tv_x, + offsets=[row, col], + sizes=[rows_this, cols_this], + ) + + pto.load(sv_xk, tb_x_k) + tile.col_sum(tb_x_k, tb_tmp_k, tb_part) + tile.add(tb_acc, tb_part, tb_acc) + + sv_y = pto.slice_view( + subtensor_out, + source=tv_y, + offsets=[c0, col], + sizes=[c1, cols_this], + ) + pto.store(tb_acc, sv_y) + + return _kernel +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["row", "col"], default="row") + parser.add_argument("--dtype", choices=["fp16", "fp32"], 
default="fp32") + args = parser.parse_args() + builder = build_rowsum if args.mode == "row" else build_colsum + print(builder(dtype=args.dtype)) diff --git a/tests/npu/sum_dynamic_multicore/test_sum.py b/tests/npu/sum_dynamic_multicore/test_sum.py new file mode 100644 index 00000000..659f029c --- /dev/null +++ b/tests/npu/sum_dynamic_multicore/test_sum.py @@ -0,0 +1,135 @@ +import ctypes +import os +import subprocess + +import pytest +import torch + +from ptodsl.test_util import get_test_device + +torch.manual_seed(0) + +_DIR = os.path.dirname(os.path.abspath(__file__)) +_DEVICE = get_test_device() +_ROWSUM_LIB_PATH = os.path.join(_DIR, "rowsum_lib.so") +_COLSUM_LIB_PATH = os.path.join(_DIR, "colsum_lib.so") +_BLOCK_DIM = 24 + +_SHAPES = [ + (1, 1), + (2, 3), + (3, 2), + (7, 7), + (15, 17), + (17, 15), + (31, 32), + (32, 31), + (32, 32), + (33, 33), + (31, 33), + (33, 31), + (64, 32), + (32, 64), + (63, 64), + (64, 63), + (65, 33), + (65, 64), + (29, 257), + (127, 129), +] + +_SHAPE_IDS = [f"batch{batch}-cols{n_cols}" for batch, n_cols in _SHAPES] + + +@pytest.fixture(scope="session") +def compiled_sum(): + subprocess.check_call(["bash", os.path.join(_DIR, "compile.sh")], cwd=_DIR) + yield + if os.path.exists(_ROWSUM_LIB_PATH): + os.remove(_ROWSUM_LIB_PATH) + if os.path.exists(_COLSUM_LIB_PATH): + os.remove(_COLSUM_LIB_PATH) + + +def test_build_rowsum(compiled_sum): + assert os.path.exists(_ROWSUM_LIB_PATH) + + +def test_build_colsum(compiled_sum): + assert os.path.exists(_COLSUM_LIB_PATH) + + +@pytest.mark.require_npu +@pytest.mark.parametrize("batch, n_cols", _SHAPES, ids=_SHAPE_IDS) +def test_rowsum_precision(compiled_sum, batch, n_cols): + import torch_npu # noqa: F401 + + lib = ctypes.CDLL(_ROWSUM_LIB_PATH) + lib.call_rowsum.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_uint32, + ctypes.c_uint32, + ] + lib.call_rowsum.restype = None + + torch.npu.set_device(_DEVICE) + x = torch.randn(batch, n_cols, 
device=_DEVICE, dtype=torch.float32) + y = torch.full((batch,), float("nan"), device=_DEVICE, dtype=torch.float32) + + y_ref = x.float().sum(dim=-1) + + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_rowsum( + ctypes.c_uint32(_BLOCK_DIM), + stream_ptr, + ctypes.c_void_p(x.data_ptr()), + ctypes.c_void_p(y.data_ptr()), + ctypes.c_uint32(batch), + ctypes.c_uint32(n_cols), + ) + torch.npu.synchronize() + + torch.testing.assert_close(y, y_ref, atol=1e-4, rtol=0) + + +@pytest.mark.require_npu +@pytest.mark.parametrize("batch, n_cols", _SHAPES, ids=_SHAPE_IDS) +def test_colsum_precision(compiled_sum, batch, n_cols): + import torch_npu # noqa: F401 + + lib = ctypes.CDLL(_COLSUM_LIB_PATH) + lib.call_colsum.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_uint32, + ctypes.c_uint32, + ] + lib.call_colsum.restype = None + + torch.npu.set_device(_DEVICE) + x = torch.randn(batch, n_cols, device=_DEVICE, dtype=torch.float32) + y = torch.full((n_cols,), float("nan"), device=_DEVICE, dtype=torch.float32) + + y_ref = x.float().sum(dim=0) + + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_colsum( + ctypes.c_uint32(_BLOCK_DIM), + stream_ptr, + ctypes.c_void_p(x.data_ptr()), + ctypes.c_void_p(y.data_ptr()), + ctypes.c_uint32(batch), + ctypes.c_uint32(n_cols), + ) + torch.npu.synchronize() + + torch.testing.assert_close(y, y_ref, atol=1e-4, rtol=0) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) \ No newline at end of file From 1c0bec8f48373376ed6fd24b427c70e77638fe9a Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Tue, 10 Mar 2026 18:03:15 +0000 Subject: [PATCH 20/53] wip: generalization of reduce ops, misisng colmin, colmax and rowprod and colprod --- ptodsl/api/tile.py | 24 ++ tests/npu/sum_dynamic_multicore/caller.py | 34 +- tests/npu/sum_dynamic_multicore/compile.sh | 27 +- tests/npu/sum_dynamic_multicore/gen_ir.py | 37 +- .../npu/sum_dynamic_multicore/sum_builder.py | 
346 +++++++++++------- tests/npu/sum_dynamic_multicore/test_sum.py | 132 ++++--- 6 files changed, 386 insertions(+), 214 deletions(-) diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index d1152eb0..a4e396c1 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -92,10 +92,34 @@ def row_sum(src, tmp, dst): _pto.TRowSumOp(src=src, tmp=tmp, dst=dst) +def row_min(src, tmp, dst): + _pto.TRowMinOp(src=src, tmp=tmp, dst=dst) + + +def row_max(src, tmp, dst): + _pto.TRowMaxOp(src=src, tmp=tmp, dst=dst) + + +def row_prod(src, tmp, dst): + _pto.TRowProdOp(src=src, tmp=tmp, dst=dst) + + def col_sum(src, tmp, dst, is_binary=True): _pto.TColSumOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) +def col_min(src, dst): + _pto.TColMinOp(src=src, dst=dst) + + +def col_max(src, dst): + _pto.TColMaxOp(src=src, dst=dst) + + +def col_prod(src, tmp, dst, is_binary=True): + _pto.TColProdOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) + + def subset(source, offsets, sizes): offset_vals = [_unwrap(v) for v in offsets] return _pto.subset(source, offset_vals, sizes) diff --git a/tests/npu/sum_dynamic_multicore/caller.py b/tests/npu/sum_dynamic_multicore/caller.py index 180fe150..f940bad4 100644 --- a/tests/npu/sum_dynamic_multicore/caller.py +++ b/tests/npu/sum_dynamic_multicore/caller.py @@ -1,13 +1,16 @@ -"""Generate caller.cpp for a dynamic multicore rowsum or colsum kernel (fp32). +"""Generate caller.cpp for dynamic multicore row/col reduction kernels. 
-Usage: python caller.py --mode row|col +Usage: + python caller.py --mode rowsum|rowmin|rowmax|rowprod|colsum|colmin|colmax|colprod """ -def generate_caller(mode): + +def generate_caller(mode, dtype): + ctype = "half" if dtype == "fp16" else "float" return f"""\ -#include "{mode}sum.cpp" +#include "{mode}.cpp" -extern "C" void call_{mode}sum( +extern "C" void call_{mode}( uint32_t blockDim, void *stream, uint8_t *x, @@ -16,8 +19,8 @@ def generate_caller(mode): uint32_t n_cols) {{ _kernel<<>>( - reinterpret_cast(x), - reinterpret_cast(y), + reinterpret_cast<{ctype} *>(x), + reinterpret_cast<{ctype} *>(y), static_cast(batch), static_cast(n_cols)); }} @@ -27,7 +30,20 @@ def generate_caller(mode): if __name__ == "__main__": import argparse + MODES = [ + "rowsum", + "rowmin", + "rowmax", + # "rowprod", + "colsum", + # "colmin", + # "colmax", + # "colprod", + ] + parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["row", "col"], required=True) + parser.add_argument("--mode", choices=MODES, required=True) + parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") args = parser.parse_args() - print(generate_caller(args.mode)) + + print(generate_caller(args.mode, args.dtype)) \ No newline at end of file diff --git a/tests/npu/sum_dynamic_multicore/compile.sh b/tests/npu/sum_dynamic_multicore/compile.sh index 8cd4014e..3a76e113 100755 --- a/tests/npu/sum_dynamic_multicore/compile.sh +++ b/tests/npu/sum_dynamic_multicore/compile.sh @@ -20,15 +20,26 @@ BISHENG_FLAGS=( -std=gnu++17 ) -for MODE in row col; do - python "$SCRIPT_DIR/gen_ir.py" --mode "$MODE" > "$TMP/${MODE}sum.pto" - ptoas --enable-insert-sync "$TMP/${MODE}sum.pto" -o "$TMP/${MODE}sum.cpp" +MODES=( + rowsum + rowmin + rowmax + # rowprod + colsum + # colmin + # colmax + # colprod +) + +for MODE in "${MODES[@]}"; do + python "$SCRIPT_DIR/gen_ir.py" --mode "$MODE" > "$TMP/${MODE}.pto" + ptoas --enable-insert-sync "$TMP/${MODE}.pto" -o "$TMP/${MODE}.cpp" - python 
"$SCRIPT_DIR/caller.py" --mode "$MODE" > "$TMP/${MODE}sum_caller.cpp" + python "$SCRIPT_DIR/caller.py" --mode "$MODE" > "$TMP/${MODE}_caller.cpp" bisheng "${BISHENG_FLAGS[@]}" \ - "$TMP/${MODE}sum_caller.cpp" \ - -o "$SCRIPT_DIR/${MODE}sum_lib.so" + "$TMP/${MODE}_caller.cpp" \ + -o "$SCRIPT_DIR/${MODE}_lib.so" - echo "Built ${MODE}sum_lib.so successfully." -done + echo "Built ${MODE}_lib.so successfully." +done \ No newline at end of file diff --git a/tests/npu/sum_dynamic_multicore/gen_ir.py b/tests/npu/sum_dynamic_multicore/gen_ir.py index f9b0e5cd..c3dbbf8b 100644 --- a/tests/npu/sum_dynamic_multicore/gen_ir.py +++ b/tests/npu/sum_dynamic_multicore/gen_ir.py @@ -1,6 +1,7 @@ -"""Print MLIR IR for the dynamic multicore rowsum or colsum kernel (fp32). +"""Print MLIR IR for dynamic multicore row/col reduction kernels. -Usage: python gen_ir.py --mode row|col +Usage: + python gen_ir.py --mode rowsum|rowmin|rowmax|rowprod|colsum|colmin|colmax|colprod """ import argparse @@ -9,12 +10,36 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from sum_builder import build_colsum, build_rowsum +from sum_builder import ( + # build_colmax, + # build_colmin, + # build_colprod, + build_colsum, + build_rowmax, + build_rowmin, + # build_rowprod, + build_rowsum, +) + +_BUILDERS = { + "rowsum": build_rowsum, + "rowmin": build_rowmin, + "rowmax": build_rowmax, + # "rowprod": build_rowprod, + "colsum": build_colsum, + # "colmin": build_colmin, + # "colmax": build_colmax, + # "colprod": build_colprod, +} if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["row", "col"], default="row") + parser.add_argument( + "--mode", + choices=list(_BUILDERS.keys()), + default="rowsum", + ) parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") args = parser.parse_args() - builder = build_rowsum if args.mode == "row" else build_colsum - print(builder(dtype=args.dtype)) + + print(_BUILDERS[args.mode](dtype=args.dtype)) 
\ No newline at end of file diff --git a/tests/npu/sum_dynamic_multicore/sum_builder.py b/tests/npu/sum_dynamic_multicore/sum_builder.py index f751f210..99754876 100644 --- a/tests/npu/sum_dynamic_multicore/sum_builder.py +++ b/tests/npu/sum_dynamic_multicore/sum_builder.py @@ -37,19 +37,83 @@ def meta_data_row(dtype="fp32"): } -def build_rowsum(dtype="fp32"): - """ - Computes per-row sum across columns using PTO TROWSUM (`tile.row_sum` wrapper). +def meta_data_col(dtype="fp32"): + pto_dtype = {"fp16": pto.float16, "fp32": pto.float32}[dtype] + ptr_type = pto.PtrType(pto_dtype) + index_dtype = pto.int32 + + tile_rows = 32 + tile_cols = 32 + + tensor2d_type = pto.TensorType(rank=2, dtype=pto_dtype) + subtensor_in = pto.SubTensorType(shape=[tile_rows, tile_cols], dtype=pto_dtype) + subtensor_out = pto.SubTensorType(shape=[1, tile_cols], dtype=pto_dtype) + + tile_cfg = pto.TileBufConfig() + + tile_type = pto.TileBufType( + shape=[tile_rows, tile_cols], + valid_shape=[-1, -1], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ) + + tile_out_type = pto.TileBufType( + shape=[1, tile_cols], + valid_shape=[1, -1], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "pto_dtype": pto_dtype, + "index_dtype": index_dtype, + "tensor2d_type": tensor2d_type, + "subtensor_in": subtensor_in, + "subtensor_out": subtensor_out, + "tile_type": tile_type, + "tile_out_type": tile_out_type, + "tile_rows": tile_rows, + "tile_cols": tile_cols, + } + + +_ROW_REDUCE_OPS = { + "sum": tile.row_sum, + "min": tile.row_min, + "max": tile.row_max, + "prod": tile.row_prod, +} - Args: - x_ptr : dtype[batch * n_cols] input matrix flattened row-major - y_ptr : dtype[batch] output vector, one sum per row - batch : int32 - n_cols: int32 (<= elements_per_tile) +_COL_REDUCE_OPS = { + "sum": tile.col_sum, + "min": tile.col_min, + "max": tile.col_max, + "prod": tile.col_prod, +} + +_COL_COMBINE_OPS = { + "sum": tile.add, + "min": tile.min, + 
"max": tile.max, + "prod": tile.mul, +} + + +def build_row_reduce(kind="sum", dtype="fp32"): + """ + Generic row-wise reduction across columns. Semantics: - y[row] = sum_{j=0..n_cols-1} x[row, j] + y[row] = reduce_j x[row, j] """ + if kind not in _ROW_REDUCE_OPS: + raise ValueError(f"Unsupported row reduction kind: {kind}") + + row_reduce = _ROW_REDUCE_OPS[kind] _meta_data = lambda: meta_data_row(dtype=dtype) @to_ir_module(meta_data=_meta_data) @@ -84,12 +148,8 @@ def _kernel( with pto.if_context(num_rows > c0): tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) - tb_sum = pto.alloc_tile( - tile_type, valid_col=c1 - ) # scalar output - tb_tmp = pto.alloc_tile( - tile_type, valid_col=n_cols - ) # scratch + tb_out = pto.alloc_tile(tile_type, valid_col=c1) + tb_tmp = pto.alloc_tile(tile_type, valid_col=n_cols) for r in pto.range(c0, num_rows, c1): gm_offset = (row_start + r) * n_cols @@ -101,7 +161,6 @@ def _kernel( sizes=[n_cols], ) - # y is a vector of length batch; write one element per row sv_y = pto.slice_view( subtensor_in, source=tv_y, @@ -110,61 +169,24 @@ def _kernel( ) pto.load(sv_x, tb_x) - tile.row_sum(tb_x, tb_tmp, tb_sum) - - # Store the 1-element tile to y[row] - pto.store(tb_sum, sv_y) + row_reduce(tb_x, tb_tmp, tb_out) + pto.store(tb_out, sv_y) return _kernel -def meta_data_col(dtype="fp32"): - pto_dtype = {"fp16": pto.float16, "fp32": pto.float32}[dtype] - ptr_type = pto.PtrType(pto_dtype) - index_dtype = pto.int32 - - tile_rows = 32 - tile_cols = 32 - - tensor2d_type = pto.TensorType(rank=2, dtype=pto_dtype) - subtensor_in = pto.SubTensorType(shape=[tile_rows, tile_cols], dtype=pto_dtype) - subtensor_out = pto.SubTensorType(shape=[1, tile_cols], dtype=pto_dtype) - - tile_cfg = pto.TileBufConfig() - - # Dynamic valid rows + cols - tile_type = pto.TileBufType( - shape=[tile_rows, tile_cols], - valid_shape=[-1, -1], - dtype=pto_dtype, - memory_space="VEC", - config=tile_cfg, - ) - - # Dynamic valid cols - tile_sum_type = pto.TileBufType( - shape=[1, 
tile_cols], - valid_shape=[1, -1], - dtype=pto_dtype, - memory_space="VEC", - config=tile_cfg, - ) - - return { - "ptr_type": ptr_type, - "pto_dtype": pto_dtype, - "index_dtype": index_dtype, - "tensor2d_type": tensor2d_type, - "subtensor_in": subtensor_in, - "subtensor_out": subtensor_out, - "tile_type": tile_type, - "tile_sum_type": tile_sum_type, - "tile_rows": tile_rows, - "tile_cols": tile_cols, - } +def build_col_reduce(kind="sum", dtype="fp32"): + """ + Generic column-wise reduction across rows. + Semantics: + y[col] = reduce_i x[i, col] + """ + if kind not in _COL_REDUCE_OPS: + raise ValueError(f"Unsupported column reduction kind: {kind}") -def build_colsum(dtype="fp32"): + col_reduce = _COL_REDUCE_OPS[kind] + combine = _COL_COMBINE_OPS[kind] _meta_data = lambda: meta_data_col(dtype=dtype) @to_ir_module(meta_data=_meta_data) @@ -203,83 +225,135 @@ def _kernel( shape=[c1, n_cols], strides=[n_cols, c1], ) + for col in pto.range(col_start, col_end, c_tile_cols): + cols_this = s.min_u(c_tile_cols, col_end - col) + rows_this0 = s.min_u(c_tile_rows, batch) + + tb_x0 = pto.alloc_tile( + tile_type, + valid_row=rows_this0, + valid_col=cols_this, + ) + tb_tmp0 = pto.alloc_tile( + tile_type, + valid_row=rows_this0, + valid_col=cols_this, + ) + tb_acc = pto.alloc_tile( + tile_out_type, + valid_col=cols_this, + ) + + sv_x0 = pto.slice_view( + subtensor_in, + source=tv_x, + offsets=[c0, col], + sizes=[rows_this0, cols_this], + ) + pto.load(sv_x0, tb_x0) + col_reduce(tb_x0, tb_tmp0, tb_acc) + + for row in pto.range(c_tile_rows, batch, c_tile_rows): + rows_this = s.min_u(c_tile_rows, batch - row) + + tb_x = pto.alloc_tile( + tile_type, + valid_row=rows_this, + valid_col=cols_this, + ) + tb_tmp = pto.alloc_tile( + tile_type, + valid_row=rows_this, + valid_col=cols_this, + ) + tb_part = pto.alloc_tile( + tile_out_type, + valid_col=cols_this, + ) + + sv_x = pto.slice_view( + subtensor_in, + source=tv_x, + offsets=[row, col], + sizes=[rows_this, cols_this], + ) + 
pto.load(sv_x, tb_x) + col_reduce(tb_x, tb_tmp, tb_part) + combine(tb_acc, tb_part, tb_acc) - with pto.if_context(num_cols > c0): - for col in pto.range(col_start, col_end, c_tile_cols): - cols_this = s.min_u(c_tile_cols, col_end - col) - - with pto.if_context(batch > c0): - first_rows = s.min_u(c_tile_rows, batch) - - tb_x = pto.alloc_tile( - tile_type, - valid_row=first_rows, - valid_col=cols_this, - ) - tb_tmp = pto.alloc_tile( - tile_type, - valid_row=first_rows, - valid_col=cols_this, - ) - tb_acc = pto.alloc_tile( - tile_sum_type, - valid_col=cols_this, - ) - - sv_x0 = pto.slice_view( - subtensor_in, - source=tv_x, - offsets=[c0, col], - sizes=[first_rows, cols_this], - ) - - pto.load(sv_x0, tb_x) - tile.col_sum(tb_x, tb_tmp, tb_acc) - - for row in pto.range(c_tile_rows, batch, c_tile_rows): - rows_this = s.min_u(c_tile_rows, batch - row) - - tb_x_k = pto.alloc_tile( - tile_type, - valid_row=rows_this, - valid_col=cols_this, - ) - tb_tmp_k = pto.alloc_tile( - tile_type, - valid_row=rows_this, - valid_col=cols_this, - ) - tb_part = pto.alloc_tile( - tile_sum_type, - valid_col=cols_this, - ) - - sv_xk = pto.slice_view( - subtensor_in, - source=tv_x, - offsets=[row, col], - sizes=[rows_this, cols_this], - ) - - pto.load(sv_xk, tb_x_k) - tile.col_sum(tb_x_k, tb_tmp_k, tb_part) - tile.add(tb_acc, tb_part, tb_acc) - - sv_y = pto.slice_view( - subtensor_out, - source=tv_y, - offsets=[c0, col], - sizes=[c1, cols_this], - ) - pto.store(tb_acc, sv_y) + sv_y = pto.slice_view( + subtensor_out, + source=tv_y, + offsets=[c0, col], + sizes=[c1, cols_this], + ) + pto.store(tb_acc, sv_y) return _kernel + + +def build_rowsum(dtype="fp32"): + return build_row_reduce("sum", dtype=dtype) + + +def build_rowmin(dtype="fp32"): + return build_row_reduce("min", dtype=dtype) + + +def build_rowmax(dtype="fp32"): + return build_row_reduce("max", dtype=dtype) + + +def build_rowprod(dtype="fp32"): + return build_row_reduce("prod", dtype=dtype) + + +def build_colsum(dtype="fp32"): + return 
build_col_reduce("sum", dtype=dtype) + + +def build_colmin(dtype="fp32"): + return build_col_reduce("min", dtype=dtype) + + +def build_colmax(dtype="fp32"): + return build_col_reduce("max", dtype=dtype) + + +def build_colprod(dtype="fp32"): + return build_col_reduce("prod", dtype=dtype) + + if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["row", "col"], default="row") + parser.add_argument( + "--mode", + choices=[ + "rowsum", + "rowmin", + "rowmax", + "rowprod", + "colsum", + "colmin", + "colmax", + "colprod", + ], + default="rowsum", + ) parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") args = parser.parse_args() - builder = build_rowsum if args.mode == "row" else build_colsum - print(builder(dtype=args.dtype)) + + builders = { + "rowsum": build_rowsum, + "rowmin": build_rowmin, + "rowmax": build_rowmax, + "rowprod": build_rowprod, + "colsum": build_colsum, + "colmin": build_colmin, + "colmax": build_colmax, + "colprod": build_colprod, + } + + print(builders[args.mode](dtype=args.dtype)) \ No newline at end of file diff --git a/tests/npu/sum_dynamic_multicore/test_sum.py b/tests/npu/sum_dynamic_multicore/test_sum.py index 659f029c..9626d5a1 100644 --- a/tests/npu/sum_dynamic_multicore/test_sum.py +++ b/tests/npu/sum_dynamic_multicore/test_sum.py @@ -11,10 +11,24 @@ _DIR = os.path.dirname(os.path.abspath(__file__)) _DEVICE = get_test_device() -_ROWSUM_LIB_PATH = os.path.join(_DIR, "rowsum_lib.so") -_COLSUM_LIB_PATH = os.path.join(_DIR, "colsum_lib.so") _BLOCK_DIM = 24 +_KERNELS = [ + "rowsum", + "rowmin", + "rowmax", + # "rowprod", + "colsum", + # "colmin", + # "colmax", + # "colprod", +] + +_LIB_PATHS = { + name: os.path.join(_DIR, f"{name}_lib.so") + for name in _KERNELS +} + _SHAPES = [ (1, 1), (2, 3), @@ -42,30 +56,18 @@ @pytest.fixture(scope="session") -def compiled_sum(): +def compiled_kernels(): subprocess.check_call(["bash", os.path.join(_DIR, "compile.sh")], 
cwd=_DIR) yield - if os.path.exists(_ROWSUM_LIB_PATH): - os.remove(_ROWSUM_LIB_PATH) - if os.path.exists(_COLSUM_LIB_PATH): - os.remove(_COLSUM_LIB_PATH) - - -def test_build_rowsum(compiled_sum): - assert os.path.exists(_ROWSUM_LIB_PATH) + for path in _LIB_PATHS.values(): + if os.path.exists(path): + os.remove(path) -def test_build_colsum(compiled_sum): - assert os.path.exists(_COLSUM_LIB_PATH) - - -@pytest.mark.require_npu -@pytest.mark.parametrize("batch, n_cols", _SHAPES, ids=_SHAPE_IDS) -def test_rowsum_precision(compiled_sum, batch, n_cols): - import torch_npu # noqa: F401 - - lib = ctypes.CDLL(_ROWSUM_LIB_PATH) - lib.call_rowsum.argtypes = [ +def _load_kernel(name): + lib = ctypes.CDLL(_LIB_PATHS[name]) + fn = getattr(lib, f"call_{name}") + fn.argtypes = [ ctypes.c_uint32, ctypes.c_void_p, ctypes.c_void_p, @@ -73,52 +75,72 @@ def test_rowsum_precision(compiled_sum, batch, n_cols): ctypes.c_uint32, ctypes.c_uint32, ] - lib.call_rowsum.restype = None + fn.restype = None + return fn - torch.npu.set_device(_DEVICE) - x = torch.randn(batch, n_cols, device=_DEVICE, dtype=torch.float32) - y = torch.full((batch,), float("nan"), device=_DEVICE, dtype=torch.float32) - y_ref = x.float().sum(dim=-1) +def _reference(name, x): + if name == "rowsum": + return x.float().sum(dim=-1) + if name == "rowmin": + return x.float().amin(dim=-1) + if name == "rowmax": + return x.float().amax(dim=-1) + if name == "rowprod": + return x.float().prod(dim=-1) + if name == "colsum": + return x.float().sum(dim=0) + if name == "colmin": + return x.float().amin(dim=0) + if name == "colmax": + return x.float().amax(dim=0) + if name == "colprod": + return x.float().prod(dim=0) + raise ValueError(f"Unknown kernel: {name}") - stream_ptr = torch.npu.current_stream()._as_parameter_ - lib.call_rowsum( - ctypes.c_uint32(_BLOCK_DIM), - stream_ptr, - ctypes.c_void_p(x.data_ptr()), - ctypes.c_void_p(y.data_ptr()), - ctypes.c_uint32(batch), - ctypes.c_uint32(n_cols), - ) - torch.npu.synchronize() - 
torch.testing.assert_close(y, y_ref, atol=1e-4, rtol=0) +def _output_shape(name, batch, n_cols): + return (batch,) if name.startswith("row") else (n_cols,) + + +def _make_input(name, batch, n_cols, device): + if name.endswith("prod"): + return torch.empty(batch, n_cols, device=device, dtype=torch.float32).uniform_(0.5, 1.5) + return torch.randn(batch, n_cols, device=device, dtype=torch.float32) + + +def _tolerances(name): + if name.endswith("prod"): + return {"atol": 1e-3, "rtol": 1e-3} + return {"atol": 1e-4, "rtol": 0} + + +@pytest.mark.parametrize("name", _KERNELS) +def test_build_kernel(compiled_kernels, name): + assert os.path.exists(_LIB_PATHS[name]) @pytest.mark.require_npu +@pytest.mark.parametrize("name", _KERNELS) @pytest.mark.parametrize("batch, n_cols", _SHAPES, ids=_SHAPE_IDS) -def test_colsum_precision(compiled_sum, batch, n_cols): +def test_kernel_precision(compiled_kernels, name, batch, n_cols): import torch_npu # noqa: F401 - lib = ctypes.CDLL(_COLSUM_LIB_PATH) - lib.call_colsum.argtypes = [ - ctypes.c_uint32, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_uint32, - ctypes.c_uint32, - ] - lib.call_colsum.restype = None - torch.npu.set_device(_DEVICE) - x = torch.randn(batch, n_cols, device=_DEVICE, dtype=torch.float32) - y = torch.full((n_cols,), float("nan"), device=_DEVICE, dtype=torch.float32) - y_ref = x.float().sum(dim=0) + fn = _load_kernel(name) + + x = _make_input(name, batch, n_cols, _DEVICE) + y = torch.full( + _output_shape(name, batch, n_cols), + float("nan"), + device=_DEVICE, + dtype=torch.float32, + ) + y_ref = _reference(name, x) stream_ptr = torch.npu.current_stream()._as_parameter_ - lib.call_colsum( + fn( ctypes.c_uint32(_BLOCK_DIM), stream_ptr, ctypes.c_void_p(x.data_ptr()), @@ -128,7 +150,7 @@ def test_colsum_precision(compiled_sum, batch, n_cols): ) torch.npu.synchronize() - torch.testing.assert_close(y, y_ref, atol=1e-4, rtol=0) + torch.testing.assert_close(y, y_ref, **_tolerances(name)) if __name__ == 
"__main__": From 0adadc9c413d26dcfa3273af10a2c48bf004439e Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Wed, 11 Mar 2026 09:49:01 +0000 Subject: [PATCH 21/53] working col min and col max --- tests/npu/sum_dynamic_multicore/caller.py | 4 +-- tests/npu/sum_dynamic_multicore/compile.sh | 4 +-- tests/npu/sum_dynamic_multicore/gen_ir.py | 8 +++--- .../npu/sum_dynamic_multicore/sum_builder.py | 28 ++++++++++++------- tests/npu/sum_dynamic_multicore/test_sum.py | 4 +-- 5 files changed, 28 insertions(+), 20 deletions(-) diff --git a/tests/npu/sum_dynamic_multicore/caller.py b/tests/npu/sum_dynamic_multicore/caller.py index f940bad4..2149a565 100644 --- a/tests/npu/sum_dynamic_multicore/caller.py +++ b/tests/npu/sum_dynamic_multicore/caller.py @@ -36,8 +36,8 @@ def generate_caller(mode, dtype): "rowmax", # "rowprod", "colsum", - # "colmin", - # "colmax", + "colmin", + "colmax", # "colprod", ] diff --git a/tests/npu/sum_dynamic_multicore/compile.sh b/tests/npu/sum_dynamic_multicore/compile.sh index 3a76e113..f2915e68 100755 --- a/tests/npu/sum_dynamic_multicore/compile.sh +++ b/tests/npu/sum_dynamic_multicore/compile.sh @@ -26,8 +26,8 @@ MODES=( rowmax # rowprod colsum - # colmin - # colmax + colmin + colmax # colprod ) diff --git a/tests/npu/sum_dynamic_multicore/gen_ir.py b/tests/npu/sum_dynamic_multicore/gen_ir.py index c3dbbf8b..075c67ea 100644 --- a/tests/npu/sum_dynamic_multicore/gen_ir.py +++ b/tests/npu/sum_dynamic_multicore/gen_ir.py @@ -11,8 +11,8 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from sum_builder import ( - # build_colmax, - # build_colmin, + build_colmax, + build_colmin, # build_colprod, build_colsum, build_rowmax, @@ -27,8 +27,8 @@ "rowmax": build_rowmax, # "rowprod": build_rowprod, "colsum": build_colsum, - # "colmin": build_colmin, - # "colmax": build_colmax, + "colmin": build_colmin, + "colmax": build_colmax, # "colprod": build_colprod, } diff --git a/tests/npu/sum_dynamic_multicore/sum_builder.py 
b/tests/npu/sum_dynamic_multicore/sum_builder.py index 99754876..a7dadac9 100644 --- a/tests/npu/sum_dynamic_multicore/sum_builder.py +++ b/tests/npu/sum_dynamic_multicore/sum_builder.py @@ -90,8 +90,8 @@ def meta_data_col(dtype="fp32"): _COL_REDUCE_OPS = { "sum": tile.col_sum, - "min": tile.col_min, - "max": tile.col_max, + "min": lambda src, tmp, dst: tile.col_min(src, dst), + "max": lambda src, tmp, dst: tile.col_max(src, dst), "prod": tile.col_prod, } @@ -239,10 +239,14 @@ def _kernel( valid_row=rows_this0, valid_col=cols_this, ) - tb_acc = pto.alloc_tile( - tile_out_type, - valid_col=cols_this, - ) + if kind in {"min", "max"}: + tb_acc = pto.alloc_tile( + tile_type, valid_row=c1, valid_col=cols_this + ) + else: + tb_acc = pto.alloc_tile( + tile_out_type, valid_col=cols_this + ) sv_x0 = pto.slice_view( subtensor_in, @@ -266,10 +270,14 @@ def _kernel( valid_row=rows_this, valid_col=cols_this, ) - tb_part = pto.alloc_tile( - tile_out_type, - valid_col=cols_this, - ) + if kind in {"min", "max"}: + tb_part = pto.alloc_tile( + tile_type, valid_row=c1, valid_col=cols_this + ) + else: + tb_part = pto.alloc_tile( + tile_out_type, valid_col=cols_this + ) sv_x = pto.slice_view( subtensor_in, diff --git a/tests/npu/sum_dynamic_multicore/test_sum.py b/tests/npu/sum_dynamic_multicore/test_sum.py index 9626d5a1..879c5b88 100644 --- a/tests/npu/sum_dynamic_multicore/test_sum.py +++ b/tests/npu/sum_dynamic_multicore/test_sum.py @@ -19,8 +19,8 @@ "rowmax", # "rowprod", "colsum", - # "colmin", - # "colmax", + "colmin", + "colmax", # "colprod", ] From 132e34988a0aa03e8cfe768d3dda8c3c9781d5cd Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Wed, 11 Mar 2026 09:54:43 +0000 Subject: [PATCH 22/53] rename to reduce tests --- .../caller.py | 0 .../compile.sh | 0 .../gen_ir.py | 2 +- .../reduce_builder.py} | 0 .../test_sum.py => reduce_dynamic_multicore/test_reduce.py} | 0 5 files changed, 1 insertion(+), 1 deletion(-) rename tests/npu/{sum_dynamic_multicore => 
reduce_dynamic_multicore}/caller.py (100%) rename tests/npu/{sum_dynamic_multicore => reduce_dynamic_multicore}/compile.sh (100%) rename tests/npu/{sum_dynamic_multicore => reduce_dynamic_multicore}/gen_ir.py (97%) rename tests/npu/{sum_dynamic_multicore/sum_builder.py => reduce_dynamic_multicore/reduce_builder.py} (100%) rename tests/npu/{sum_dynamic_multicore/test_sum.py => reduce_dynamic_multicore/test_reduce.py} (100%) diff --git a/tests/npu/sum_dynamic_multicore/caller.py b/tests/npu/reduce_dynamic_multicore/caller.py similarity index 100% rename from tests/npu/sum_dynamic_multicore/caller.py rename to tests/npu/reduce_dynamic_multicore/caller.py diff --git a/tests/npu/sum_dynamic_multicore/compile.sh b/tests/npu/reduce_dynamic_multicore/compile.sh similarity index 100% rename from tests/npu/sum_dynamic_multicore/compile.sh rename to tests/npu/reduce_dynamic_multicore/compile.sh diff --git a/tests/npu/sum_dynamic_multicore/gen_ir.py b/tests/npu/reduce_dynamic_multicore/gen_ir.py similarity index 97% rename from tests/npu/sum_dynamic_multicore/gen_ir.py rename to tests/npu/reduce_dynamic_multicore/gen_ir.py index 075c67ea..8224f566 100644 --- a/tests/npu/sum_dynamic_multicore/gen_ir.py +++ b/tests/npu/reduce_dynamic_multicore/gen_ir.py @@ -10,7 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from sum_builder import ( +from reduce_builder import ( build_colmax, build_colmin, # build_colprod, diff --git a/tests/npu/sum_dynamic_multicore/sum_builder.py b/tests/npu/reduce_dynamic_multicore/reduce_builder.py similarity index 100% rename from tests/npu/sum_dynamic_multicore/sum_builder.py rename to tests/npu/reduce_dynamic_multicore/reduce_builder.py diff --git a/tests/npu/sum_dynamic_multicore/test_sum.py b/tests/npu/reduce_dynamic_multicore/test_reduce.py similarity index 100% rename from tests/npu/sum_dynamic_multicore/test_sum.py rename to tests/npu/reduce_dynamic_multicore/test_reduce.py From 838492e6da6c4f6d96726a4a197396188f1afb1a Mon 
Sep 17 00:00:00 2001 From: mirkodevita Date: Wed, 11 Mar 2026 10:42:06 +0000 Subject: [PATCH 23/53] added reduce op to list for import visibility --- ptodsl/api/tile.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index a4e396c1..637c4ac3 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -104,6 +104,10 @@ def row_prod(src, tmp, dst): _pto.TRowProdOp(src=src, tmp=tmp, dst=dst) +def row_expand(src, dst): + _pto.TRowExpandOp(src=src, dst=dst) + + def col_sum(src, tmp, dst, is_binary=True): _pto.TColSumOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) @@ -120,6 +124,10 @@ def col_prod(src, tmp, dst, is_binary=True): _pto.TColProdOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) +def col_expand(src, dst): + _pto.TColExpandOp(src=src, dst=dst) + + def subset(source, offsets, sizes): offset_vals = [_unwrap(v) for v in offsets] return _pto.subset(source, offset_vals, sizes) @@ -149,6 +157,14 @@ def print(source): "matmul_acc", "extract", "row_sum", + "row_min", + "row_max", + "row_prod", + "row_expand", "col_sum", + "col_min", + "col_max", + "col_prod", + "col_expand", "subset", ] \ No newline at end of file From 7800e14ec108f7b37f9d8ff79f2bdc8bfc379579 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Wed, 11 Mar 2026 15:25:20 +0000 Subject: [PATCH 24/53] added row/col expand dynamic multicore tests --- ptodsl/api/tile.py | 15 + tests/npu/expand_dynamic_multicore/caller.py | 70 ++++ tests/npu/expand_dynamic_multicore/compile.sh | 44 +++ .../expand_builder.py | 351 ++++++++++++++++++ tests/npu/expand_dynamic_multicore/gen_ir.py | 39 ++ .../expand_dynamic_multicore/test_expand.py | 182 +++++++++ 6 files changed, 701 insertions(+) create mode 100644 tests/npu/expand_dynamic_multicore/caller.py create mode 100755 tests/npu/expand_dynamic_multicore/compile.sh create mode 100644 tests/npu/expand_dynamic_multicore/expand_builder.py create mode 100644 
tests/npu/expand_dynamic_multicore/gen_ir.py create mode 100644 tests/npu/expand_dynamic_multicore/test_expand.py diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index 637c4ac3..3a186dc3 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -108,6 +108,18 @@ def row_expand(src, dst): _pto.TRowExpandOp(src=src, dst=dst) +def row_expand_sub(src0, src1, dst): + _pto.TRowExpandSubOp(src0=src0, src1=src1, dst=dst) + + +def row_expand_div(src0, src1, dst): + _pto.TRowExpandDivOp(src0=src0, src1=src1, dst=dst) + + +def row_expand_mul(src0, src1, dst): + _pto.TRowExpandMulOp(src0=src0, src1=src1, dst=dst) + + def col_sum(src, tmp, dst, is_binary=True): _pto.TColSumOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) @@ -161,6 +173,9 @@ def print(source): "row_max", "row_prod", "row_expand", + "row_expand_sub", + "row_expand_div", + "row_expand_mul", "col_sum", "col_min", "col_max", diff --git a/tests/npu/expand_dynamic_multicore/caller.py b/tests/npu/expand_dynamic_multicore/caller.py new file mode 100644 index 00000000..698d9b17 --- /dev/null +++ b/tests/npu/expand_dynamic_multicore/caller.py @@ -0,0 +1,70 @@ +"""Generate caller.cpp for dynamic multicore col/row expand kernels. 
+ +Usage: + python caller.py --mode colexpand|rowexpand|rowexpand_mul|rowexpand_sub|rowexpand_div +""" + + +_FUSED_MODES = {"rowexpand_mul", "rowexpand_sub", "rowexpand_div"} + + +def generate_caller(mode, dtype): + ctype = "half" if dtype == "fp16" else "float" + if mode in _FUSED_MODES: + return f"""\ +#include "{mode}.cpp" + +extern "C" void call_{mode}( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint8_t *z, + uint32_t batch, + uint32_t n_cols) +{{ + _kernel<<>>( + reinterpret_cast<{ctype} *>(x), + reinterpret_cast<{ctype} *>(y), + reinterpret_cast<{ctype} *>(z), + static_cast(batch), + static_cast(n_cols)); +}} +""" + return f"""\ +#include "{mode}.cpp" + +extern "C" void call_{mode}( + uint32_t blockDim, + void *stream, + uint8_t *src, + uint8_t *dst, + uint32_t batch, + uint32_t n_cols) +{{ + _kernel<<>>( + reinterpret_cast<{ctype} *>(src), + reinterpret_cast<{ctype} *>(dst), + static_cast(batch), + static_cast(n_cols)); +}} +""" + + +if __name__ == "__main__": + import argparse + + MODES = [ + "colexpand", + "rowexpand", + "rowexpand_mul", + "rowexpand_sub", + "rowexpand_div", + ] + + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=MODES, required=True) + parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") + args = parser.parse_args() + + print(generate_caller(args.mode, args.dtype)) diff --git a/tests/npu/expand_dynamic_multicore/compile.sh b/tests/npu/expand_dynamic_multicore/compile.sh new file mode 100755 index 00000000..54cbabfd --- /dev/null +++ b/tests/npu/expand_dynamic_multicore/compile.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +TMP=$(mktemp -d) +trap "rm -rf \"$TMP\"" EXIT + + +PTO_LIB_PATH=/sources/pto-isa +BISHENG_FLAGS=( + -I${PTO_LIB_PATH}/include + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong + -xcce -Xhost-start -Xhost-end + -mllvm 
-cce-aicore-stack-size=0x8000 + -mllvm -cce-aicore-function-stack-size=0x8000 + -mllvm -cce-aicore-record-overflow=true + -mllvm -cce-aicore-addr-transform + -mllvm -cce-aicore-dcci-insert-for-scalar=false + --npu-arch=dav-2201 -DMEMORY_BASE + -std=gnu++17 +) + +MODES=( + colexpand + rowexpand + rowexpand_mul + rowexpand_sub + rowexpand_div +) + +for MODE in "${MODES[@]}"; do + python "$SCRIPT_DIR/gen_ir.py" --mode "$MODE" > "$TMP/${MODE}.pto" + ptoas --enable-insert-sync "$TMP/${MODE}.pto" -o "$TMP/${MODE}.cpp" + + python "$SCRIPT_DIR/caller.py" --mode "$MODE" > "$TMP/${MODE}_caller.cpp" + + bisheng "${BISHENG_FLAGS[@]}" \ + "$TMP/${MODE}_caller.cpp" \ + -o "$SCRIPT_DIR/${MODE}_lib.so" + + echo "Built ${MODE}_lib.so successfully." +done diff --git a/tests/npu/expand_dynamic_multicore/expand_builder.py b/tests/npu/expand_dynamic_multicore/expand_builder.py new file mode 100644 index 00000000..b36a6b66 --- /dev/null +++ b/tests/npu/expand_dynamic_multicore/expand_builder.py @@ -0,0 +1,351 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +_TILE_ROWS = 32 +_TILE_COLS = 32 + + +def meta_data_expand(dtype="fp32"): + pto_dtype = {"fp16": pto.float16, "fp32": pto.float32}[dtype] + ptr_type = pto.PtrType(pto_dtype) + index_dtype = pto.int32 + + tile_rows = _TILE_ROWS + tile_cols = _TILE_COLS + + tensor2d_type = pto.TensorType(rank=2, dtype=pto_dtype) + + # For col_expand: src slice is [1, tile_cols] (one row of the input vector) + subtensor_col_src = pto.SubTensorType(shape=[1, tile_cols], dtype=pto_dtype) + # For row_expand: src slice is [tile_rows, 1] (one column of the input vector) + subtensor_row_src = pto.SubTensorType(shape=[tile_rows, 1], dtype=pto_dtype) + # For loading/storing the 2D matrix + subtensor_dst = pto.SubTensorType(shape=[tile_rows, tile_cols], dtype=pto_dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[tile_rows, tile_cols], + valid_shape=[-1, -1], + dtype=pto_dtype, + 
memory_space="VEC", + config=tile_cfg, + ) + return { + "ptr_type": ptr_type, + "pto_dtype": pto_dtype, + "index_dtype": index_dtype, + "tensor2d_type": tensor2d_type, + "subtensor_col_src": subtensor_col_src, + "subtensor_row_src": subtensor_row_src, + "subtensor_dst": subtensor_dst, + "tile_type": tile_type, + "tile_rows": tile_rows, + "tile_cols": tile_cols, + } + + +def build_col_expand(dtype="fp32"): + """ + Column-wise broadcast: replicate each element of y[j] across all rows. + + Semantics: + X[i, j] = y[j] + """ + _meta_data = lambda: meta_data_expand(dtype=dtype) + + @to_ir_module(meta_data=_meta_data) + def _kernel( + y_ptr: "ptr_type", + x_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile_rows = const(tile_rows) + c_tile_cols = const(tile_cols) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + bid = s.index_cast(pto.get_block_idx()) + num_cores = s.index_cast(pto.get_block_num()) + + cols_per_core = s.ceil_div(n_cols, num_cores) + col_start = bid * cols_per_core + col_end = s.min_u(col_start + cols_per_core, n_cols) + + # y[n_cols] represented as 2D [1, n_cols] for uniform slice_view usage + tv_y = pto.as_tensor( + tensor2d_type, + ptr=y_ptr, + shape=[c1, n_cols], + strides=[n_cols, c1], + ) + tv_x = pto.as_tensor( + tensor2d_type, + ptr=x_ptr, + shape=[batch, n_cols], + strides=[n_cols, c1], + ) + + for col in pto.range(col_start, col_end, c_tile_cols): + cols_this = s.min_u(c_tile_cols, col_end - col) + + # Load one row of y into the src tile (valid_row=1) + tb_src = pto.alloc_tile(tile_type, valid_row=c1, valid_col=cols_this) + sv_y = pto.slice_view( + subtensor_col_src, + source=tv_y, + offsets=[c0, col], + sizes=[c1, cols_this], + ) + pto.load(sv_y, tb_src) + + for row in pto.range(c0, batch, c_tile_rows): + rows_this = s.min_u(c_tile_rows, batch - row) + + tb_dst = pto.alloc_tile( + tile_type, valid_row=rows_this, 
valid_col=cols_this + ) + tile.col_expand(tb_src, tb_dst) + + sv_x = pto.slice_view( + subtensor_dst, + source=tv_x, + offsets=[row, col], + sizes=[rows_this, cols_this], + ) + pto.store(tb_dst, sv_x) + + return _kernel + + +def build_row_expand(dtype="fp32"): + """ + Row-wise broadcast: replicate each element of x[i] across all columns. + + Semantics: + Y[i,j] = x[i] + """ + _meta_data = lambda: meta_data_expand(dtype=dtype) + + @to_ir_module(meta_data=_meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile_rows = const(tile_rows) + c_tile_cols = const(tile_cols) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + bid = s.index_cast(pto.get_block_idx()) + num_cores = s.index_cast(pto.get_block_num()) + + rows_per_core = s.ceil_div(batch, num_cores) + row_start = bid * rows_per_core + row_end = s.min_u(row_start + rows_per_core, batch) + + # x[batch] represented as 2D [batch, 1] for uniform slice_view usage + tv_x = pto.as_tensor( + tensor2d_type, + ptr=x_ptr, + shape=[batch, c1], + strides=[c1, c1], + ) + tv_y = pto.as_tensor( + tensor2d_type, + ptr=y_ptr, + shape=[batch, n_cols], + strides=[n_cols, c1], + ) + + for row in pto.range(row_start, row_end, c_tile_rows): + rows_this = s.min_u(c_tile_rows, row_end - row) + + # Load one column of x into the src tile (valid_col=1) + tb_src = pto.alloc_tile(tile_type, valid_row=rows_this, valid_col=c1) + sv_x = pto.slice_view( + subtensor_row_src, + source=tv_x, + offsets=[row, c0], + sizes=[rows_this, c1], + ) + pto.load(sv_x, tb_src) + + for col in pto.range(c0, n_cols, c_tile_cols): + cols_this = s.min_u(c_tile_cols, n_cols - col) + + tb_dst = pto.alloc_tile( + tile_type, valid_row=rows_this, valid_col=cols_this + ) + tile.row_expand(tb_src, tb_dst) + + sv_y = pto.slice_view( + subtensor_dst, + source=tv_y, + offsets=[row, col], + 
sizes=[rows_this, cols_this], + ) + pto.store(tb_dst, sv_y) + + return _kernel + + +# Fused row-expand ops: dst[i,j] = src0[i,j] op src1[0,i] +# src1 is a row-vector tile (valid_row=1, valid_col=rows_this) +# so src1[0,i] = x[row+i] per the hardware op convention. +_ROW_EXPAND_FUSED_OPS = { + "expand_mul": tile.row_expand_mul, + "expand_sub": tile.row_expand_sub, + "expand_div": tile.row_expand_div, +} + + +def _build_row_expand_fused(kind, dtype="fp32"): + """ + Fused row-expand: apply element-wise op between Y[i,j] and x[i]. + + Semantics: + expand_mul: Y[i,j] *= x[i] + expand_sub: Y[i,j] -= x[i] + expand_div: Y[i,j] /= x[i] + + src1 tile is a scalar [1, 1]: src1[0,0] = x[row], one row at a time. + """ + row_op = _ROW_EXPAND_FUSED_OPS[kind] + _meta_data = lambda: meta_data_expand(dtype=dtype) + + @to_ir_module(meta_data=_meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + z_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile_cols = const(tile_cols) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + bid = s.index_cast(pto.get_block_idx()) + num_cores = s.index_cast(pto.get_block_num()) + + rows_per_core = s.ceil_div(batch, num_cores) + row_start = bid * rows_per_core + row_end = s.min_u(row_start + rows_per_core, batch) + + # y[batch, n_cols] - input matrix (src0) + tv_y = pto.as_tensor( + tensor2d_type, + ptr=y_ptr, + shape=[batch, n_cols], + strides=[n_cols, c1], + ) + # z[batch, n_cols] - output matrix (dst) + tv_z = pto.as_tensor( + tensor2d_type, + ptr=z_ptr, + shape=[batch, n_cols], + strides=[n_cols, c1], + ) + # x as column vector [batch, 1]: x[row] stored at tv_x[row, 0] + tv_x = pto.as_tensor( + tensor2d_type, + ptr=x_ptr, + shape=[batch, c1], + strides=[c1, c1], + ) + + # Process one row at a time so tb_src1 always has rows_this=1, + # making src1[0,0] = x[row] unambiguous for both row/col conventions. 
+ for row in pto.range(row_start, row_end, c1): + # Load scalar x[row] into a [1, 1] tile: src1[0,0] = x[row] + tb_src1 = pto.alloc_tile(tile_type, valid_row=c1, valid_col=c1) + sv_x = pto.slice_view( + subtensor_row_src, + source=tv_x, + offsets=[row, c0], + sizes=[c1, c1], + ) + pto.load(sv_x, tb_src1) + + for col in pto.range(c0, n_cols, c_tile_cols): + cols_this = s.min_u(c_tile_cols, n_cols - col) + + sv_y = pto.slice_view( + subtensor_dst, + source=tv_y, + offsets=[row, col], + sizes=[c1, cols_this], + ) + sv_z = pto.slice_view( + subtensor_dst, + source=tv_z, + offsets=[row, col], + sizes=[c1, cols_this], + ) + + # src0 = one row of Y, src1 = scalar x[row], dst = one row of Z + tb_src0 = pto.alloc_tile(tile_type, valid_row=c1, valid_col=cols_this) + pto.load(sv_y, tb_src0) + + tb_dst = pto.alloc_tile(tile_type, valid_row=c1, valid_col=cols_this) + row_op(tb_src0, tb_src1, tb_dst) + + pto.store(tb_dst, sv_z) + + return _kernel + + +def build_row_expand_mul(dtype="fp32"): + return _build_row_expand_fused("expand_mul", dtype=dtype) + + +def build_row_expand_sub(dtype="fp32"): + return _build_row_expand_fused("expand_sub", dtype=dtype) + + +def build_row_expand_div(dtype="fp32"): + return _build_row_expand_fused("expand_div", dtype=dtype) + + +if __name__ == "__main__": + import argparse + + _MODES = [ + "colexpand", + "rowexpand", + "rowexpand_mul", + "rowexpand_sub", + "rowexpand_div", + ] + + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=_MODES, default="colexpand") + parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") + args = parser.parse_args() + + builders = { + "colexpand": build_col_expand, + "rowexpand": build_row_expand, + "rowexpand_mul": build_row_expand_mul, + "rowexpand_sub": build_row_expand_sub, + "rowexpand_div": build_row_expand_div, + } + + print(builders[args.mode](dtype=args.dtype)) diff --git a/tests/npu/expand_dynamic_multicore/gen_ir.py b/tests/npu/expand_dynamic_multicore/gen_ir.py new 
file mode 100644 index 00000000..d63ae353 --- /dev/null +++ b/tests/npu/expand_dynamic_multicore/gen_ir.py @@ -0,0 +1,39 @@ +"""Print MLIR IR for dynamic multicore col/row expand kernels. + +Usage: + python gen_ir.py --mode colexpand|rowexpand|rowexpand_mul|rowexpand_sub|rowexpand_div +""" + +import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from expand_builder import ( + build_col_expand, + build_row_expand, + build_row_expand_div, + build_row_expand_mul, + build_row_expand_sub, +) + +_BUILDERS = { + "colexpand": build_col_expand, + "rowexpand": build_row_expand, + "rowexpand_mul": build_row_expand_mul, + "rowexpand_sub": build_row_expand_sub, + "rowexpand_div": build_row_expand_div, +} + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", + choices=list(_BUILDERS.keys()), + default="colexpand", + ) + parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") + args = parser.parse_args() + + print(_BUILDERS[args.mode](dtype=args.dtype)) diff --git a/tests/npu/expand_dynamic_multicore/test_expand.py b/tests/npu/expand_dynamic_multicore/test_expand.py new file mode 100644 index 00000000..4700490a --- /dev/null +++ b/tests/npu/expand_dynamic_multicore/test_expand.py @@ -0,0 +1,182 @@ +import ctypes +import os +import subprocess + +import pytest +import torch + +from ptodsl.test_util import get_test_device + +torch.manual_seed(0) + +_DIR = os.path.dirname(os.path.abspath(__file__)) +_DEVICE = get_test_device() +_BLOCK_DIM = 24 + +_KERNELS = [ + "colexpand", + "rowexpand", + "rowexpand_mul", + "rowexpand_sub", + "rowexpand_div", +] + +_LIB_PATHS = { + name: os.path.join(_DIR, f"{name}_lib.so") + for name in _KERNELS +} + +_SHAPES = [ + (1, 1), + (2, 3), + (3, 2), + (7, 7), + (15, 17), + (17, 15), + (31, 32), + (32, 31), + (32, 32), + (33, 33), + (31, 33), + (33, 31), + (64, 32), + (32, 64), + (63, 64), + (64, 63), + (65, 33), + (65, 64), + (29, 257), 
+ (127, 129), +] + +_SHAPE_IDS = [f"batch{batch}-cols{n_cols}" for batch, n_cols in _SHAPES] + + +@pytest.fixture(scope="session") +def compiled_kernels(): + subprocess.check_call(["bash", os.path.join(_DIR, "compile.sh")], cwd=_DIR) + yield + for path in _LIB_PATHS.values(): + if os.path.exists(path): + os.remove(path) + + +_FUSED_KERNELS = {"rowexpand_mul", "rowexpand_sub", "rowexpand_div"} + + +def _load_kernel(name): + lib = ctypes.CDLL(_LIB_PATHS[name]) + fn = getattr(lib, f"call_{name}") + if name in _FUSED_KERNELS: + # fused: (blockDim, stream, x, y, z, batch, n_cols) + fn.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_uint32, + ctypes.c_uint32, + ] + else: + fn.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_uint32, + ctypes.c_uint32, + ] + fn.restype = None + return fn + + +def _make_inputs(name, batch, n_cols, device): + if name == "colexpand": + src = torch.randn(n_cols, device=device, dtype=torch.float32) + dst = torch.zeros((batch, n_cols), device=device, dtype=torch.float32) + return src, dst, None + if name == "rowexpand": + src = torch.randn(batch, device=device, dtype=torch.float32) + dst = torch.zeros((batch, n_cols), device=device, dtype=torch.float32) + return src, dst, None + if name == "rowexpand_div": + # avoid division by zero: keep x away from 0 + x = torch.empty(batch, device=device, dtype=torch.float32).uniform_(0.5, 1.5) + y = torch.randn(batch, n_cols, device=device, dtype=torch.float32) + z = torch.zeros((batch, n_cols), device=device, dtype=torch.float32) + return x, y, z + # rowexpand_mul, rowexpand_sub + x = torch.randn(batch, device=device, dtype=torch.float32) + y = torch.randn(batch, n_cols, device=device, dtype=torch.float32) + z = torch.zeros((batch, n_cols), device=device, dtype=torch.float32) + return x, y, z + + +def _reference(name, x, y): + if name == "colexpand": + return 
x.float().unsqueeze(0).expand_as(y) + if name == "rowexpand": + return x.float().unsqueeze(1).expand_as(y) + if name == "rowexpand_mul": + return y.float() * x.float().unsqueeze(1) + if name == "rowexpand_sub": + return y.float() - x.float().unsqueeze(1) + if name == "rowexpand_div": + return y.float() / x.float().unsqueeze(1) + raise ValueError(f"Unknown kernel: {name}") + + +def _tolerances(name): + if name in {"colexpand", "rowexpand"}: + return {"atol": 0, "rtol": 0} + return {"atol": 1e-4, "rtol": 1e-4} + + +@pytest.mark.parametrize("name", _KERNELS) +def test_build_kernel(compiled_kernels, name): + assert os.path.exists(_LIB_PATHS[name]) + + +@pytest.mark.require_npu +@pytest.mark.parametrize("name", _KERNELS) +@pytest.mark.parametrize("batch, n_cols", _SHAPES, ids=_SHAPE_IDS) +def test_kernel_precision(compiled_kernels, name, batch, n_cols): + import torch_npu # noqa: F401 + + torch.npu.set_device(_DEVICE) + + fn = _load_kernel(name) + + x, y, z = _make_inputs(name, batch, n_cols, _DEVICE) + dst_ref = _reference(name, x, y) + + stream_ptr = torch.npu.current_stream()._as_parameter_ + if name in _FUSED_KERNELS: + fn( + ctypes.c_uint32(_BLOCK_DIM), + stream_ptr, + ctypes.c_void_p(x.data_ptr()), + ctypes.c_void_p(y.data_ptr()), + ctypes.c_void_p(z.data_ptr()), + ctypes.c_uint32(batch), + ctypes.c_uint32(n_cols), + ) + out = z + else: + fn( + ctypes.c_uint32(_BLOCK_DIM), + stream_ptr, + ctypes.c_void_p(x.data_ptr()), + ctypes.c_void_p(y.data_ptr()), + ctypes.c_uint32(batch), + ctypes.c_uint32(n_cols), + ) + out = y + torch.npu.synchronize() + + torch.testing.assert_close(out, dst_ref, **_tolerances(name)) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) From 0956c8eaea38ebf8bef6ece70dc31fac8f0b24d3 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Wed, 11 Mar 2026 15:28:15 +0000 Subject: [PATCH 25/53] reduce number of tests for expand --- tests/npu/expand_dynamic_multicore/test_expand.py | 10 ---------- 1 file changed, 10 deletions(-) diff 
--git a/tests/npu/expand_dynamic_multicore/test_expand.py b/tests/npu/expand_dynamic_multicore/test_expand.py index 4700490a..5cd0d30a 100644 --- a/tests/npu/expand_dynamic_multicore/test_expand.py +++ b/tests/npu/expand_dynamic_multicore/test_expand.py @@ -28,22 +28,12 @@ _SHAPES = [ (1, 1), - (2, 3), - (3, 2), (7, 7), (15, 17), - (17, 15), - (31, 32), - (32, 31), - (32, 32), - (33, 33), (31, 33), (33, 31), (64, 32), (32, 64), - (63, 64), - (64, 63), - (65, 33), (65, 64), (29, 257), (127, 129), From 36cd417e230a84a80870f564de27539ccc7c4a75 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Fri, 13 Mar 2026 19:23:34 +0100 Subject: [PATCH 26/53] Manually polished matmul optimization guide (#82) * ignore artifacts * remove unused script * rename script * remove _256 suffix * rename * rename in guide * improve numpy emulation * better size print, update guide * Manual draft of optimization guide and figures * move early version to experimental dir * move up dir * rename dir to matmul optimization guide * link to optim guide * 90% finish * remove old guide * consistent title * font * move * remove copy * Note on frontend * msprof link * font * swizzle explain * smaller img * plot FLOPs * unify figure path * fix figure suffix * larger font * remark on manual sync * ignore artifacts * add FLOPs plots * fix typo * fix typo * fix typo * fix single-buffer pipeline figure * global grammer and typo clean-up * more explain on swizzle * move figures * typo * update frontend note * typo --------- Co-authored-by: jiawei_zhuang --- .../aot/matmul_optimization_guide/.gitignore | 1 + .../README.md | 14 +- .../bench_matmul.py | 114 ++++ .../caller.cpp | 0 .../common_utils.py | 20 +- .../compile.sh | 0 .../experimental}/README.md | 0 .../experimental}/bench_matmul.py | 0 .../experimental}/caller.cpp | 0 .../experimental}/compile.sh | 0 .../experimental}/matmul_builder.py | 0 .../experimental}/run_matmul.py | 0 .../fig/cachehit_N16384.png 
| Bin 0 -> 59260 bytes .../fig/cachehit_N16384_swizzle.png | Bin 0 -> 58588 bytes .../fig/cachehit_N4096.png | Bin 0 -> 59806 bytes .../fig/flops_step1_baseline.png | Bin 0 -> 90408 bytes .../fig/flops_step2_doublebuf.png | Bin 0 -> 95227 bytes .../fig/flops_step3_swizzle.png | Bin 0 -> 98623 bytes .../fig/flops_step4_manual_pipeline.png | Bin 0 -> 100320 bytes .../fig/pipeline_N1024_baseline.png | Bin 0 -> 71181 bytes .../fig/pipeline_N1024_doublebuf.png | Bin 0 -> 73000 bytes .../matmul_optim_guide.md | 337 ++++++++++++ .../run_matmul.py} | 0 .../step1_baseline.py | 10 +- .../step1_baseline_numpy_sim.py | 111 ++++ .../step2_doublebuffer.py | 10 +- .../step3_swizzle.py | 10 +- .../step3_swizzle_numpy_sim.py | 0 .../step4_manual_pipelining.py | 10 +- examples/aot/matmul_swizzle/.gitignore | 3 - .../step_by_step_guide/.gitignore | 4 - .../step_by_step_guide/optimization_guide.md | 489 ------------------ .../simple_matmul_builder.py | 26 - .../single_buffer_matmul.py | 9 - .../step_by_step_guide/step1_numpy_sim.py | 92 ---- 35 files changed, 600 insertions(+), 660 deletions(-) create mode 100644 examples/aot/matmul_optimization_guide/.gitignore rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/README.md (54%) rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/bench_matmul.py (70%) rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/caller.cpp (100%) rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/common_utils.py (73%) rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/compile.sh (100%) rename examples/aot/{matmul_swizzle => matmul_optimization_guide/experimental}/README.md (100%) rename examples/aot/{matmul_swizzle => matmul_optimization_guide/experimental}/bench_matmul.py (100%) rename examples/aot/{matmul_swizzle => matmul_optimization_guide/experimental}/caller.cpp (100%) rename 
examples/aot/{matmul_swizzle => matmul_optimization_guide/experimental}/compile.sh (100%) rename examples/aot/{matmul_swizzle => matmul_optimization_guide/experimental}/matmul_builder.py (100%) rename examples/aot/{matmul_swizzle => matmul_optimization_guide/experimental}/run_matmul.py (100%) create mode 100644 examples/aot/matmul_optimization_guide/fig/cachehit_N16384.png create mode 100644 examples/aot/matmul_optimization_guide/fig/cachehit_N16384_swizzle.png create mode 100644 examples/aot/matmul_optimization_guide/fig/cachehit_N4096.png create mode 100644 examples/aot/matmul_optimization_guide/fig/flops_step1_baseline.png create mode 100644 examples/aot/matmul_optimization_guide/fig/flops_step2_doublebuf.png create mode 100644 examples/aot/matmul_optimization_guide/fig/flops_step3_swizzle.png create mode 100644 examples/aot/matmul_optimization_guide/fig/flops_step4_manual_pipeline.png create mode 100644 examples/aot/matmul_optimization_guide/fig/pipeline_N1024_baseline.png create mode 100644 examples/aot/matmul_optimization_guide/fig/pipeline_N1024_doublebuf.png create mode 100644 examples/aot/matmul_optimization_guide/matmul_optim_guide.md rename examples/aot/{matmul_swizzle/step_by_step_guide/run_simple_matmul.py => matmul_optimization_guide/run_matmul.py} (100%) rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/step1_baseline.py (94%) create mode 100644 examples/aot/matmul_optimization_guide/step1_baseline_numpy_sim.py rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/step2_doublebuffer.py (93%) rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/step3_swizzle.py (94%) rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/step3_swizzle_numpy_sim.py (100%) rename examples/aot/{matmul_swizzle/step_by_step_guide => matmul_optimization_guide}/step4_manual_pipelining.py (95%) delete mode 100644 
examples/aot/matmul_swizzle/.gitignore delete mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/.gitignore delete mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/optimization_guide.md delete mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/simple_matmul_builder.py delete mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/single_buffer_matmul.py delete mode 100644 examples/aot/matmul_swizzle/step_by_step_guide/step1_numpy_sim.py diff --git a/examples/aot/matmul_optimization_guide/.gitignore b/examples/aot/matmul_optimization_guide/.gitignore new file mode 100644 index 00000000..2672482f --- /dev/null +++ b/examples/aot/matmul_optimization_guide/.gitignore @@ -0,0 +1 @@ +build_artifacts diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/README.md b/examples/aot/matmul_optimization_guide/README.md similarity index 54% rename from examples/aot/matmul_swizzle/step_by_step_guide/README.md rename to examples/aot/matmul_optimization_guide/README.md index 425fdc42..d3b92e57 100644 --- a/examples/aot/matmul_swizzle/step_by_step_guide/README.md +++ b/examples/aot/matmul_optimization_guide/README.md @@ -1,3 +1,5 @@ +See [matmul_optim_guide.md](./matmul_optim_guide.md) for a step-by-step algorithm walkthrough. 
+ Usage: ```bash @@ -5,13 +7,13 @@ Usage: bash ./compile.sh # Run correctness on all steps (default) -python ./run_simple_matmul.py +python ./run_matmul.py # Or run one specific tutorial step -python ./run_simple_matmul.py --variant step1-baseline -python ./run_simple_matmul.py --variant step2-doublebuffer -python ./run_simple_matmul.py --variant step3-swizzle -python ./run_simple_matmul.py --variant step4-manual-pipelining +python ./run_matmul.py --variant step1-baseline +python ./run_matmul.py --variant step2-doublebuffer +python ./run_matmul.py --variant step3-swizzle +python ./run_matmul.py --variant step4-manual-pipelining # Stepwise benchmark comparisons: # Step1: double-buffer vs single-buffer (both non-swizzle, auto-sync) @@ -19,5 +21,3 @@ python ./run_simple_matmul.py --variant step4-manual-pipelining # Step3: manual-sync vs auto-sync (both double-buffer, swizzle) python ./bench_matmul.py ``` - -See `optimization_guide.md` for full step-by-step build and benchmark commands. diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/bench_matmul.py b/examples/aot/matmul_optimization_guide/bench_matmul.py similarity index 70% rename from examples/aot/matmul_swizzle/step_by_step_guide/bench_matmul.py rename to examples/aot/matmul_optimization_guide/bench_matmul.py index 1aa569c3..82fdc0ba 100644 --- a/examples/aot/matmul_swizzle/step_by_step_guide/bench_matmul.py +++ b/examples/aot/matmul_optimization_guide/bench_matmul.py @@ -18,6 +18,8 @@ ] N_WARMUP = 5 N_REPEAT = 20 +PLOT_SHAPES_NK = [(8192, 8192), (16384, 16384)] +DEFAULT_PLOT_DIR = Path("fig") def torch_to_ctypes(tensor): @@ -83,6 +85,85 @@ def _time_us(fn, a_list, b_list, warmup, repeat): return start.elapsed_time(end) * 1000.0 / repeat +def _maybe_plot(rows, plot_dir): + try: + import matplotlib.pyplot as plt + except ImportError: + print("matplotlib not installed; skipping plot generation.") + return + + style_candidates = ("seaborn-v0_8-whitegrid", "seaborn-whitegrid") + for style_name in 
style_candidates: + try: + plt.style.use(style_name) + break + except OSError: + continue + + plt.rcParams["figure.facecolor"] = "white" + plt.rcParams["axes.facecolor"] = "white" + plot_dir.mkdir(parents=True, exist_ok=True) + title_scale = 1.5 + axis_label_scale = 1.5 + legend_scale = 2.0 + + step_defs = [ + ("step1", "single_auto_noswizzle_tflops", "Step1 Kernel", "flops_step1_baseline.png"), + ("step2", "double_auto_noswizzle_tflops", "Step2 Kernel", "flops_step2_doublebuf.png"), + ("step3", "double_auto_swizzle_tflops", "Step3 Kernel", "flops_step3_swizzle.png"), + ("step4", "double_manual_swizzle_tflops", "Step4 Kernel", "flops_step4_manual_pipeline.png"), + ] + + for _, custom_key, custom_label, out_name in step_defs: + fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True) + for ax, (n, k) in zip(axes, PLOT_SHAPES_NK): + base_title_size = ax.title.get_size() + base_label_size = ax.xaxis.label.get_size() + chunk = [r for r in rows if r["n"] == n and r["k"] == k] + if not chunk: + ax.set_title(f"TFLOPS vs M (N={n}, K={k})", fontsize=base_title_size * title_scale) + ax.text(0.5, 0.5, "No data", transform=ax.transAxes, ha="center", va="center") + ax.set_xlabel("M", fontsize=base_label_size * axis_label_scale) + ax.set_ylabel("TFLOPS", fontsize=base_label_size * axis_label_scale) + ax.grid(alpha=0.25) + continue + + chunk = sorted(chunk, key=lambda r: r["m"]) + m_values = [r["m"] for r in chunk] + matmul_tflops = [r["torch_matmul_tflops"] for r in chunk] + custom_tflops = [r[custom_key] for r in chunk] + + ax.plot( + m_values, + matmul_tflops, + marker="x", + linestyle="--", + color="#111111", + label="torch.matmul", + ) + ax.plot( + m_values, + custom_tflops, + marker="o", + linestyle="-", + color="#1f77b4", + label=custom_label, + ) + ax.set_title(f"TFLOPS vs M (N={n}, K={k})", fontsize=base_title_size * title_scale) + ax.set_xlabel("M", fontsize=base_label_size * axis_label_scale) + ax.set_ylabel("TFLOPS", fontsize=base_label_size * axis_label_scale) + 
ax.set_xlim(left=0) + ax.set_ylim(bottom=0) + ax.grid(alpha=0.25) + ax.legend(fontsize=8 * legend_scale) + + plt.tight_layout() + out = plot_dir / out_name + plt.savefig(out, dpi=160, format="png") + plt.close(fig) + print(f"Saved plot: {out}") + + def _parse_args(): parser = argparse.ArgumentParser( description="Stepwise performance benchmark for buffering, swizzle, and manual sync." @@ -129,6 +210,12 @@ def _parse_args(): default=N_REPEAT, help=f"Timed iterations (default: {N_REPEAT}).", ) + parser.add_argument( + "--plot-dir", + type=str, + default=str(DEFAULT_PLOT_DIR), + help=f"Plot output directory (default: {DEFAULT_PLOT_DIR}).", + ) return parser.parse_args() @@ -165,6 +252,9 @@ def main(): raise FileNotFoundError( f"Single-buffer auto-sync non-swizzle library not found: {single_auto_noswizzle_lib}" ) + plot_dir = Path(args.plot_dir) + if not plot_dir.is_absolute(): + plot_dir = base_dir / plot_dir device = get_test_device() torch.npu.set_device(device) @@ -179,6 +269,7 @@ def main(): ratios_step1_double_vs_single_noswizzle = [] ratios_step2_swizzle_vs_noswizzle = [] ratios_step3_manual_vs_auto_swizzle = [] + plot_rows = [] print(f"double-buffer auto-sync swizzle lib: {double_auto_swizzle_lib}") print(f"double-buffer auto-sync non-swizzle lib: {double_auto_noswizzle_lib}") print(f"double-buffer manual-sync swizzle lib: {double_manual_swizzle_lib}") @@ -204,12 +295,20 @@ def main(): single_auto_noswizzle_us = _time_us( single_auto_noswizzle_mm, a_list, b_list, args.warmup, args.repeat ) + torch_matmul_us = _time_us( + lambda a, b: torch.matmul(a, b.transpose(0, 1)), + a_list, + b_list, + args.warmup, + args.repeat, + ) flops = 2.0 * m * n * k double_auto_swizzle_tflops = flops / double_auto_swizzle_us / 1e6 double_auto_noswizzle_tflops = flops / double_auto_noswizzle_us / 1e6 double_manual_swizzle_tflops = flops / double_manual_swizzle_us / 1e6 single_auto_noswizzle_tflops = flops / single_auto_noswizzle_us / 1e6 + torch_matmul_tflops = flops / 
torch_matmul_us / 1e6 # Step 1: buffering effect (double-buffer vs single-buffer, both non-swizzle auto-sync). step1_double_vs_single = double_auto_noswizzle_tflops / single_auto_noswizzle_tflops @@ -221,6 +320,18 @@ def main(): ratios_step1_double_vs_single_noswizzle.append(step1_double_vs_single) ratios_step2_swizzle_vs_noswizzle.append(step2_swizzle_vs_noswizzle) ratios_step3_manual_vs_auto_swizzle.append(step3_manual_vs_auto) + plot_rows.append( + { + "m": m, + "n": n, + "k": k, + "torch_matmul_tflops": torch_matmul_tflops, + "single_auto_noswizzle_tflops": single_auto_noswizzle_tflops, + "double_auto_noswizzle_tflops": double_auto_noswizzle_tflops, + "double_auto_swizzle_tflops": double_auto_swizzle_tflops, + "double_manual_swizzle_tflops": double_manual_swizzle_tflops, + } + ) print( f"(M,N,K)=({m},{n},{k}) " @@ -228,6 +339,7 @@ def main(): f"double_noswizzle_auto={double_auto_noswizzle_tflops:.3f}TF, " f"double_swizzle_auto={double_auto_swizzle_tflops:.3f}TF, " f"double_swizzle_manual={double_manual_swizzle_tflops:.3f}TF, " + f"torch_matmul={torch_matmul_tflops:.3f}TF, " f"step1_ratio(double_noswizzle_auto/single_noswizzle)={step1_double_vs_single:.3f}x, " f"step2_ratio(double_swizzle_auto/double_noswizzle_auto)={step2_swizzle_vs_noswizzle:.3f}x, " f"step3_ratio(double_swizzle_manual/double_swizzle_auto)={step3_manual_vs_auto:.3f}x" @@ -258,6 +370,8 @@ def main(): print(f"min FLOP ratio(double_swizzle_manual/double_swizzle_auto): {min_step3:.3f}x") print(f"max FLOP ratio(double_swizzle_manual/double_swizzle_auto): {max_step3:.3f}x") + _maybe_plot(plot_rows, plot_dir) + if __name__ == "__main__": main() diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/caller.cpp b/examples/aot/matmul_optimization_guide/caller.cpp similarity index 100% rename from examples/aot/matmul_swizzle/step_by_step_guide/caller.cpp rename to examples/aot/matmul_optimization_guide/caller.cpp diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/common_utils.py 
b/examples/aot/matmul_optimization_guide/common_utils.py similarity index 73% rename from examples/aot/matmul_swizzle/step_by_step_guide/common_utils.py rename to examples/aot/matmul_optimization_guide/common_utils.py index d1c2d6b1..0b152cae 100644 --- a/examples/aot/matmul_swizzle/step_by_step_guide/common_utils.py +++ b/examples/aot/matmul_optimization_guide/common_utils.py @@ -20,31 +20,31 @@ def meta_data(): tv_2d = pto.TensorType(rank=2, dtype=dtype) tile_view_a = pto.SubTensorType(shape=[M_TILE, K_DTILE], dtype=dtype) - tile_view_b_256 = pto.SubTensorType(shape=[K_TILE, N_FULL], dtype=dtype) - tile_view_c_256 = pto.SubTensorType(shape=[M_TILE, N_FULL], dtype=dtype) + tile_view_b = pto.SubTensorType(shape=[K_TILE, N_FULL], dtype=dtype) + tile_view_c = pto.SubTensorType(shape=[M_TILE, N_FULL], dtype=dtype) b_l1_cfg = pto.TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=512) tile_buf_a_l1 = pto.TileBufType(shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT") - tile_buf_b_l1_256 = pto.TileBufType( + tile_buf_b_l1 = pto.TileBufType( shape=[K_TILE, N_FULL], dtype=dtype, memory_space="MAT", config=b_l1_cfg ) tile_buf_a_l0 = pto.TileBufType(shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT") - tile_buf_b_l0_256 = pto.TileBufType(shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT") - tile_buf_c_256 = pto.TileBufType(shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC") + tile_buf_b_l0 = pto.TileBufType(shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT") + tile_buf_c = pto.TileBufType(shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC") return { "ptr_type": ptr_type, "i32": i32, "tv_2d": tv_2d, "tile_view_a": tile_view_a, - "tile_view_b_256": tile_view_b_256, - "tile_view_c_256": tile_view_c_256, + "tile_view_b": tile_view_b, + "tile_view_c": tile_view_c, "tile_buf_a_l1": tile_buf_a_l1, - "tile_buf_b_l1_256": tile_buf_b_l1_256, + "tile_buf_b_l1": tile_buf_b_l1, "tile_buf_a_l0": tile_buf_a_l0, - 
"tile_buf_b_l0_256": tile_buf_b_l0_256, - "tile_buf_c_256": tile_buf_c_256, + "tile_buf_b_l0": tile_buf_b_l0, + "tile_buf_c": tile_buf_c, } return meta_data diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/compile.sh b/examples/aot/matmul_optimization_guide/compile.sh similarity index 100% rename from examples/aot/matmul_swizzle/step_by_step_guide/compile.sh rename to examples/aot/matmul_optimization_guide/compile.sh diff --git a/examples/aot/matmul_swizzle/README.md b/examples/aot/matmul_optimization_guide/experimental/README.md similarity index 100% rename from examples/aot/matmul_swizzle/README.md rename to examples/aot/matmul_optimization_guide/experimental/README.md diff --git a/examples/aot/matmul_swizzle/bench_matmul.py b/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py similarity index 100% rename from examples/aot/matmul_swizzle/bench_matmul.py rename to examples/aot/matmul_optimization_guide/experimental/bench_matmul.py diff --git a/examples/aot/matmul_swizzle/caller.cpp b/examples/aot/matmul_optimization_guide/experimental/caller.cpp similarity index 100% rename from examples/aot/matmul_swizzle/caller.cpp rename to examples/aot/matmul_optimization_guide/experimental/caller.cpp diff --git a/examples/aot/matmul_swizzle/compile.sh b/examples/aot/matmul_optimization_guide/experimental/compile.sh similarity index 100% rename from examples/aot/matmul_swizzle/compile.sh rename to examples/aot/matmul_optimization_guide/experimental/compile.sh diff --git a/examples/aot/matmul_swizzle/matmul_builder.py b/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py similarity index 100% rename from examples/aot/matmul_swizzle/matmul_builder.py rename to examples/aot/matmul_optimization_guide/experimental/matmul_builder.py diff --git a/examples/aot/matmul_swizzle/run_matmul.py b/examples/aot/matmul_optimization_guide/experimental/run_matmul.py similarity index 100% rename from examples/aot/matmul_swizzle/run_matmul.py rename to 
examples/aot/matmul_optimization_guide/experimental/run_matmul.py diff --git a/examples/aot/matmul_optimization_guide/fig/cachehit_N16384.png b/examples/aot/matmul_optimization_guide/fig/cachehit_N16384.png new file mode 100644 index 0000000000000000000000000000000000000000..cddf5c6a27257022f51753a299aa0f4c81982fba GIT binary patch literal 59260 zcmeFZWmHuC7Y2&RC=Mbhr68%&Eg?uFp|mh`NJ)2hD5x|l-HgC6bPXvfAV|m1B}#WU z_Y8>tSKs&E`{Azp;jVT4FfL~1ob!wQd-mSXe)hbRmzBT<6NAyv(6FCMK6#FY1|ma4 zyYvKX*)nw`gb-XiuLAD>-W~jTwb$4tVVDd(F1Euzp1g z$9u|3!>0HtJHk-y7NqdAdW%`jy5eGqj#Yo8mS?H5OOm>q;XrQESc!q!ExPBCcccS& z-s({c`%<9!K52TvPMwjsGQRnEk`$Zx1KRbMmQG%wqlqyqBfE)*BLsXslfyF%@u zp?B$6AbA4&`+AueRrrGy>^Bnsn1PQUZrNbXRTf_7K61QFgC1g%h3u+qbIJ}MkF#}n ztx93Uvq*ls_~GYcE`h`wHgfaB&#!&aoKsy9X^xx`4{X$v>c8JSLZgx#t}1qKeqMp;)D(T*W<>s!FKa!C)$ga%bmezu@P~MchkGFEU z#Z?12`l6wsAyO7k{>_h`F6@8Ip84sd-YHqt+$7ug?@dZ-t>O|A7S_oV2L}ht^BB(T zC~FWZ1I5P0F(pc>_4tKyV5H;8H|n$BDZQj}XQaQs<<0wEN*nTjSPh8w*I?&tC3d}X zk!(>#($v#d9H@u-d?)z&vS{Up?ZIr4f9)2!B{AVKOWe1O%DOF)(SJ-?NEocSH1R4M zQ!y+mP8PKZ|FQ&J%S*n^);i>_Z~o&$Z{P;Fe1ty?p}qgQ5OuTH8s3v^qXTcoDgW!% z@?n5|fn0{>IR5=u;QLdA6wcRow^Pp7{qOtZe?qI%7uM$D4gSw@m4PtH8azDaNdNQJ zAUdHn)v&1G|F2D1h>!7eh&WUk0x^8|q@F5VQnf-%a07#yn%dmh_-oHR2Nxz|wDe1Q8PdXlYaa5EhpC{LQO0%Sb`yN z>3YO<{y>+Fmt6XnZGaenhbYvEGOAf7Ack)km8x6iR=*GqET6tcQu@TbKK6T3EL_P1 zlkuYvdrWb)aUf~CMImx2)^C{U&<33W%IPSUJpzws!~P3ueH!s!u$MIU&8YtM!1F&p zz6C7uou;uY-e1?9{Rv!5;jAfb5#sls_WBMJmoM_G-welpVy&t)a0vh3dhi#iAACGA zXRv{V`_@38@Y|=+hg4kQw=2Aj3hDknKSsd8a(QqDvF^;cY@m>J`?Lp5W{d&~mU!zo z0cS$~*WwU85L$CqFz34lECVg|C89?&sf6wKnb?#qAeR?&^j<_5-oxXPcN)yDi0+86 z6aELp0~e?=q9*=+<`HmHAxThb%txR9FsHCRkdIN-U`z#HzsdL(;e zaJyMh))~VVB>htUoRlx&&+>VXatdAnMncIETubO68 z7^OvQGnMW+M=XwwHe_}(Kl)(ZkjHQ|^rY$eiW*&Dh!NBJz6a$anF?GgALn80{((h< zdd5g*r@Vx69V5yG0j=K)Dnb!fB<zpH z*+<5s-)S6G4S8P@oAM`hDbV1{si;|WI%ds}{pe(I-CD9+)J#@OXn(yCKi}5_enNFK zkkm2n;XttXXYo-pzyD2+)ywA^5icUgZQL*P-lo?r4JuqcU@LKT?pC0_q!}Re#ZVyp zUA9=0)^(0V>y&9f>@4L*D0#rLa`{d>q#%<{sUn|0!Gr9yrWgZXvG;%B6tlRu!aX8r 
zlU+Rk;}Np)8${dbaCGWKbV0?!|=`WdhnD3Gu`)`t?1RO+&@ zzzW^I?)XDFALZ)wzx{2#H!75S+mee~t9&d2eG?KJNRa+dtw>lF0`9Y|%$9FD&Y<hPoEPfk84eB9g85%pU?z~cx%>64CR6LV}@8?VUtf6U5#9C-- zi;1(5jsF_FbvQ71qHE14Bk<9?`Bm&8SK>m6ec^;S{cyIPvtQh4LF3O5g>Owwfj>ET zaQ=F{81Q&a1M3^8_~kW(f~ZBEnr(?NWxvfIE~~0nHJszpp88Ptv-TF&8BSlkU{(TX zKA4Alj0)x>g@gjTf}#@CeDB#w{&j!7o51uU8CqOWc7a8oEM^=%Z6o}2=*}O=DX)j# zee1D~EYZdN{|u=Kg}nos#e@J)jJiu84eq-qj1kX<{27oME-o=*Pz92^cMI`oWQV)f z@_*X)s@Nj?`c&-{q5VjtIwRrch=aYOx|>F2aBz~A0Zojws-0cYNNx)%a@JQ^TwE^N z3AwT8pz}A{)ery)M?dgDObU4=^cI62WVW`&X@l44RbNUlc zl}+n($tZ?Yz_u^ZWrxS@IJlHCu6<@TGU*R5=kp#I2jh@b=Q~ue&`6i>*lv2*wCdF# zLiX75^uS86sGxwY@}$CSS?;uXb@+6n=43mlZaN&=oSpNz!E&2}ZgW!-{L0iHN^o|TNe&gHKL(YH>Ap0*wi_)XgGkE_}G20BYwVpBIqBR`4YDa6Hi71dh`B4b)N>``u-}cBejj`W3-HH z4TW9`ugRA%fw_Sj*RAIB?HAY0y`+0doNzKsnF}{|oJr;pxE7Y_ZCpIO|N2x(X zUyiXZ40(3XFE1A9)a+E$jM?@EJ9hFr4XIQgJ$P(AIr*bnX=-puq*=qVx~22-zLlWI z;JBNY+#ciH&5O0heSYcKGCet`I;HNGr9zD)BXLh6vvz5!e#FbmcLLMfh*dZ=m>nEH zwtwHA+#sWyO=o>pNf+emT46!O&$hVwBdU~U$)f-We26sSr_Z2OlM8TfjHdmjVj(Sk zLvmf)EpU2#l7)pOX{(8#z;8dC7PN?{O}nGp>@(ojz`U>i%H90 z!}UGKC5SHu=v{k+p)?QcDsX8qCCQOf;}vd%F27ZOncB|yT|h{#xQxE*JKS0Nlb!N_ z-Hy}DLw^3vlLU9qJX!>}2!N^Ryc-^8oI@GKCCEd#>3VWp)Oze#rG@zx6Ny90i=wy@ zg@$~&B!T5wk3N>lhzJ5pfv?X>-#b*|5D+knk7cZ?2Z_sU3M;Fq3_oeY9&(&YQ>Tw8 zCb2B*Kve9LdmR)d9d;=k#^nTT_{w?*UmRYaTew}TS}g(WBqp>)xrWzBWNz{$eqItD zjHazn5gte5lg{P!ET+)M~^u$-{9_Ia_swe9$^zv6uWP8c_Y{jOAKByUa?Z#a|^(toW=GbDgcYbSh~i>re}@KZKw1U6+dKI z2930Q$k~a3 zdx$^%5deQ@*ZW`rSxc1=E7w1%my0_C5Lph4`~QFR`C45p%m3|yQwZ!*oBLdZm}i@Q z{(_77WRyExvM)9Kt&G9LP2YE7(DYeDJ07pN9Ft1eh}W0p2}YwB^N!=wp`?c+C(hS> z(g17Kq<8s~#yZ1N5M9{Y#vCp+T?fMm$4`0s9cyf4!9#HP6>_D-J0ykU-6?9F8i!Vl zS%_Gr@?gUrBwL>mbY)S~=V$igpTa=I1DM6NE}r|pA?jkh__;zB@cLY{p%>gagLH=b=FY<(o|e_xNHsh=98M?6PRVZlUJ#8#I$2+CodF2J540A@-WvXbg&WiQEwbQd^s{g zV;Kq2FgKvGxLP4MRJ#Yud!H>ES-C2q*{JWBE@~r)o&wnV^`C|*ihnrFv*>6JeS}zL zD8#>i$8tp1ZLZeJ+*xmQMZ2Fy`ds6v%g*4(49kpG_)7B9V70vsl+V9PJ9s96ou)hqOBo?{9ph)8obfzbzUnU~R7dwQAMn}N 
zblhc0f9*b@&WppHWyf~Tfx!a(=A{N4!D7Ifj1-%E7;{_wypwl&koVAWZYItO(VwMq zI7=&SZfUtwmjcJtah?L{fHEl*Q=lt{Q#f98qB5WeFfT-F5Q&`0x>2*0#Nr_@iJX^S@@Q=0Sj+A7cy~F*C~>`6u_mu#f&OrBIy5IQ zuhR;8jC140FrFOW)T^cvvw`8#>TRaOX0Ia&lH0e}nuiJv11DBmXp*)r%^-6$EE@c= z5q7h(kDb?&S6R-T6p%I3I$<@^D&1`ls+{UGAreX?g|NOSb0P@RPmjngkzu@50)G+{ z(fA(8HMmGm>Qy-@T@Yg}kVeD0c^#H!RCnh$e;B1DuvFbiSV&Xr^5gdi@e$Seeg{Jar!Ey2WjZi5Ya?lhgKC4dJZK}YJDxI8FEhxZo3ik zcxBB8ig1}ta5C>q9iIvMhSoBLD$qTI{ThEiw@F<7_L+^oM|-Y)2zpPz+?+ zbW<>ihkWLVDap3NRB8xx8DG=%Z5cg%3HHTY@w@2z50YN8wRj3dyz5Y0hx*G&jH>YG z61pdREjLl|HOvjX>1RjJ104SMl|SNHe_<;l;PD%qn#8lRv$rBV;kZhWWH=fpMnzg$ zqYB*&_gKaPFtcY*+ryn_+U}fc`h{X+V}GcF(%*c{%X@?X?{e^|3r`gqemi2r=gZZp z8NV7jEIVQhL-?x>Adv^Sigi##_uA69HaACGJAdXeyc2xCB)CINhd|ZyU_PUoC3Zw% z#4L$7F(hrWAUC(e3fkQ71kqmDBPPtX{_qPH*uci!w?QG-REA(9LvOno=Vu(G8}U5n z9P-;?S#SH|L$)*NbsjtwO;!cyg2128Y{`CsD6Z=zY+dE9bo#4)c6B1maOXaGV~{=g zY6jPFHeARr6pWU^Je*3|G5p3?1mxpdO=zJ|n~~m(FHeuA^guuVt5iOwbbfSjy%5O1 z&XG6K--1E%U^AO_ff1=kT zd=5S{xLukd_hQ&;h#j=l(hjYThA`8q!V8}{k=Qrnycx@1l*;zIzUEQM;QXkkg2)(NW!~-sfyxvASQN8zCW3s5b`s_RM;bLO@={gRt0oLt7jTc zrBsrfm=VyDyr2Li7-UL@dh#VOdJ*gzWko}%zuMw{9qVjz&-24;aN5nR2mM6Yr^@|$ z^8K(p;((k(2QqkMp9-mHs0sOks65jh^Ni^${qe#M)8@04l2N`5=YffJ3p)H+wsl~G zd5(^uM?}Mi!(QBljwyjfCxXyJH`$?9_?5#Mq{b0n2(h}yH#id_y`sg_!5~z_d1eXS ztO0KtQFHi_-{FVH;qVNJg{Vc~;r=Iwa)Mk=@ydeiVZmK%ol4Tx)rue0MVB!MrGUtz z;1kb{U#H_^mA7yC%8I36QmV2eGX5R-8o^qo$vUA0ltONop-<5}BW!hRzE{A$PlPSj zO`)?p-v~PmDWm&oMxko8`jC1ypwF{7wPeP+ubM#5^lj11yFSei>S*G}3Xd{fXdJR% zIG2R1*IvevzVQaRUd-#jngl(_v?vJ+Cj6j)2A5Z?%kY(IC(o=^u;Vd}E>Zr1&YJfsoU}5V!oFnFK$4-xo^_ivJq1#~|!*0VNTaJSo z??jbjq=!nFRLW#V-M$np3YN za`bZ22~r5w#_W4#JlXE}`GePG6Zz&Z3qkPc@%ubWCcj~Gk4`cWmjvtpK6kNHQE_dr zl_76$ac<|na{i9W^vqBJCGoCMsc4hyH5VuSJIq*0qRE6-lbnqxB$P+au z#f4{RlHB!Js`pYS!~@%52hV4>*PF>Z-BzA?3H{0#DR0h^lN6N7dM#MlaWb)Q0^z7v zT$TyJk|lT5K^op+TgQo!OK!&6BlHwh`E>hdirp_DmN!QSUnVH?I6a(5((G1)c!4oq zr6;d_4Zk-VUqMemKu_;|edd`GEJ}y%HXnC%oI*2}ayjw_=cu}%;K3YJpiId-^AKM^ zmUJh3O(Ehv5U^}c!4TaZ#Lr1NKv1~xQHIw6O^r;xx}!WN03#`rU@{%AV~3hxk>?3G 
zf#2fo-tAwyNReoH0T-BB`6(Xv*CBtcMuE9@aL_@j%~{e3-m{Mz-s1E7pm@1;zQe9! zcE*YbBY1yzmRv67rDuN3>0Q52xPN>OopHR}gwe>=(EqXB_=!TrYZcO=7ZkJuV7`_FivCZC zUgiGEY=vyDDeY+MZYNX*E`EAMay|({kz^+QA&rsI#6qkB?j0Z-rX-l0_6yU#{dXnr zjGp!uqQsJ4SN*CITKy{^6cWZX(AP(PKC%FKf2`CR`&(yg=ktv&O!`3%q7l19R;j!} z9>;+vXNVhJiV0B9^r?7HUtHad^@HjTNhYsF%${|35(CZKGtj8jL@(d4ElQ97c9&r^ z9V67vW&IJlA7ky~s<1pVkN@OAC_Vu;MfsW4{?GEk1%D!fDt`RPl9>2!o5}<*OOoaK zn8-i&`?n1whoNk0f1w-24*&ZLBJ_Yw^~rxj%)xD*BK)lDerW2jNvu{ahi-wp7oIEwJn~#*k^&vbawb z2_M^Du^Z4DNK)oSaoct|Jc@rmB2x2(o?iVG*Fz>3__yb;env{-tDqz|ISBYLRruTB zppapJ`tyxz)?+g>HD&i81(J30$vwI%*PAV1;dY?%EHgBKHY4i*0f1wCad3}#% zz#4kq*WXnV!!UBdy-hL>$;WjU3|CJ6>MK(*BvDpPj8`*do~z;^3e@ zP=J)jgSXlMiLj0nJ(;LSCSjG`|B3eL1=l4%x#Q#GB8%r?p>*b`R52g`%)r1<1mswY zi;G25Q&S-Rq>T+*MenS1*1R#lI1Y?@T6sQlhSv&F0{_FRi{sIPNo%cH^fEZlRq26| zmR16j1|r*_BMDE8LI*fVI= z0D6J*lD~S?Fu|<*O`+r;7#v^pw-@?PdXZ8wa%qKyJ$mb}gM%~1G7#+#X1`wRWD1j} z&aQyB>aFOvyY9M5W%Sw!W_~TD)tN9h6gBzA`yuwAeT9Kq%(I2(Gu5rRr6rCrW56(K z-9hltRpH>^V9#g=Tz2?I0Rn+!!RSRIxp??os}BtxV}OAgb3s=g#WCvS%;3^G)lJ9x zbK03VHega%?_>Dp+kE@^7{g5*%?Cs@SZYNMXvVSt%~*g6#ck!Bg{Grc@%H92(|-EJ z`S}8c(pCrNCrDAC$2U<^XxSO!HrB;@3~{UK2sQnIX?PY%D9d357`CF)`s zVymGy{PKC6J!;GK+k{if$=!Bp>C*T^9j=v=F6&b$B~aT0ZLQvJplopSn7ae$&$?*u zG0{U6#P?-RbBz-vup5wX!`{7ncOM`dw6u*%92uT-6doOH7{{f?`}(xacPkjKLw zji35-@Z!D7K=?OCJr9>XAE=`|yfq~{n2>-EzQI;AFB<%WYATCt`%r-`cP7EB<0wdo zLTK%)8On>&+WMF?FEWsAOUY7yD4#3p7XT7zph`wUcBg#qcV| zMRZd=boFG8dc110BJ{6Z`G14RH%15Jhms3=j;|M2Y^Te_N$a?|x#fKRd|YMG)ZE-z z8TuHPRvZ|=UKKnZ?_c8w%2QKQy1f<^%G%nNivxMPb^0nQQKWp%))nn#I6m)jUxYNj zFKdhcH@k~kh(HFR#_8{_c4tz;bFbucQb>o+x;pQrzgj=Z4D(h)8aK_{@5gpA6f?A;n5s9X2>P&R3bMy zyarH__AnwlyG>kNd}+uiX?tYBjT2IAK7wesEStJgqgvb56-FxHKB(=!-q>PMye4E} zk>j!dO$PyRC8zO7q_W$5TGSCPAE}Pa$kJdzgJspaxK|aBDvp#|C4F9RB5Auc<%8x0cdvWpWsvLH0;@LsG@L!= z1@8*w6I4`riyW8LM?Sli;*iP#DO7_vC>;Frr5ShyVLtTzRfB$W$o_sw?&lqK6hSq# zIJGh$U{(H;u!cBOZO!U}Hj5`HS ztByCKYPJrQ-j)NHPL6+N(>arB10aL*CCFU9XnXD(lFP%B92|@qi~tf7UvU2ip4&oZ 
z#?d~EegmmtwLL;x+&93GTUP%jw&tKe@2GlbX=wT|WkCk=wCA^D6hW2mK4ppN{ED|I zl+ep1kv%P*L;~rW-h&2)4c|J-8+(ni#+2^3oJ_Yz-nGa#Hy~xT< z00|o21$35zA!I5rSo#ZvkpvX?C(vYqNq&|7 z4+|d2Va&x=PS7BM@7!z@=Lq*u=Wd9(c>-Xm1q7s?C z?_Hd~aFZ6pCq7s?5m8V`HZW8)HR$e|lWzl?i8j3TifF59f2r~|LgDT0>uEPcBod;~ zd5-xMF}WS~w)*9!GZN1P*2Mx5|&j*Na*ntMV@#-3{%MfM(!XJ{si*$%5k4fN$?6zhp5gg z0T5*>Vqmj4p#yT*>$yGpBCfon^ojP3c@57hXjoD zj7qgcZI?6mZxKw=jrMN(Hv5+2Iq0w&Xiz2-I(D;h9;qE73plgY8;V|04ue74pJ!eu zIcIj|mz4DHDujaSJ`uvYkHUeS?@`&7+I$4h1M?HuFYxe+v+`*)4}g*2C;d*?1;{Xq zoJi8^y8=3aZof^y8kKi!R&+63m;kGfJ$wMCTtK7lk= zs;U@+pc!PrVR3X}ig{~sb#)i3iB?DnR_3~G>V-87;LpX2%d56i4JD>3)LCLE06d-|&k z1cX9hAqs6Yk54G==UE6LGza1^nl2+T<$peAN^m)g>3zb<03(yg!{WPlzU;4So@o)G2P$1|T1IiNuulelf!fH;_qO=Q9Wic}6 z$xDX|Dm4pPR7q|tJZ?Y%<7gT#q=dV7wAncV&{i+KHo3i~8=gRz>5Kd5k*4tHM~y6= zx)Xa%+)E=9Cx|=|CByIcLizO0if6R(eVd+uKRxht-;A5s>U`|Jmk;oKZq=;{GKcO* z>&?rLC-y^d9ud5TQ*B5H9)wS5%joQ`PFSHBs^g)g;~1S$@&2gcP?2w$fj{?X{&&#B zu%u5(Y`#90+5NroYuCyMa>qp2L^&mL%wdgsL(z+R_w(X-YA}Q8?_DFEt@Z_f$H#!f z<%JVOB(bGd=DNamieyB?PB#TlH{LqZNb8P-1i&ei;=fne5HCFlet3IR4* z$QZ=YrjFHQT-{taWeCWbI{|x1S`X|hOF}#&QYwv!6%q4$OB~6sp(wKxZQOOk1R z*{}R0x;ia29{Dy_^Oxo1HFz=Yb0`LOagkb=Ab&#}DpNd>GDi?ND+mdNzOSLIP9nDH|*iOgU7>=Eg}u3x4|D|QJoXmX+s@wuv~{Nph#elHg| zwm5-)3SWBOjh}EyG$kq)K8>#o>NWVWYPHZ=w^@S2uhZEpBNIw@1QscobOD#+w}Lj; zI}oQLF471U9gh+eoi5<6BpE$nKd#=jtTAyA$^ATH8aSVB9qtWvAMe-fw_CYCNX_+| zQLR2XSUlDSR%p{J!0zeIni z*FlEYLXyNM7nPu_8h-4K41r{`OSlSz;qE%e{(&lvsw(6U1i zPKvbw?=eB)s*eepUlrV|b9t;(_9*~?3OQA}L3VQ|0OIUSsi5Riu$A&z zGICscq25$|mZ~*hkBt_Z*=3f~8~i$f5hmn7J?@U;qjY)22nQU2y=05gvTi|{x=#p& zy>!qQ&y&rh+11<^UckA^6f~d-Y>(qka$<%H^l{3kO^13N>DP3rd6qG7Lxkwu}1fnLLG1x4tzmdhdFw4@%k1n)>8q23nh()V?A290cma_)5{4$+o$Z3L{#8 zF$+kez|R9T-Ir2p)DgcP7(s`t*vmS>0)**~duzHiFjY2A2t7PjzAllFH_q}=vANUA z==>Y`37{}7kgW8A5eG<#BZ7ui^21^g{hWPvD75wpPB(LKa1fAGTD$!^u!8JB0Or`g z3%*+-2YDp0R(lx%1jTMZq4}uZuDBjmc#Jm_llv9L0-XrO-a&-#?~r7bn7g;J@Io7n zekC>^-+R~J18LOh={|(iXA-ZYirW#v@MJ}?M3I6_ix)3y-@LET(fuSs*DeU#M>jH4 
z-+R*U1~*lrKv+!(kjH0Q3HNrQ^b&%mv^#*f^>}TZ-)(0PBo~l5UqwLGfyBqiI#3#q zwNu!!?Xx!aiv&}=T7yNdgRWv6&_~`x zU+MNu*gz|wYUs$Sjd#B_@%kCSX(>g$>bUTf6v;Zg+kx|aIs#ZSG`DOZ7@1t$eY_Se zrYeiKWeA4p_D-&NamXHDPN9%{ecrCcSmBNSAV=tE;~UE%gwxoKo!N7dzbCM`5#Y0x`_Ae%@~n<|+l!x80m_ z?CsH`^Z-8W!+x(_{N&7y!oFC~)AyP6gHhV9Eh=wXmlt(TD{$N@D=Ml|5Q=lK07>i* z%91=KRJOd_(kp1u?NzTjtLd5dWRw9ZP6)2Q_65G?7}3xA&Ghhp-M2HujL z`ww|r^n*slq~d`JLD}zgyP12lcwdR`etE5-N)0s)j*oZ{H|l~vdS~y3(U^5cc%AN| zomYIj(r5W-Bk{a5i(oGGMyJM(gVbhFmi2Oi-UFHkF8aP6TphpXU{W3TbupckM-GFz z=xo@2$nUhKRyD^vK`Br&l{FYW59u_{$t2E^9i(J;AuI17rE>pJ853SkOYXMBHu2`U zc^#`c#A{)fT}z@p$NBr0Yl8OLl)15t4rBO|0_$J!b^sYlA`#|-E9el@S8rL4L2q$* z(Fto=Vj}C-!@XoX)97pq4)RaR<49{4>=7Dg#vm2yI(=PD8rOH{uJh?NT6=S26U2c7 zgGDm+3_S>IdrPb5cF^~YlxeFs_5?k5AVCH>aDAb)&=Z{I%m_?Bd$Mz#6h%Dx!D_Xy zc>Ba2l>GRs3Kg9FImCYAaC~;9bPBLZxb{-I;Q4@TP7*=CQ;gXT zhcYKf=FcYb< zqs4$&XHKe*EIP1jrJ0`C?EeeJ(lO^gEEyZdU~{Bpe`Z^l5r1t&2Qs~4r_7m%P|`4M zBA|~d^twxe!o1{s5I*C4`XpU6*7F8u0%?;(VaNce-zqj^JyOGRT{HutmC+ye%1`#? z^R8;h(~NxSy6BxW9`q92060m;l~Cd<0H{GChpUwY&3mf?Y=;0%5e|7tS4h3!KHXBk z3aG5GCzYQaDR1r<2&aIg%{}boo6ZZa2kO98Jvtig1whH~<--2NeuvXhiNa(SWk1;e z6~b-Tl3WsAp+FGoN(%dKlkRPozSwyz#Q@Y;icjud6dclTc9CNch7LQ^>ou%I+T4pY zk)jE+Hv9gO26}>n@hhLjVoJ_`CJaTVg%HzkJy+~XtEG#sWP_FUg8RWR{Gd!%OVi#} zTzIZxNj!Y958v0Mrf&58aFRc6*3Z1EyBMMB6lwS~T*&WiV06PzYplk+PJZsOaf)^f z^tb!AyFGGpa>iu_Bt-MP0AF>OvRG%&=$b3iG--tW4mlbMwtaQT;gl{i_99VoSuSu7 zcED!rnz)lU@G3FGEbAE7nPzQ4O$k0C0jhH~-uC2wv7bI7gz~UYXbKR2Fac+qI^_rW zO@}1H+X|{MQwi))q_qY}8T`dwF*$8gy1|LyktxWG2^ZQWts6^7t>6GnbOK_y1CMzl zKYcJFoeMzAYV=%+`wRvnG}j`io`|oadymr7yJZobU~0T7;`g@eiFqn4U!*slMkVa{;wn; zh~upc!Gw4@d0jo$tpgGuMKEB6t;yS6r$+-sh-(bb84UXB*?1%2b4sL{LJC<_S@X`s z@EmU&(_VWPFaxAt1ze|p;xK81L8uN0Hd4ff{%RZoMi5VvjJpO@uy;l6H{<-*=^q_c z)*M#0?7t>BqJv%cN6Xrf#j2*g8_&D@y>FR=aW~}+*+%cHIp;b@_2MTHq42j#STi2i z{cWxuy8vOy@#t&Cq__zcPoQ3~4=@QG69G*Bs;W&=f$%p#S(LM@BB6BG<1HaSUTCym z#+iG=bD;%@7NpTX@$_}Mytz7%KegY61xvQeZ8Y83ZSTaSU@fZMCx6&h#T5F>KV2;5 z?>{$hC4aZkmNx(0B8K{;ks_HgHdD#eR%|45_S4}j%4GFA$Y$M7ZtQub#t47E8JzC6 
ziHoimS_c&G>;}oAjPo}Cdk$nXEVA|f^uB@yaS@VvZ+Vx<|WN~T9 zVCvnbec9!6M%E6c7#I6JEZ|l`Eto+-;^H!n&nR13>sK#P0*BMr@UHZwgC#&AjJyi$ zEJU*SsH-P$fug6^Z(VZX{S{fB-3C9zn=nt_da$>5`f*OO%>vV-50dw7$Mw~%MLK&` z{MCp$I5RTNhAc$`oOJu0yML6c34v^VeWx!|(GxVb&jkurw}_%dZW8DN^#xS%vJDt6 z!5_nYdd*lj`o-5R_XY{U13ocoSY^^CLnXmk?LzH<~m2>$0m4UiOy2%(WnEKW&D;h8m4k{{~s z7B11hspTaLLVfRr4Zs8uHOD~z7~ES7Ac0?7%g=nT5ZK<<#~u?z9+|UGn&}w-^xfgP z)kSLqcJE5G^2Y$Fh~#0gXX1VV4|AU9SQO&|$t`4G@>kCEpJofR{S>PP%~-};=1O2U zlL56(f>8A^|BF^iA|Um2*3~w@pnlFaK5 zmSe8tT!4o8WP;11tiDEncHO#}3g|mxzQyt8cS9;FQLzTk;n*^fLB$NVBs@8Y6RuNb zI$odUgsNbE3=jh3%Q;$=NUHF!JQ>+u8NQPWrd2S*= zQF|^_9nv>vM#qtmLHNGADnuAJ6J*%>?VHrp)-ik>tOhxLDhfOmb8doSY|E+uPQKaD zoE!z{t5IqCGHl;$J0)Ozf&(KG2vcn`fABE>uvIT^+_kt0HA z5bKQzvKtiG5D7F9D7Crd#9Gm5kTfq>q-q5 z45#Aip7hr=>P%8$dL3mZT%pAFz62g}$u3Qy?P@7S>Jo;J-_0(*A{yEFd;$jLy?rAi zxNE#C2jr2db~DhN<~NVwCOmlyuzHqDp&7pz4U~<@2cgGuGpeh-yeva!W)-B4Q1X>< zV!OBEWy%v+ewwFYQXE?s4++j->W_?7$yXqzkAHwJ#R3@Bi-N4k$Vih}+qTs-f7RWs zt-Z{9#+ZyHfDt~|xfBxmKM)oAis3FM!)H33`4+oUBi(+Zlujj;!$0aOO}w?@<4784 zb!q@$`BOLLw^L`#`o62EkEz@7bSQ7bmN~z~Mr1eGVM~3cd9XEPn5kph@wY<;h&UM` zENwdJzN+9NCbYCJ1<|6)%F5$ZcAx?e)J|{vX9% zFvry#%dq zs;(ATTv&*8yuFh~8g6vY+62h0qh^|m{^df^e8g^cv6^CMSelu|-F=?Zp9`p>r;tjI znOhAfDTsh|biSl&{pIpYGOL&&XNX)M>`(PqHJ>75gvl!LQUN(*;bw~)@7eE+9KtKS9UzTmz z91xE9`}-fYP82s1OzK26ZqJuXP(b4MO8}C4lU4c-(!FbfiSTQp=|C0HwJ2!=h=Q z0t@=SYAOEgr(yt^7D}T)$o<;&xWZwlJe+EuU>H5HA8==xgA)t`UHdTb8ixwXbiN1+BZtyf)|#cXb3_ zT75na0V<}0oRxWrGj~S+Y&fQ1g?OhRtH^%EFMxtL<6%sYNv?!Uf6k>o8o3gnd$gMP z{n=A-O)uqa6Fc<2fEn6aKJDd+O>cWErW6%!H!?dD&wFnH!GAdMCf+dSrTz^qvNKOY z3%Nv3CbivaH-0wUBJf>7oBG0`Y~Y9>ffk!1^1Q{dU1Cd*iX0E*>;?sDF=~6TJ~rYZ zstP-lmD^VvQUjb@k;>dd#7+!Ooe4Q~_U4Rvh@X}dkd}=huAgk@=jWL^(7gV>`Y4|7 z><-fCuU?7nthlXKCbQ{!tqYzKNrh(QWLJ#>ZP%GVr>4e3{^0WvN`$KGhi~2Efu&+4 zM%~lS@mZs#2(lW7e&tI2ZO1Grh&l&x$xe#4+x`~;x1#MZ8EHi(wHV1^a?4X+94;N+ zlW=~!t;;%h`jPdi&@^_G;Ca;I}={X%B6I+?cW$P7DO+&U$Kh;@Q4VP?pU3eUa9OV3lJY3$f zzgrTGMdmP3!bFm7WZ$*pl*Bu`-=aFIt~ps&qMODU1+?*;$e!B8VKD$dGLtR@l+I6& 
z4MvZ1QztY~XeD$TI2sMkkhwGAh7c`A$JWkp%&yz=@vdNWk>D#)W@mdWK%fG|NBNk> zv2QCXcmYus7f?hOU74Ws$TPWbYy{$vm~Lm_((q7=I6nAs$?bcQ({Ud${N?C(-^N9C zu0$d}IK8dJH1XoymD2HU$}HDVN7a(3FFvr5N* zIU&Ymq4GE~J%Bt_*3ntbDgY!Ymi)O?Zua(lDiSwkqk)dteQ$;SfK`OojK5Tpmg>w{ zNCP4lF@)gEjn{l39VL#1WXytOY`_1gL^I}&_gN!oMP4jg2heDG4` z0W*M*%QLCkvl)K#b94ECpkfFphv#|sUAa%Ge%wsq?I8^hpeh7FR6esYx^jr-~yDM!BT1?-d&$0<@xVyZVzY^~A^kgMmm|TSG zBSeFWxYQmA2``tx>nO}j9Y!KKh9tC_mxD$u?T*1C$o$sjtl^4Lfte!T9T6R)9hG79 zgD_n{M~Cd`(E}2Xlj9tU=MiVA$BRs0Kcf!7)~FwixyV+Sp+dL<^xjH)urLwfPv`+a z6PXSrB*0FnHH}^2WiE4@_(Kzt&J)Lws=D=?Byp0JU2HE6Eorx(9GLsfEi8!FB)5%s zOmndcnBKFD0~$n_?XwHwlr0fPp`LRqBsD=QAGGdSS9@!Ji|vH0uFuT8?B&2hwX&_Q zBM@99ckbv0_-ziAUA=@s_!J0b!;^WgpJN?he`tUlHkmS}f-br%Cq2EXBc5lPfzpAY zWPge(cR@iWB1thz5HbdfLnqz!`mh`@Y=}rfG~U=nXh7qL3fU7kgs0cOM2{hC<38Kk z3IZWs%~z?tJ=f2s3Up)i3=A@&1>0l;ZW^n0?U&U8$k{)xN%<;DP@wc#DP`B~Q$ z?|8I!86V{zgva5@EY7p{d#r2ecfhPcsg^%qf^LDSiAhL0Gd=sLI9#g+<9wSGh=@(c zn_g>V@~(8aeBbnQYSp{8Yu9!hUr?Nc(<_~2VD@%<=HMstCt+l{?!Gv8Ht+0;oGO&0 z@N<9r)l5k4o3X0U>6ji3v8q8G7C%SN18-r1j@aoEpvkRsbGX0%Cqw)zF=Oy@{2U8F zoTLNgc3o|5*YlhUDCS0r6(aAps2OqT8I6+fV1%XA7!*0O-U?P_kJ(~?71D2HE2X%| z518emDG)q|yDK^L!68}HL@5f8kNy^fcJAqvNTVkIQQXC19Gt9C%~16;^&Bq9)EhON za$E(1;Mi3gG00~eLbEYAawJYU$u1!a?`LGYp+!ce&Y=>k+S}k=7hNq*zranGP_ymD z;bHYtvZM;kV1J{<2{aUuX3YawAWH`G;dug){2CNHM!w)wKB(uyXLe}E=EhG_1TBXJ z?{Vsx_yN%f#DXNxya3RxL~i-s_p=|vze}$gE%#05wYXjVwpdfnnBEGhM*Maw+*!#% zZ?X1{R5@4z8|vw`^>(&rSAqNbp|e*M4r#w5XHEf|O3R)Vd5vi&`H0(%MW7A0(yg7j z^5xL~LEc+KGpnxEU4ndJl=}=mbIHc0u-O}BC=v1UTq&o%a?v(EC25G$e_`cuw z#{I_q{Q;M;hvP_`v-eta%{kXIpZTn(Jfn;1$z^+Gh z{k25U_7|w63{Gb2z1K6^_GjB){#-jNXULrkB>S7CYZZN-i$r}hg{D)H^(&_tC}jLj zI*4H$!`~W9C2E|NCiDFwl^fv%uk+1ILVNNFj5T*io|%4gXi5&V%|k)U<3Ou8a!?BK z-<5OEzI^uG@{;3juY3Oic?!w-5ks?LUWKRG@iy8!$-;@F6e!&uI2Ng0H57IiFz$l~ zF?gcpbTqG))$Bzgi1yxDCXNJIz8I$uymBSeDnU6mF9%84q+YJVLz9Yc3(!njBjMo> zZn^9l+-+6va6nWI6XeI}e(Vv(g=2j%k$tS9*dogT@e9iw6uu>z$1q-?zn6OM?tTp( zwT3Btzl58R-P=G!q^t35@qXcckqAw9ceh2>TcgD8J=b@f;RIi`Yzjqm3K#n)bu^tP 
zxwQ@mckAk8ygvrrz17rp8APjMI$zx=p|4XNM2PaY>e_opBzCAPQ&i*;ew{g;^rX_< zrd1u$eV9j!NUTW1Ji)Pspi*lya+Z_BW64Rc#sA86rfjtS0ZO<=%bwDIb8fclFgJ1v zMT1_XjJfm8DSkUXy-8u6|D7WM+tmm548-5g1oQ>sLzg%FH2UTIju+PvTyN76#tlX3 zZN*~N)Ve4uoe#sXU}}JO2(EYvuWP?_x>KyCRRx;*HG*=Pwk5tgH&B&e0Sgu3zIAeT z)^PvsF4k~myqRPaSMlL0jbSjr_-VJ8h4&Th^j$CpMjpqn^X;n|jH_AmJaqM|oeWEB z)y-dM^0e2LzfAk4eVxtFEPo7Elx)81p6D^5#ond?@4>)ic>!3;-Xr| zeP#aRBa61|TpT=rJA7=*_$K?x1*jMOQVWxqoEkay4{86L3CPM}zJhN5w9sz-es5s~ zkU%>l-jl`_P90^O-{#F;pm?lod3#JCL@yq>GlCToyes2dyVh~SLg>ajA& zUjW9S8FJc1!t*Sei(Hr!6|tY!^O_|c0Z|Q2c5CbB97BLe&s@;$L50jR_i;smbN=3G z=9890M*?DD-3s^%zNsMLTU|lD?4@OORnf^oC}yMk8k$@yBpdw6P-x|b!w+m~ubjdh zn!RMQM17vQ+Xd*YF(nINiaR(t&7H3IJd{fyL?Y8(-~lF!tRZl0pu{Gv)uyf#B^e_( zGab7`A)61Fae&+A4kzGHSY&)Bt;A*2SDneqM~H?Ql7fNjj|F~i9l&mI?ew|yERd*+2LD$7#>}M#Oj`XBnT%G zh)k=R^ls4Ix8|4DKEuR^e6LBWGU4yVZcu@-DSj4VPLmOC3q#ML0GKw6|gF9M!C(HV*`Ks0Iv zY655AKNBzj!;sCG+4k#Fe!)iUbk0Rwl|VEks=0!9>PzvDonR=?Zd&oJl-k7qk0rhX zpUZ@AKNt{+sP%Qd=V*Urdom((pM{7HxV3wDk1$=W*Q+ehlgvn*c)o$o0{!xVd== z%AAcVOO9Mz-Qt=gX98BXs*A5Mxt{q{@#*n4pv)q9EY);;u{2_Mp_5W2b?n*XJYA-9 zyj+!PKpxURO<5Oe)$Q4%3Pz*K8xz^|izAun?w`IK+(^}yIJFai`F#A&MU_3$z zx6RjG%(s|eGT-)q7djqeUJv^^cPBDZSJrdYea-iq%y)-4#56hAfB=Ch6m%RL8#Ai` zxXlOVI(JDJcmO&KC{@e@R0~u7hg})!Z7G1wTw`9=GqFlcxV zz3KrSL?&&4W0NLoD8qOeknvgV@DA_KOwnp_#=+gS_m0%Q*@6V7tu9DnZdR-Rj z%p#I_u-{zwaG=T8*%F9AMWatd!%2LxNLnm!lfyJB_%pc^E}t15l9BNIWm( zNI>}wW{Uv)01$e%`D9W3!~zh#lEI(pN8-EZf9EC6U{v(116!z@%lfu1DK99o6& zzKQovEKX_MFZ~WM-7WbnT%h6tR6i0~O@Ml=PR;WD7N^$>WH1h731E}JM@?kfk-){% zWQ-mI;usfgYfxGZkJtS-Km%!_`Le`%Htn6e1yFr%o>OgGg;D_PmfVVdc`Y}4p1-Z+ z)0B0|$G}(YyM|V=r%JWVft}Lnor;3vI<&j>*Mltog>N@N!QK&)@#}GYT?I&HeMta( zOo*1vpHg+cUD!J|lMyC3=$^z}A0q*<5``**EEXA&eeewkNpV$CL1Dcp`p1Pm3(g(1 z4Le$~>kIL(ftGs-_lL`A6I5b$#qUK$0~m`x>@Sp7jmtk&|14YIyKW_^v|EfzILh`JUjZczTk_1ISQj35XlD56y(#+7vLi};bD$1{{S=^ zNDZ&&`pD^Jw8JY2(h|zD@{Ede%(fF%!~#0{KXQ4h3lI!+eE7P`ZKA1i2}?Xj2(3sF0Qh zfNigums)Ny_4%aqvVIoOeNf`wTgFfle2V*HE(lp5csxOHP%)&{QKvS6_ls&!q 
z!9MMe3=qIHu#of5#8*2aIG1*QolLhg6?(}l^yaUSdGFXij<8mkqW77kxpp`IdN`i1 z8sXa)5+7RTs&V_EmqS+sEaJzu$H(JeOINGDh{Pv?U%@Cq^kqo#S#nR>mRoJkl^zAl zsT--E7wtV1jGo9R(^0cKAL#tL)VZrC!xbp3wEd_6Q;3lB_|y8=l-G$r380Q+J?cSE z&s-D53oQ}=dOKbObbo6LfMyU_qkEP3P6g$1^h!nj?|rkA0+r2)xs-&T`a;@Ub&qOw zBfIOxm4+Mu`elm^1U!7dm$!10C!KmmmIKCO6c@al(5&Q*Yz;u7(S-^P-1|ubJeyh_ z+gqq2O4uOHwZio`OsQXiJIjbfLcek?X3F*E7WbB)qE#s4<`$+ZJ^WD?xI5o-hSuv+57Zlky)MWIA3c5fymS zNXjZ1vJA@4n-Ys1X1BC>+g<1Sb?KfWS!Q!n)uaMqC4MjtAJeai0p}_uaLnD*^8@FM zp7e(UqFakrueWY7Kb_HHo@a?>{kY6@Mvs54x6FlV`O24{ncf5!t35wl zO_&cETseT$NR}5!7rX2kxu5+9^vE!TS9HwpqGIbw0)r3&r?uTn_k^?7#fDX{- zd;-8B0IBfq!Hfs3O40FaWLbbIjwFDL>8`?8hhF~Hfx$e6G8iuaCCuBaq{7eV{2mF1 zz3LI_LOGYmolxd4zv)b~=iNriC$~}{a`+?^xC{J;3!s0z)D-i5+@3A3=y7o@2p5_7 zk}QahMOxDxwrkrgs2n{~?1mx{8qE7unjGKawOI1&0b>36eLttCt43gieR z+H`{%WInC!$U99%R~cY_Pu&cRXBUD#bO{)QTq`d=_ge!bmdEAb0;RAy(W_lg{+f0* z5<(h9)I#b@!lCo)F)}^{{Eusa*%uH(J$#scDi~AB$7sG zJ4-t>h)hh7hYQR1ed1$1ml`%HEVimCoSJ?ZPuDY=7b7+S$WgP{ju_%I^EIPSs=T2; zC-uc4$DC%Kzz+Z5<*iA+cb!%~G?Yypz!aj87$|UL173K{zYrszsNT@S?v;cC1uZEl zQv|74#rw@?j#2n|lW+47Y#iR&kMsUurRTaHKqiav)-wr+h?oe63Fu+dV3Cf8PQi0O z7;xcXg@tVZau*T9SP%I*D}&xNGbzPAS2gCsEH!iza_D?*xEMN`9@Z@ELduq@C~$}s z91FHAwQFT1;ob?YUENWA5+BrPIS7W2d*Ul^oz~ao;&SPP)J1MrQm1r379AR>wpQ^y zyvIP_@WOGunwzp`GcI_kRdD|L@Bvi9G&CIJYtwol!*LOF=wPlR{XYK-Hy(QAC1)B* zBSjUF)jDHHb5_W8aT8-w)y() z$ya@el|wR#wMc>*36_68V|kTT|{q>>-1sLbqeM$5-pnW`c)W1Nu}Pz z)V?>Qm=U5z<=z5oV+r3hOS!jMlB4R&qq@j2!@4MOkl&NGUOje^4a?xp2vKxSt$eY{)J4+?UQLv9NJI0-?RH3M(&U-GC!uc78n zY6SYGWU-*upk8Pnw(*Pa9I<|Mk?ZGDL}R22i8aX)L9%r{fWhP%*VUS{v}aQB-u}ow zb?4#By9<;OcUp6j7XOYMI%4f2;8w%fUh9b-1zE$o$qPFk7@IQBeJ3oo@#VffY0)mj zu4$uC;N_I<>nUsNls9@yjt_%Vlkr8k&>=evRYHh)wKpRCuQ?4M{QaR+U<`cdvm z5C>U@!SZcbW@j%@tNE$!rSW;&FGY>R>UIiVB8O(%d8bZ`B*e;OEN_pWiCFb2d#6`lyGzZ*Ws1vcx7-xmJAY4) zV@EJr`{)E}WFna)NY%oZUwlEx!A&gp<>~hI#kO5Uvp_=arWW^cjA0O^4GBCA?-C+! 
z+6SofDv&0XuegsBdl$ zhNVlUt}Dk0E+i&aL{1A$+SAV10mo(@!bR}g{{kNn?W04l9JRxru3YG}v#12}Q+Nr^ z8zV;8F+Q)K-+Q(l_@nAgu0CbAy*U<-Vz;P6fP%NE9M1@dZz5`LH}Y+L7I;$%SF13Nf>$kDd$ej z_-wwmf3-&slYwzwwY-I$UGSXV2Bp#4*DzeHCw=eP(CpF5KR+>A@lf&K^_sL~4hX;x zd-(jHusKNLYsE;2Xf2ErD0I1Zv@-f-DGKRQu=B22j3QvAfJ;LN|9Q&3EQV-s3h21j zd_!0OWtcRb4Y`lBGf5O~7wpq+&`^xC{T^yC_GBwSdw6?!=kyPW_;Wv(+ahD7wzo&1 z_oLIV9=>jW&1aC&w2DH#@gE;^oqw=pvR~sKh!b$C73&}PaB0;jE+Z2KFONFv?zN;Zt)+L2eu~tVO>KbwQ5vifEGk_|C-6*_gJy@ZG zeZhZi1mpe8{8v}vur3}=+9QF2nd+>|4L#Bmm+*o*Ls`)Ut3}1$a0uldh~c1f)?FQ& z<3y~?(p$-0(qGX+*t5)pFU0z{q8#fa(zPsy;#)z zRuD#C;R3m<)1kH*-oY0Xd`4fE{b)CyW+i9QG=tUr%iz&{h-kvfwi8jZ zC|rR!Fhz8w$lJoukwg*D6{a8R)bhfGzyX=WE(fWVbzNw*NEIDd6^w#G{GXoik7Ovi z5`%mUjL7~@Ee&1^pf8~MKxzYP8k({tUaAP8#qKb<%MX`!sJ-Sj=^tv` zxVIX3XN-7Xj5sX4Su+p_c#qe;i;Ui%?3ibyAQy{8WM4h!$nrdnc=0$Yy!N}v+JYb) ztpG6i$P93KzXw!8NZ@#Mjsm;a@{KE@RwnrLKQXK5+7w)F28=x`nUC;EZy)4COkW@~ zGEFzz)f*nxm#LQjeW2$_FrDBHtNV=I@oJ$=JT$vE`pNw(r!G0C?TS)IZv)#o7WZ6{ zjwJvmhyv?XBpDyO#``-%g+t*BfXlJBJM(ExAbw!&2fx&n#^xWE`CAu+u7|=J+Q48x|I>DXWfK z$4cvE;x>w{$qR&^SDzkcY=mg_aSTx{#o&{#L#)UvU0yl25< z^W;vIo^3Fy;=NdTBCo9c3aACiyPxOYpQc`Q&z;bM|2;ef0zSl11?9*oPyTr16cEmf z7k$}LybMNPzm88#P*_?XevXf)*STF|T{~US*y+H}$u+Jyn6H`-j*O5R61C8*$`wWD zyI7gwZPYG1n5)ooI-H|LlajJK-(oDFVU7n_Qy}>F15OspuQWKU6;Izt?CtM=Ei2O@ zo@tS}vgD8=rO_uGtSZa`?dQeQ~7vJ_o&?(<~@J&SiEgbzZxxh05?zI?h`wxR) zp@xg4+j#d-PrcbcX?`|$ycDg|a<734+TR#8h!U66%j8l_N$#yJT&WYO%NCRAE%QFt zaJ`A04(~-A5luf;AC&)8Aqgn=noE11eqHWuxj&;?R||fK%MUg{B=M5%8s=T!yk0BQ zj#KfvYrvx6w4O8VlnD+fQZD^+xj*$@!#!vhW!=%ynP8+_iC!h)b)$9^mSj+1;0A_u z3kCDTE$#%h@E|}z#?yPlGf!_mUHs{CkKOflko#sctR>Dls?mC@+549^?rRh&oqw*- zKk^YE_0hxSLj1jUebt`%aH+~hGbbyq@OW+&U;Mo2Us;KED7?}K3ndm6(kTv}FHJBH z`}gSU>G`INj_!$5k}vKt&Q4F82)7sR(XC5y(Y-b9yWHobBt#$R?G4s!aOi@(7gB^| zGFOL*W&_HllZ58x)|_j4k!~b$Nhj8M2Z|%_K@lgVoi01aogp#uxjibS`5c~tstXnN z$w=Fxrj9f?Z-`S)ydWkf&7sL4`XF88%o>e5;B3LY-C25*i(6-l`Bwo52v?fS&VTud zKO9D@JXXfZH${MB{)fSOV)<}VbiP$p?P_!(*YgT2pPpYX%Q9@6Io)6-3WEsb6$FDVSzqq> 
zs2P)(2pPye3|g_@s(=Ddf(Mm-vBA{0l}59Y{I-V}f#!Q0hYNE|8ukJWKZ+^@ZtoBW z7xVynt{ZzJ0k}{iYT_S03hJhmK(a29v9T+>fBeK)Odx=N5|Z$iXLl{S<l!JHlFdM zu*rBw#%%fgUG_jAzK*zWT=-~xA|D*!93X#C`Ob-NXYA%ou#X(eknIM_tt&yA&b@e>qNZQ z52{(z@X|#@?NzU@R#$xr(kzD-SIf=iUN$Yklsf-Vu(Ke)_x)RX@Hl#^DC%gU{G+0$D4$sT0Bt?pPG*C>vvXA6R}dBkS$}UmsEs+-kIGJvoD%3j z9fy3N*x+X7oW5U}oi*z0bYg^mQdG=9@(j0#os;s`baEjK6DOPQA~Td-xOIFS+a5KG zl+L^-E4v4iV{vqpvVM-=j$NP6w8iQW|0N#&m$W55R&B20`d5y_+4Y3yg9&^kfm#{$ zL_zmn6v1n5pPpc!cROY=YfkSe(bph5g>g$m5v{*pzKUc3)}^HEcmBt>6WD#1mb4WX zU4J!uRst^e`A4-(uwFo@gO|bv0)jdzopL@)z3uwb%eRD=Ho&=sZ0&0XQM}|v>R;Us zlkj7rDs=3*YMmYMqI%U!BGFdu0ByShL$ooQo}ZRmGhYYdd$!z8$BmwCg@?}cZhkZC za9dxTnV2rh(k~PK5p#Oq6{gNOZw}vnR`~{~yH0W+$47e6=Fuy z!-r=r_dP9gGCi=|_D7AU()UxGEle`JrwLq>Pfn{QOg1lO79W*w`QW6*5y(R=0GUtz5!qG}oGp<+noe zcwNd{dVan9Ys-5g>$RA<`y2k@d}rugl^)HDD`aVjs!1CKq4HX>Z~oouUH)DOfD<};m$5G!bIJ-V$aW8 zC9qqg6z*h*;lw8Xgg5;L^o7LJL$Oq+tm#lL(Wq0{h3D0k<7zwA?&zZ&BLO$%YTHZ} zc~)%v7&eWL!s&TO6o0)IBM;LhT7v(wL%@mEMg*_hJ!H5BqpA8!{G@k@ zWETxm_IyB6jM*JwNhhQe=e%xs$l@HFGh_O1FBRPnpRoQqRwc;(-^=H}OUaiFiWy72 zh@5!!-|zkFg|Ff(;8*_t_bw~q6R!4i{~psnvY>Jr_yaW#+quIxZ^*HwIw^fgpo2dF zZ05Ka$vOVvss1yX;#6>$a9U15Toz8LQ^BFWwjrI{OrWUCyI`)?fD+zH&+$1r!7eEz(>iZR z56|CHu?*OcM%6L&VSleY=Sndq#xydxe)i$tHVFg`-Pvb8!OF1;G$Koyead7ga8t6BlucwU*Oyr2gq@X%NmwU@ zR<3TBOu$Axjw0g45qC|y3XP7Iw?Ge4f7o;+Q~iml!qWiabPJkd*~}l-oujAiq{3ps zL9zY)Uzd_r{A(v(c6Rfs)P6DEe(%dOUw(*={=wTtb#@60VQe_B-Nbi3&+?qAc){Hv zT$_jj6D11G04c|4$%A&1eIFg|y^5}EU=|q#TYLY7M{xeiJl_KUB91F>1T;m+?pjm) z#~%525ELh!KoYRI>)US5!xJ@>6?qM^sAr0LQ#TGr(stLzWpVye z+Ax5^T<@>!U3P!ACwEyG&Q#A5c(YaCcy{c}M-`=P9k)E`X$(K%a&ILgl!xq3Zy&bF zOmZDseS5cYsIjf+Qk)tG-SFS^H+WX&S(H@oItr~cv5(Sve{)n`GUKuy?jdygQv8{@ zWRV_2*(#b3m+D+82Vxq`(bC{%5l#i^3o7F5OSg2D+|GP(wNqSZqA8n7djqC8xQW2P zuecWP2%y^LVI<{9zyQGQ)^p5%cO_h;aq5`Q*@j`CqkS%NO{1Mxld8Os3 zxl8`X>;py(3aT{CT@*C)`Q3dKfJ_r#7UN-b>7D92pvlimLL#?%5~4J}VPVpAnj&dB z!_Lc4c2(0q7FX2m{#vC=wt8NtS$Wv(d-7gDh`E-AEK`vpOaS%nZls1?hJMp(MSZ^d z-FGUDP!;ja(3qVfDFpesGbqsLT0&hG`0soOu=h89FzsO@`OyL8+K}_-h183$aI5+& 
zQF30DSxCA-f`}TgIm{zPLE%hj{8o4eZ`M>(-Okhk2_gH6lS{hV}I@ zs)S#yYN9Qred~^JV-&+@>K`7TG+6EL^8~|p-dmwNBs7M#<&16@bThH8PV!MOd5t>s zX;KTfv)>q1YL&GJ^aPbB$r?SMhJ~#YLC^+;pOhCXuAT{Ox~-edvW!wCy&QMpy8S_M z*^xV7SMhxeVqM0VFUyp@Iyq82YXq(@D~vDHL6H1EMko@d5lIf`DY5+|-BU1W)nLg| zH0D7lyCI%-_}3*~+#@=r;_COwflsO3`M|Bh6<><6Jb$D!4yce}kdxTqOQ8aYmiHFado;D5Zr z<>cSbk9pCKAu~g=6|@er^z!O}Wi3loixXDN52wt+mRIgg4@pS`g>-Z5m2fA;SFp0! zsKlQp$Y)I}oE214U!9lZ~&bpIAF;7i<+2f6>w5VUnWLCAT&_a~rf6wutQq zQuy4nJsYe4xpJOi3*Z{&J(seL;0o`d;6*3n0zW4RT_|{q_WEpdpLzqA+e}ylC+8*| zqd9{-&27<4?iom%1!QG1zrUHcy_{;(xRDC77LDC6vvLu59GI<}WIb!3E%ftTZUWT_ z{f7%MXD6FB4QBJay%YF0w!v_3dH5dp$J#izN1ZC?p}>2zvv;$vFsSU99yh8ukLXt3 zG?Q!?cE+t=h__=VCJN>gDmqPai^vy)SIa}cVoHbc8ccMiQ3<6s`?IJBY)An;L{WpvcKv7W= z5VOJkSM~}0uRu3ezI>FB3pRPmOKpdFHXN7qP<0aX;WiYl2XGhT@O=>WAD@ABeN=x@_Hj=}*?1WFc>3;h`eBK=i`t_4?tb z>z6M`5?yN!PtRR7&)>FFU~=P<4XWX0_SAh(UvAqp9bl&bJzjP&CxGAex+0C8UM8pG zNA-kk2;d~&+$Sx{r=-&7&6!JmtB)PPt|ul__Tvd%W&4``s*R*uiQkJC%eNX83YsTu-NijS{g7YN0LMnddMtnoE zR2x2Cdj%QHF$dTx`SdX*{|$SmT5x zvYr$MAZP{BE7i|sZVVn=Dg2bSpZDFiBbR*8BD|+jSf8X3Q!Y4H`&_8qA3Ko6Zo{LxQMBx;_T?5P z5@$Z<4FtLQB>Y}Dpx>>;@Ey_jnw!n;b*wCM@LEg7DVc&Jf#j#EXG&G8FS%8$f2e6P zB{RVR*+m#kqR>B_fxug=^8FY5r*DliTA2<-I6_7kv0tY$aj$Qyuk?Q=Z&4t9I{`(N zTRX~!mYG|cy)?CFuFH|CnrEl2=p=+1{kRY_QE}tRMg>jTlwz&jDZn)TCkXyKf%5X zG_9J2$R7VXAJhm85(>L&9qp!Lis7X$cX8-?MG+0o?@R!ZTxog_cj94oQ}`;3%$w?f{vIIVx#>Ta|G zPdk4pjvFaiGHVoFy#ZN^X+E1vAGI-6{1s?PS2}ecK4~X6BDmugqC6kir3Cs}6dhLn z)9@5?dwL`OBc5ZT=GcA2iIZ@wEzXkQqg1S4MDO=1hnmKDn>NMi+^_tAS?sn1Hp2ZA zmHEBb>BWNs#O4r5c?nj%5&eURcpqhFHeiRigLwnGC1e5^6b+WXu1?Xj5#G(FG}U#L zGbs2r(6ey4HM>zXt2v9F11B`qqhd8I=B;H(ygOM*4blusopTcC@S%9iXUDCN{_AiY z@vJ;FWuMIhZ!(?C;ZgJ*Gylittk|j`$z@ED)d$e(81~_yuWPw7nr&d-s^RL;i7bRP z)Q{z)R+58|U>wY>%Yrw#PQ!)qzZ#48{m|1yi?{PNNL?C@q_*=LeFMe~yb3^miQGaD zFES0rG!3>$r30xu73B3n9>5vZCbLrCm?5IQ7S}hz^Jo)Sx-Qi#NVknMi>963bUK+> zt51JOVER$oy;mojdq*b##uO3f>KweO26k!@Fkvh^A*kfE!h9|S>NkufPM9*xyZ1bB z54k5Ojn613UQ@hTb2Ij$dgdCa3q8DCJ^1%JAzcT}u-%WZpZ4xe7D+9= 
z-yn+U(ubBx4i66vx~xZYtE&}(@&){MDlG(SVk~tE&GZt4?h~&&Y+#mUz z;V{uwQ@K6~Uh@=*sD4H}#9HFj3a-*=`i1#1+_F)S!QY2-X=Mn6!DkUOD{P-sd}whc zgqqq<`lz*UnOfZ^Tu)MP(^ar!Qo*J@LB-Miw!_z7rSNIy0g8#9n1~L?jFq7 z$)rfV0<=d$`LqI1yH{Fzt4iNG<12plLCYchMgrzR- z0LAo|SU6c!w;ljHa$}<}2H(o9F6iZ4u}+Kj;~MiLm`SUF(z@v~E`{HlkM()D%5OSD z3~;+#NO(v8Z|FxA9}W|BHR3wz*L9RAFC+tmbpn<0^*Cum6y*&UCdWn&*Z>4mMNvDC z_oqr?04?MJ;B>Cv%@fwV7S`iA%^wv5O>%C1Iq{)Y>NbNpI} zSn#JAT{Sf|zZL}G%MR!O|K2G`=uZlqB!_^{WnerJCiDa8#c2q28jI^pQc_Yk zM9kwR$Q0I^G8m2p6pG1Is#*WWY!wk=1TKdpAXSiYa!ywt9v_pOXZg{D=;H!kxjNwp z%=9h>HPzJ%o0hS|9^B5@svqHujEr7|zvLz)+gq;*oVK-#6uCz~#mFK**cr=q4)J$+ zXlH%H%sgVT^URw^I`u7k9J5{Je&;eePJP=x0VfIRTk-?^E;HThJHM6}aS0&=M1NH3 zeC43DILW``YvLMknB=5yp?w7#{u(tOCueF}JCNWyI9qO{piLE<>rwn1ehA1SP{y>V za7ytSgRscO9dLT95FiDN<_fezNcEajwq+wH8v$qAN-GM})db8Qs(8%#uuvOiH$J`75SObcOfOK6+)+$do zNqSarCzqD2AfmzWpsqj{UwxdIm+kq`<5-d4F(6LTFCs&{&`J0#gr9;^*3Vm{IMA;E zV;>BMk;iMD$IRku#Q$^#qo3~RzY56r`qtmrtaXNbw*pE2;1TYy-yVMX341Bzdj56n zNig4uGyM80@4y|_+ADdkm8XdEZ*;YRspOAVz~&@lINl4{YV(9H+Gr#wtD>Fm9RIIO zS~n5PXU0di2a(vNbwd#n^sISYJD$gm!&TS7Elv zWafcFBT|_#L2!Ie6iGawC-r?;kmMu-Ji`w8G&eVAB$^yA1P2hr%7hwgN!K*W%+p6q z8k5Ly)U_clhi&VaUHU5sNUv=ggM*t-I6H0$d>%=kxBFu)g)8^gZk7ZSpzV;XXO~=Z z!g;P3KehTuI!!U$u^g=PcouI=vK89G6UBQLeCpKba+>x2yTEUDwImc2k5NgVmi~(r zVDjj0P9zs_W&;B+hMewGFf4C+F&Kvn*Y63u7@qKxff%}%#Z-0z zEY~~PQIeVXIPtC*V`5Ux;v4ReUenaD`V9FaAoc;MN(8((Q14iAL~bMTBkT0U6ZXXM z7qn6Wd9efsf>OKpvC6y~mGxXA)_5DY$ne0`5K>d5sFRNnA6MX5-0#{0c+bkr@E)C7 zv#|p#S1yCO$y6xYOQ_EcJC}(zGK5JmIev3|qIaQ2c+2WD{WlW4nK%p_jPO$zNJP^CkQ7MyvRlnWYq&=xdy1-1KpaI$nSOe zxH$N94Cj0|A%9ze#6WVEB9?Fv`Zr)y*A}EILR36Pdaag43IWrb5Wov>7z)$;{CPMp zW$!|pl~uEu8Z6rns3wiO2(-^5>3u%2gC4pgb@c5CB064~I4Ba3=wH*iAzD2QzPIk( zj3e}F;=bKG%np|NP;^$PdlQ!dT53BJWSj?9s-x(xap)lCXA4`~zM z{!>tW69@R(a4k1&uD|Y(fdmfo$!Z%HUk^Di1sR^#^=eRzHUqK(<+h`c5GWE8d)aNj z@Fc9a5A)J>X&YlCr~N2W1ne;PsN8$7gu zV{5NhEfZ)e=Olq&A(ou!uN@AxRhYcZ|{_UC?zceoj8Z5|^yIjVEAC^C=_sllxN;_r zt4gh-EMEG2`l++&i7f`wAgiyRAKZCE?b-aW{nbGv)Nr=~fVpyQ>y`fd`Vl9s0cDJH 
z(phDHcDA-IqxJE>kLxvhPe26|HF`)@BpKOlP%{0Mhs(54Sf+NmKQmUtFo;;(v4NRxQuvHhIA?Z5)B;A@(TdA$$|j z_Nm%nX)d`>GV+NVu#+Hmk65pXnC3nkIeXQic;ONb_>~H1O_^bwFz9a!9B6S#glW8r zMnun|=oAZ%J_)aEB*DrG0l!V$Aw%FmNXi~NwD}+&)`=?OXHeu%H&Y?3DVy> z9p5`U*PYs+!^6Gwj`5Yjf_5Z9sfuPd_6~mwDgZGO2Tqhm0Ye<3tD>SpV4GD@F$U2K z0+WUGtiRlZE%@r(nVmUiizXrBxM$eLk`Y^o1xB+Lt8=@QsmQbCc z{VY*#1G{QQgi(k3>aC(i$^EsH?aT5Dv&jQQx~V=+PNFYIK016B%?zo&zEo-&=pW!n zeYU?&@3V7j4-5=M#ItHIW0H5QvXK4egk}Nt<8S%jxbgmZBky1(JRmSgr�&5OI1` zg4Nl9MQl1c@*H)~*6=*?T3LpVOa%H_3IRRQZtutk=dcZ=Y+o)0oih-iBIZcj3v*yx z_Rjge;hsUgRLM=DEQI-BMk@gXim|oW12R-io86z zYE=O$T+aL<2f;XlYC!JFZg|Tk2ktAiZ7(`MUSJDj(*lI?C1%02zeVwv0x&vODy0kq zKJ-Hi5|T_{e?Oe2kZ4qbSzAiW%nUpB&<-3|yQ7Au!2u_R;e}UgHEGWah8g1?@R^84 zJ-JM*XYrqvx+z6b5eSR8h?194^pX-1AQ;!;s7RVwgPwwvHssSG9)kCoFMmdetI1^+GYhKun|by|c|uX~p>N zi{{}8vl_+gscB9Q-Vqc1S0)9|KY^*-@89)|kpE0f>F0n>EsF_|;%$dVQvVf71hz2o z&M^7{%sIbiXZvSPnN=S2Zr2(Q;m35?zO8~sDjH|+6mvHEB+SmK*$^6)&$gJS_d_GJ zaMPRseIX#@{rT%aYZ0jgM@yaLMyGQHidTH5@cj(j<1TX2wbLJ}T1*ZaAEUSI2umQn>Y0>ZfPw0d9>5*;r;60+it>p@Tm#9Qu8b z!_sS)*`(YRqYp6tALiaVEUNW;9|lAidWMjWp;Qp0yIUGT36YitY3UqbXb`2lyFt1I zX=y>aLmH&vw>hEbobTs)|9-D){@4S@Qj*>GpEk7OtyC-1~5f5v*26q zc^0zhVTAsI7k~pFVu-iK;_0PAIJe~^4j(K7WLWSf`pbUPNIGj6SvZ#8 zta0KX5`zRvM$^dBUUJprLs7t6Cx+eC8xS&RCAt6t+|hK`Hd(1lW7i*%Z&@7h$S~7e zg-ia5U{QQ_f0>&HW!;qcC7*IC}T>$GTWYMQ+g zN;X*$Q1F?(TiaH_yC={O#{Vyet9K5TVJ5jZlnX#|X4ckR`bRucK!zmq7U^a9%NY&t zZ}7=?z!Wf8$Kr7_%U}xNoiy+4?UR@&LeYPn)+ufou*FO<+nI|6lJUv+RzC~UMusDI z5r)M-^PF^OR2+;4NT7muz~YeOa?$U2_AZi^MFP_R2u2nV4#Hk$n)b^FtH@71#sT(r z7A9iWv%KPB&Xh3cuve0X&Z^KiF>pL!>6P!m1de}#2>{9?gCK_7ND7r@30!8=*3)_3 zg(tFw%vfqDFi`l_fyXV+};=v&ytH6AK0PO}R1!nD9Dspmi!*#WnFM9zxq7jh&<-D~7giSbh0L`YH zABm{{+6;I5p$2*BpjL*3;r!Y*=$j;NP0^H7+fUla2gsH|kRD2SUzyPSz)?y~-rWPj zV2vHKg$zv9q`pHWIV~oLSRqJy6=9*FGODW4Q-J&qT_Ts&`V zTX+(+m$oMp3Z?(%0=(-1`FCHo_1LS%iNv5sM5V)BnwMRaUTFi{jWCv5T22LN-}K3& zN0H?GPBKnTHB~h=vYMJx-z=d#KTl42@>O#~c|hrRqYt>ns5|e*bt*6jtgWq~3y?tR 
zv4MQ(8@7kIGcXOF0;=Y@&Ogt(L~rGSS7yLNr6sV@qld8i}bq+)Nm?;t^SGXdaGjF{Un%SpBFIQ0RtFrC{cBoGLfj`Q*5 zwJ#P*I_?t9jkCfjFj#97fVq^z$#|*nhNsXscK9}beK@xd^`!5$G;sZo@#MNNiI?!~{Du%0P8}Fah*F5U$J!{Qv zFb}8tei4KjI)%57}p4v?cQHR>;5Ez~iO$I?kl*NxeN! zB}F`*sk2v2ZZ|YjY|Tsw59D%PX>re0L$5qv=NX8N9=+?N`}d1T8fg4*mo*YVbmqDn zJSt^lX>A>&{D|W*9wY5#pYBZsT2BC>ArJx6;s!k%@>$?X-hfB@wWVn{I;(hIjHb#0 z1min!9t9P3fJhZBF&euu6kCmuZ>5dX+GWpN2F1@dNGW5{xS_&GGAf`TmhZj~^?(O! z{$6;Gp6TJqvo4r@5(A3$lXt!7Ghv=sJbrZa2&IxM=CBRtg;qo#)RpOK3kA>5cX<9(rHR5f_Me0V1UX@T=EK*rSH1`U`;E9AC4Z@5JkG9&``f^z;{ItTK?VV|=g7E@*{f z7ylp}wJlQh^kSx3cerX>=$Gj4=~@JU@_(;C@b(Kna1HElvR#s+alHd+V1gj8&!0ap z9i7WViJUo>-mScbLCHuz=5SljJi60JLGz;^A*nCa#SdsRrYlVj%BA7+ewe*V93%}h zHq_a`pf<^QWgGsQLT4S4$`6#kSUjIXs(j8g7OMNI!{N1RJnka?Ula(K&3=USqE3q8 zCUhDOn+?5BFvt~X8G0B|aw_v(L|c%KbE?Lbf34Q4qDJjJ!uc;X0FiB9h=xnIW6@Btzmxayk2*oG+L+_o>k~$GLji zbH^WiHvLMep;zS7+POR@2Y37Ba(*A^+1{0d=CX$EL6QbYoEken~_RA82NsAr(M5x{E!tlUX|q6jq6 z8?jR{chu~nRbwN!s%mV-#i(K(&0C?Yu!NDJm;`&4HIE|d(|C)?JE_&a2KU1{rYXA& zW`X>DjkI$DoRW8aEgrP$H>0H2hnEFb-Ar-H?1Lnz%8zIIf?d_&ACMrExdV{y!9YAdJaE2O`G& z3fv@VS-i6LB4CJ`*�lpr<6z&RAYvu5veE4kP$WD=F^cbW>B)VdDH%7wA1ZYtqqv z2Tr2-#C{_zmqx5nr5447!fX_8Lw!gX_VQuKTQD*t0c)>SMfY>i<&@428+5wcT>LO6 zI6T?!orH9fK3e?5)R%ifKk2rzK@r`>FR*E}Zr0zSu0+J%lhT$*9RG}cuJjb2w6v$j zCr&Tig7bD|s*4}`9h~6ddX-T#K?)12U5P-%?Yf1Tr|LCFTm=)yoE8~lKL@c|bs z4&?U*{&lP6qOgT+ar`AKu``F5HY||l2jHAX#>d|Tw`p7!$w?o?0TT(w@uPeQ0WxB( zjVOS>cP<}>MJ^Tq6;>5BKF6+8bb9{CZo|z92b(6U0k( zT>tF8*j4CP^D}nptjk;wZIQ1)C?FVj3<+KWMM=NCL!L`sVN2Ze6#BfR*r!rGWJ zRwkV=x4l<8F{QgFJySVwUN85&cLBmo&gcJhCx0Hx0bRt)xb5p-^PeGPK!Y0rxK@k? 
zCIUdWFfmI!VzCOygkV${<=YId8E$?g98;b{lw!r%;uN3MyfmV4MNiKXZn;M0L^P}L z0g<{83mNyqVE1g2pW1PPTXIvyz6+uQ;_=(W*G*y(g3ZG;lsY)^J23&iN~R3*0i35R zb!%k-?;--$IxOr<(Ss8?KU|(EB%()9gUSPHU8=FJL*Yv6+sXuZpRFfid9&H(tqW{W z)$p_LP=o)xL>TT6DfGftUf?(Vq=+lJ+NHkBqu(pWX;|eXThd%n6{#;~DF}2zA~B+r zZ2A184Vqm^GQKIq2WXjC#Bqq_Gpe0+c*1>H@KZEjtwpw68p^o7xJnAcDs=vqMZJ+D z8(aHA9@f@i8*&rQFzc+X%EqUzwj7X4(uGVJE?U{Sjn3Ke6 zg)3Etn4@j@0PNYT9$H8MR%xgH04p!v)7}-tC{xw<}%}UU?~D=j~S{ ztG7z2n41C|jb85yW}|g5asFD^R$63me$;*-Z43XCpjvU<(I!#F5vA#!m#f28_CyD) zT)2~``~CSdYgnD1CrZssIQ%0aDS!7Cv)};n<={ODCm_>_BtbT>itB6feAK-ne&xk! zvUHGm@x}}#HIct64pS@!k`i_mr0Rl2y>kAQJ|h49b){iKgNi+0dxLGn+MAhz6nKo} zxF*79D_J}8gg(|B;h{k{gi(qUN3tErwW9^y5B4zq=n%F#1}-fTcv5j|JV{}f=ARb< z$VIGV?G9J5qY+TO)`|GWCgCh0+6MG0HR&`w-sk4|z${gE7TzB867qS~?sX&f9|6NZ07n@&Bp|D(sH~*JM~}x@!yriT zyYk1$hpWF>V9B7wbiq;)$Z6)M-*=kHgnr!@IyU`IKF)mNL}o=86!g*qJ8^$C2=hgz z(j0n^lOXDN)~8KFaiuY5A!;R~{FjXZaOYw{^Y)~-=IL>zdq?pGPe0yw8a1}~D3|oi zOJ#Wd)t2D-$JatG`d;C@^ILlb=urb&Q?6VAH*>WtPv%{zy8CSlU-n3GTJofwiPPAL z3O-$DupTs~u>7+hJCYZfO2nV@2bXI9_KB>bVtDPWHI2a15f%k13JTkO#M0=$($KZ! 
zJ49T%7yB7iC1|0;#l@u#LLL|xIDi7N6WWWpG>711WuetxfY^W4j#>7=Yjq!25-=fcX6xpf*yZRV~ZM z$Cm}b-|osSx=#_ENp3n&@gBa5$Zthe>e3;=h6=Td+ct_iz`S{q&rGZNvzKT(1=Olw zJRl#u_EdtrFz{QLkoa+jGZ938c3-H|{fW3&T;e!o?`K*yTCB{{?>dRqTgBhTP!|sr z6`wh)>x_KeEVRm84&U3WyeLZ=J3vqtqXSAXL}6>u~q-YB@eRc}9zyKG}*}j1~B&GX#8FZ6ZP=IaEN* zi0ti`Q}_@osICi3E$e;yFE8Go1U$d-l?HMEo}-B?0)I~@67%5$QK>ZGr4>LUG(kwVJ`&E>s0=)(=Sx88T z0yMq{6A+l9fGT2RVG$oh{Za~GVF&(&Nmo$udr+E<%t1ZU+|FucJJ@>z#7aWhYhth* z@)alt#g}C-77e4nXW#Vut@a_a;EA?ygn`~o#b?O&R@ws7-P6Y;YkrUW{_$@tf*kkt?F$8(P7S3vO0}xMpI&YjknU!!P4rh*(dk!v1jEX7JNPX`Sf_eS|eKtJ8I*|#s zV^{riHzZfFA9n6U$O>nYKX>rwDW1Y@HGz~)T4b=&_wMkhcpr&594w8t4S0gi4t4y+zcXgvY!AN?eBi#^4HMMBK|i zJii{K6)R|Q&eN8wu5+@~iw3}$HlsTp1V2kN`Qq%xWbDcBJW3GwfF>|?hbxhCSrRp9x*jY^-sNd~qA{WlV*CPPKLQAEvgKk!%aA)RNiKj~ zJ=VbdT^mRo7~pMGT1o)2d$!G!!`2^Y3+F~2o^4wFom9#hAgthFz3acj z8KA=vFK(*g$MJQUBnq$$D?W~)4?w2V@8k=$ z3X%i|hf@j*ZQ_H48UTq-8Ucg3+Goc8qdHBl;{vJsjnfV9N=2T)(qO0QfI>#XF{RYm z#3>^{g}WnZNlHpeC3U)yPkRBp6De(gy$C({nl4U{Pq?1um4s?rP+dI^kEj*>^rr6t zFF?H`##q<4-q^lAo#z%_)^2Ey_4jc?n(TeJGlKSRttN0dXBYKw~x z4zD#TVuZaZLKW&D8nXPAWVU_)qCLf+z&kW@Hx{9pY7v96xfS{l7wB!OJkj@HC_nV< zUc1XZHWd(-Xaw3!KzHmZ^*mvuF~3gt`Ew)@X16R96C9yF)}pzwS+>2*DsbK6bnH3QGvz%Y17qquu} zf=~`Ag3@<%2}cwS(SzaL7){QK3j#opg84RS?$)*JFFlP_1@CdPnA3~HEg8LMXbT;D z0!UzrfTsp$75*$u2bBA<4)4j5{)lF%-k^!FNa!>HddwulcT7(>z|H9_@3GbqZS=DV zV=-=Aud1qwJEepx>}N=T*pU?g!XkK>8`c8M@rnOxjX&ZC+&*EL>Ez@jDFot?E_mTn z8-!&*H@8iNzhq3hhQ~G6T z-za1|TXp#mp`6GfQlq|A(a__U(D;>Q4R%ntGjp5$UkuB;c$E{;RX}mTWF%9^ zBt~t;;!L4Vwc@oVZg`lCg_pcJedBDZqYs>IR(jb6UxX53GdrK16==y2>Ku$ zIOwfEYQ_l#5-W%riu|l`h<3hT>{Sw~K3YaH0ZBO5#yv3zOzhaF%oG%KM!fgh)bEB0 zK`R5dOIp_%rz5}#%mFzJb&_F6Vg*yvJ|EVw+6Sp%Tn?imKNryAC1vG4_IGklg*^)L zWMRJOtebGupL3@C?EctcVCV>Y2m{tC4tO+;hbuOn3Hsz)TU%Y+tQLMld{^4`I3&A& z0e3!bfCq0x?HS{b{O$<^iU7h)dLjKYTmRz=Q8llCv$LV@t0kO=orS(DHRiGI zr^DKUT-4of1F}ud=CoP97ib?IbCuSTY1)-b?$rXqhOQ824mz|24E$!IHoW_~Uy7tX zS(JKKm$L3$AJr2ogit9-Lb~SN#FL z+{wd8<(He-*o&~UA-cysb@yvM%8Lr?X3a_8W}E&k48vssZH#FR 
z>de*6pyr&0xw^wum)~GX8PQ!n>&wZI~f81hIkLbFbOKFD>c~`y>pDx6 zNd4$`qv;YC20vi+avs*FoQ58gwk%)c>(AH52bP)3fj?g=*I&SY};M-qXopgLxP-OXWo)A-zSP!@ zxJ<%R=VOW~1*BJY&(K+06dRt$@nzL!@Ki zBFx*OZJS)_`8e?FHN|tb8#3{L0EZc4c^HX9vB4DXC@<6uc|T-z&^t(A%{!Xr;&W_$ zW{%aNll|GGq*^-LkcC-yc6n5clM8D+T<$7dbp_Mpi!p5(2Xc{eRW@;Q=AX2LCniXQ zPjxubyTHhw-p*Thih?}%nod(7=h#mb)3_aUQ^IgeU=k9Dg{bs?-Ba88Alf z00vg6GLfveF+z~$fl)MXIs&i`V}U^k^jRcGE8~Ug3It|Eovo9)Uazsq`Dk@7rziXC z!1n+yACsA-C5K^qCPa*G4cQ0ZN`F5C<$s?Tju|rlT2`oPttspcU z8GB?noxM-4ujg(|cwTV*Wee`i!5dM8xgbJjZ59Jc0aEA$Fsg#cc90G3baHa?&SjRH zn>+C*_;w%U2RCzJ)8eIA8~{Z~I^&C}bvox6SQ3znnjQm_2IRBNs(kj*4sh0?mZT^L z5c@L#^?m~sTio#iiiM()G6E>>0VH^e zLft;Bt?Yj>B3%`kh-^tlkPiti6^=exyoT&p1zFC@q? z!(_Tew3)f2vvNA2sFMw=eil#{bGe#EH%>jV^2v|9p>dn`7XA`=Zxt2~D82v^gnyZ* zNEKu%6tTt*E}$>EBgPWE4^)bcYw|5K?4!Snr%BYFth>TGPna!PskLxEVsn0XWl%!)?q5tt&%a%Vn$aq1tB&R@>(Px}o# zNp}<+lUi5S7ZfTz)9DnUd*XG-9rA@A+X=KstGtb~ka1Mgx-hIL?xnl!SU()! zW;JcEfSPCA>T#rJ(Ax=AwD)7i`Vah-?E#zZ->>HY#kiZ88MD36EfCL63!_8Lyrg^m z^rAC!E>!gy3{)xA+@*~Cu|!%PyIZap!bFc36}EM72Ece#W*{!iB|}>JCI4<3^)~^9 z?Z&?y!5^jUUvKkefoTFJwQQh|0XAn7pS=Yd*k55=9m_U%({nc%;C@vG*k3}8gZAog zzXg_UmZhLZNPl}nK;dOO@^n3c+D_D7=%OPNjYh#X{tTXmTa(4Eu<|u^IbfQjXr(=bfbA2Z?wH&)Q9TLnVDI&UQO{GU1)ZG59u=o#v{fX|~4XdjG zR5RW8%$0>kBexX@6sMkVzSll+hjANy7f~*$kye=NimUPCHsX2kzJ4(8Mu-m#>xy&b zrp`62aK!9dDaPm+86)JDG#;6y;+fuYvTc}suB=SOnBxWQ_?ru$Uh?7a(2kghC=xJZ z0INpvRBq+!8I=|2TZQG4U0vu|L_vW)=dsVgPPkmnB>$wf^Bf!f!b$CbJ@VX*bMx>Y z5zs7Xp^{Ok`!`KxEf82aUc2K!TzYMi^wBTJpnR5egN%tl)(jd=Kxm_%-^_0 zaDO<;ciCYG0gx>Br_lq3{b6d zg4on0y)WcWi##PdcaNHk{R08nluauvWPF`F=<*i@O{Dj(0>dU3rGqE>jiWtZg#(4! 
zup*YGiFDM0iTC@e62j-C1pb8V;$nrR1LAspCGY#C_Oah-{KOH>toD-9?!SA(hnV{G zRRLpfXS(FKdcM0}dSgCZ9pS$*vp;sE5eHyXiJPwM>&_tM`HUFBC?XD3!Cn0vNz=Gb zAifGTh+w^4B_W*u7=*jGdMlv(=Y00)sWCm<-x`-^#^n ztY_<~?2U3arKui-#LKRWZ*n^h#{Jj4{PAt*z{=)&cZi?jids#%DLv8tTBzb7vjnENM%>)7W;IZ; zQz2flN@AX*fpr7q-O6DZ1Z3sw#)^vfEdOI&0QM^}a6WyOqJ-OaRD9T&m_$RLySk+E z|J?n}nl@hnnrtT8B3`ze-p6m|`{2&5&);=3HJ&_iRv!un1%E_VM z{P6=HKO){Ld;1Mq0O3bmF%9iymCCpP{$fwN;ABLMXbV&v208tK*veB~NwMKejrpe)H><2!?WcFf<+dr4#6n3PUlyB9*9$zdpU71hy$XKv zJ!;w6EEFGWinBAAnMjU0-YJ>}Yc)2GQ=GUltz z(Un7G>(>Fof+X0FpHpWO9|$6e9;WWy#;jA~juf6IW!meE@M3fO=h*Rg|0b&bGRYzn zch4GU1j7-xc^&1VHyIfj2a*v${5KOZ0vO<^=;ubG?ufABuq)teb2~ zdpTx{E%%4&;_!RAL_X@J=jo;5{<0f>_UJ#PoXOBPxud}pO^S(6QkFlYz8#2dQ`X)K z!H{rB+}dcEfRnX6c%L}{ts53RU*iF@6Mp2$Y93*_*=O0T_<|cEK`K7C`vd zpCJA5HK`=lm)%`!RaBr5Wj}TelBVS+GMTLGht0OrX{=TD1u%SSOfBqvEz6^ZR>bs< zL^{iMG8{BRs6}n}TC#0Ih=8K{3k_L_jpT#X;3vu39F>^Jf!RZp?9_VZBKt6zTEjYZ zA%0HU$%YI-LxusJtin1<&b*fl30nTze;~5{5rm@Y;k+SUDk;*N@G~0QO_e|+miNx& z-~G}w-q=0Z6>ZBzuI1gvv`(zTWFITD zyhYfU{l438F#1<$<9wGR3Sn+lnkr=JKIBe+3^3O8Sa?D}`*}Fgt+T{TDJ!p_R0x-qYV#+KcHkRtZzkLZ5g!A+)J<2lI-@n-DaE8 zTK{;hE@s!jZ7ck|eX`(DT~&L2d%D7Gyb7-~=c7rrD$G!LL|o<*KrELNtMRJ^x50AI zHrlDtx?VpI15~fQ4XvEOUd%zjh2}}D77d?LhR&Om9G|DEi@4i6yp)k^wu5?H1HZ21 z_kjaou{j3Zn9sLEu?BZrFiNj}le1cE?8c?TOf-2>7x zpIbHD%^QNYXu;=HTzpl{M*ir2FQ2Fd8Wp1ltR_snE^>r=<(N%--Ck3>*afYJOD@$L zYgEfxP=YC!X0IR_blXkz|F0 zzaq-NoWh@m)d&}nkAi*EOkl~)=j;nfM8uhRO53t1_pzDmfg%C!`h&UiLdMCDh9gC) z_Y_PP$g1)cO{%#`KwYad#bxdemR~TXce*?kY*S=gqvA@J^nB_G-0rDFQ(H;;<-Gf3 zX3jlQ9~IDaE315C{K_Yv$&8ntwMI@gQ>ma2q9_Ybzpj6p^!) 
zFkp&~fpI^78ent9Mao-Q6;t_N0uUx%c;7MMj#Kfuy5{2u;hS3boAY&G49niDypwbH z3Y$eSRrSj3?9Uum@w;(7oRMOo_5R`vU*Z!2wYDMQ?WVzspeUSOv9afkuUa*~l^(qw z?_!TeE814kx8D(q%`e^|6*eb|WP~N3g(2m))1PlB&i6Ii4|wtnR?QSz znny}zSEMESdXj<+|7$J)>#-mTn6)#iM<&B0V4Gewmi&nvLYp|r7fPah`T4rsJ~@?o zZAgz97;t0zvUk}Mk&zJVT#j-8Inhwa(BaWhc2SZ1{&~AUhHn6SkI$ugu@)d=2@eMn zXs+eugv4MDtNvs%pke$&K|w(gghCF`=3cakv-gf5;Nd1(5^($ZgSCD68m065#f597 zA}|*v6coW;hobH2!q8cr=LgnlEkpO66Jcjeb zBAhEuw1b!j7m7f=0MpLC)Zm)Tb|kj)&Eb(*mfGTI?iz{}&L1?azZNr+P>>$W=roBK zwF*cd#uQrkS_M<*p&NGSGPjJj7Sj12?%pR*5(pMMs8iFo2bro86P!z}_BXKu=`z}B zo{I`kI1JA1aa6E2%vYI3Luwb)j$NB_()$4t9HYrl%17l#MqT0>9Tt5tOwm9_s1@^6 zm$~I)yfiB}*Tg}{fp)O3FZwRS2f`mZ0GnqwSOW%ThAK~f{Yo`3Fz~KOS4)fGXnlyX zGFQH-3>XDSQ?Nv~V0a%{15^&yg5oMlYt@f#av0DjW!k6>O7yRgs83TL`c>EEwDk%p zXi{Ke^~Dk*y~}#CEPsDXM7to&Xe9fUQg^(H8*ADLm0NWWX1$DIlPWi^*@%q*b*M;w zX;WR3$jdy^!-9G+saA*a6J+T|q36bS?M$ND65LkspuAg9`Y`N<$T8pTuz#RzM)~05 zublWFXu2P?9L61522ugYal`QN+M8*M(q}*c9ztk~C1z+RVx-;T*;H4WpWnkn z27MV58%xTHN3C8H_)%(ZdYW(T5!ei9yTXzV_(DfO!ENp#&I7{3GT4nkC-VjI;5{V$ z5-JsZv~F^BIFOx|qq+X&LiW>DUxeIhS&rEJR#1YP1rJG7_-nvB8C<_@+2Ib?Z~Q!2 zXp=D0iW2sw%-`X=2)YwXQOH)vZQ+rKQKvJb4J^&6+W1$1`TIa52dLS`Z7OXb?v`p> zd=;HRD;V9XsxMfm;!88j0#k6SHN0>p^J4G87UbEc0ta;HANhE@Jte~I9nO?qUHTj% zh+}uTJE9NxGpSS7ce~o`C`X#Mz|_BG2^dju z^}@GKqAYpwN0x8ja&r;jU zG1yhtagm(!ytIm@2rIX-7djbJtuP&??zp!u{BAcXNyuZm$R{DP7U#0fndfo22}}Je z5zK%@eC&Q4p));)Goa7+{^rpi;F@S|MZo8cy7iRkyTPDqWD7VsfNsgv2Img~3%%lm z1d<@au`k)_=u_WU-Zv5AQNNQ=EljL(bFgwzQ%!y?HPeu~UuZZ|Qec?=?(j%tgY;$c z(=-Drl2PPirm!3q-K3Mg2Ri~`^sK4LWNg&bNnzMbEkw%tEOm3V^`@_|Ct#7pvD(F# zak-TFK2qF|tXQ%vtn%%*1z4|zn!s3yxOhGJDlzi;pFkTB;iBz&wyeeb8w zbhsjcqU#G8pH^Sf@o87ESMS(o&Zv!hw&)7-%BkQC5@fb4SwKc3!G)@x>LZbF#`m2E z5#7;0JM%m=)YVNSI!a1P_`5_}ed7YAqriyeC`hTp9QS=mC3=H{(N&Ki8S4NQ@=ce> z%Bu>ap!{OcLpq74@Ki8HId-RTBZ?2slA*M8V17ABADfz5=T7D0Jk*#AOtjbq1Hsu1 zaAFYZf#xcE^6ktUs08N^i__iN%=NaJZ|t2f`VF%)JFw8AaFCuLWh~b_ZijAwQovf} zlPlAaHogXe+I<591X)dnI&vFiuQ>(?P5Jzed{F4xR6zv20#fs7KYMlPN_JisM`XM2dhcfA9l)R 
z>1Q^to>*JaXPzzyqf>cWkKGcT&8Ca;3cvp;7;kR_bZ7p$uYVu(-{rmR3OpfeftTnv zh0rXQa0xYg)u@yLt@jWQJNhsC5*HjWHjpd0?MZG1Ct)Wq$RmGqya!IDoK6pg)sE4# z$xJCaK3dW#JURg^y+MvILcPT#GtDCpI`jG_Ae3 z@Agc>4t{!9w($m^l)o%3m3fK(BEE|$q*})?5#5H>(r*G5NA&% zi@)P>+!cdF(}Si5&mU5ie9DjXET?-C!kCm(grM!4m-sDrc?2y&gEGvJih;f~Wbm7G zqF5I(=BbfFUB^s>ZFi}^ykERkU!#;nRpC9{?X<_(63_ZwUQ5crKQgZIuEwthdFb8d znFYB;>Rit)ik^QrLFe@N&V&(6-ZcP%)=ICNs~{3^Y78SbyquiVSdI}kXW;p}ss5h< z4qMP&GV4l$i%$61YY%CG>V!}CG7x%EDXG+<8|#wQ`>g(j@y>WQ4t;i zce5%%I;EWXXfvy+qN!;HD4c|V55YF5?LDZ3G(rpVurkv+B36`>2a9DedmL=Rq_>UU z-%t-A?dt$OFTjVnI|u|jq3^!KCZXD5RT5iOtHD*)-p=SIv?J9~QrJ=WmoN+5C8j~{ zcyE5(A{LxM^e6&fnvuR-i!g`F)~QHp>@H_6{UPlj7uglp7QV|dEYqgF1!dR<%J8nL zf*BY2geuZ(2@YpKs6FeshyBe<3G&7csSgr@2Vy>BE=jy}<>H9H`- z9VP_BC@Iz7S@hPkChv7*`aRF3HrDtAqr?HdzpeYE8)NQ*#1Z%SWiAVHpW(nYgR_)c zr#vX_k;y(KpLN&-Oj~cz!BRf0Cysf zIMR+=z}y`CY(dLnh?R;c_ zlW#2-Dyh&3XhF1(J(tkKobXLE2})ib;GNpGQQVzx(VcTyPab|%DLsl6Jf^8ocf0|=6n~4s)6LXqe$9USPcP8R3sm0wn;-+~ITg~0uY%4|e z6y`YRb?ZT+d>U){ak~V|j-N}{f|h>nEz#b@{BvvqYG{QATlhYlV?}&=<9-V&lwQkL zQ8nUefpe@434KY1)8mgOeQK|1AgjdF7iP_*16)*-egnii#EK+ z!!=R5y*9kmpGgd|37T%=$kc2POe=4eCbl=Ia01H-&MIE@QMk{7SsWijt;BfzYt69y zsJ<6*vM_PK*}6FsL=!MYV!gg5eM#3`lm+s7NPI*|kK-z=9r16RK+hF~U(og#4BDJO z9VLC&^~%3}+1bc=@ucP^&g_xu>$ONpI&7MnqR{{fr|ZdOUV`UO20CN+@RVvh;c{SN zCR$sP9v=J=QF>M%C zh-;GS>w6X*w!to~kRkSD)6hzND|qg_=VdV_=KXA?)9JudmG@oiN8dU#k$8}~#wZka zv8|F3J4Vs`zy#0@I@G!Dplw6{PJZcj$3lZN2)(NAGk2VHFel~cP(;?aL4yWIaK_$P5HS3`K-zB3z!{hx{(D4b5 z?Av+oV;oFHGX?Yr)%Fup94Xn2d{V7SMgNpk%b)S}4tY$0B|vOmospCC(*2{5D$X5g zClgB{vda=JLdGh<(3;H=B|2Y>wh}NZHyGtOlghES04{zAtQDyWp1}h$&u9WJr6eBx zPVOw)OnuBwUbw?X85R`xQmXfC{Ts@g z{Xk=Z32%5HNf`JkB+mo{!O!2w>j6dW?k{$jH7NT*yVA^X$g+5z4u*d)wP`oWdQ2lo z0_Z5!$ec>)Sur}2Em3r3rWly-$nQ%I$c~k{uf>~$8sbqG(P_=~k>~}Tb`2);W1%KN z@;AUdOe**$;#!^_9!+WYoJtF|wBD2??H>20O4n;GlO2VSJzuKKXIH*8p$psoN)~g+ zZHFN5AMM~EXE$Zac~SD72y@_Ov$Gl4&mjNXfwv0cc{_nY0sSp|xwrsmEN1Z_R8swQ zZJuWty?SqTpQFfUnXJO4dID>LPj>cpVWfk0{9-?Rzrnp1?)z+hD>uw(urpt9lw0vb 
zh$B0r2-5^?zx~&ONzgFnY(=?LHCL&RxA8c(oVBmPz(-dsz|D$}u}T~f)}EKe(4mMK zb)16Phxd)n4_A#Dl5kE4RQABa+BU>u@Q$;U!4aa6OK8nvAS5|^(Y#86i?NAxxs1E^ z5p-T%5wm$p8f+7h1czD|x!x%}v3m|<$YWhKBE@$$7g;-N7GCx6a z%==0|$d@eYTaSDf;JXuGLnh|adX$OzN|T5mcLn@hJzsX8H}#e26Nc%ZBpPi?tE8Lz z)=zotvns~)`S`s62xjd0U2to}g?L3lLN;P79bI}BIfc5k`%h;I9Ph@9D885tKIV0Q z7eA8zSS{p1KVifJz3shi&XOEAtgP&qfx`DG!FzIkMX_^>OK!=*rvSiZl@2W=Gs!Bl zk2rfZG&{*o`8M(8#Xc~%+6U+Z&U6Wg&@tWM%rvr{$oMX&zgvI@Oz{ zaskoq6R(S^^E3cVfoe9I=PE~Gx`6K zidMo9?9+^U{03tT+o2%t)NwS@$L*n0atr4#bPg?+ft$d-nhlphyj_ zOzOM%;@rh%N;D)erHTcF6G2tAoHFR%Avl`83;>%FdoUBjCoum9GD+q0;|@`3NSQ2} zVlpRRqN%1iE<$YeBS8uypC(Uy@}H&s^KHd@1l8io!}5U?h0JG&hCgu^MpK)HQXc*y zX8%I2zIXz4kjh))L*!qD(f=Mq+5*_9@eIvk!2kSTcR!@ML@Vnu{3$G+a`y(yCm5+= z>?8l15>IT4w+Veakh&>yh?lb^(x=IfG#J5T749jjk+WXSbks~uDriz&;R>${#!RBO zp69B4hzF64SFhWCCdsTPeLjoDL0x9}$=Ho9Dj(YK`t=T6^g!TujDW=}>=27}oiDf%Jg$+E6bq>`dRudi7C!A8= ziSbhxw_7l`QLEL~_z_tgB3NOU(g%-DG@m&P9QEY2XQvf+)%D?)zA=wh3fIMXhRW}( zwC`|^`+xTZ7ckclR zBBErI0kPO0+KV5z*a89H4X(xc^z9u{2ov-* zpefTdxm`tUdvvwa*U;`LTDg;muyI15N!NjcAb?EO&UxW%wlfCy@n0*~% z#@!k6-z@+<2a)=ByWrcsMJM2f??#9`VNU2G`{*@0Yci4paY>N%D16B9w4`tRgK^;1 ze0HbfIQE;#Eo~aMo5^Ox^Rv!KmvpyB%%tOWAW^pDebDf5HWJ-#Q~uG-_g$U32?EYj z>;juwMgOjyt&A#IHz$Vn$aX|u>fpty>?;X*^H23ujN?&g+-o=!E4>F6(7Le)FBesC zT7PZh-v>U@-Ux|(0!TPf(!dXirWNM!b^|Gu1##3&vXWtqh9XPy?mEfkR zbyB~0xYpdq?MJ8U{mq%Y@r@&3dl0c_j!Q7%2gR>5`4p$s4O%<8y?|0Oq|Tz z&a9!4=#2=+A#VHdzfh|PbO6bw{P?iP9@vVZQ03eVou+ymi>e1S+VFYv*)GF`!BFor z^^!1s^r|izl0n+ZmlF1Ic{yg8wwNO|;=_w7r?k8Nkv#Y$Tu6o~-|6%4rm~rzA{0D0w!OSgkj(=Wf0uk3dP($ zwV`!=;oLdd!$t*@<_O{~q+Sx(3Ak1zz#p1HPb7I?$WdIci0V`Q836%h{+WcR+Grg= zO0!(N6kfc$qjX8q9@`21u*6zLj&^14F$&2l=NjaZDvqz8dnWBe zjy_}3>Z!gEp4^56cBEtLI(z*03Z@jP9z;BQL=>aU<3R~K7zEUT1lSq@J;TToSL?;!wKaMwgje#n zt!c-{KjnDH;`LUNpQ<<~aWG&55*c5(kkr){Is#vZh1p|}AO75aQz6c;lB(RTf2t?1 zTC}{Rd;l4E!-5iK26kai)TxYj*iO3F1U@(=?8P?U?{?^_!DV6cW1gHW(k~1;RWw`t>B&#i>FQV0C|9`2(_Zwm z<@7C3A5?|bO*>h$XXR=iV&U_9u`))7UQm~>ZmX{U*-g<8BU5#$tYhUeE^t;$cITeW z%PiWCs0~G-=0z#h4wY)KZtNMv_IWA^{ELvkRAPo+bt$ 
z`_N;=ZNzY||9keSXf2HQfKr!%)pu1>p-GVtn}A%MNgnhukK5(h>&U03>f1kWAj8j9 zYooee=M2hlWD@KCl;`FzjcVK2*s?E9zc z-!u8>vo`a1*|L`Cw-oqZ{OHu#XU)5msOR=Wt_Vip`?Q^UW}|IT9(S$>*ouF?jD&rh z$!@VH-W#b3pabWLv~2SS1!ZBRYV$(UT2tAQI|KSWwTS_}WFqIO4J|b%3AQA%=i6N? z5I|nkIUf!XutLaGly+p^)rx-TaTE7rl=<6Bk+>P{ZLXthr+M$X#$nD z$CHFw?S})a-c|XDY@95&48@3=#hhd<);p+buj)5^Xf9MfITcApP(6kC;>WD>SYYy%EEXWSuO5| z^sny}+wZwzWeZ|5r9EZ9c*M=1rsWBF=uVv?Bx7vR?O#_ zDw7AJ(c<(2(*=a_?kKe+!*57Sx$_b$CR1$`J!ip@Ei)U@-32pUye@!C!eY>q`lhj_ zGl$gpZ(X@R!WQwn8h0dWn-#F~@;Hb*xTi@tajsz-_&fIBi5QdDOciDpt1fFM=7JP$)qyy;^mbhS%X1w^t3dt{gz^9^N zV-%ctK+8vNu`q)dA89l^M>&%@ivaR^FN^QVBCPXG(a>*~`MY>&c_-QPJ(cPdkSk*e zK{E743jIsR9{_t$Qhe+oovqkYkp|Ceg>>obplkGPZmdi#G5V&3{~%oa z{bVUSuekkfito3-kf^#4IXZ*15L6pmh*P;wS;OXk;vd$@? F2>`E>NhbgR literal 0 HcmV?d00001 diff --git a/examples/aot/matmul_optimization_guide/fig/cachehit_N16384_swizzle.png b/examples/aot/matmul_optimization_guide/fig/cachehit_N16384_swizzle.png new file mode 100644 index 0000000000000000000000000000000000000000..00fe5f3ed0633d6540c631589082b430f2298353 GIT binary patch literal 58588 zcmeEuc_5T+7k6ZekyN%K(WdOOCRtll3}VK5s~En8X2l63}y z?E4n7j^(>4y*=;q{`-A@ea|1BEcbm~*SXH}JHK-6UA2A3l-zNRdG_Wrl8kX5d-lCS0kSLXWS6Z#|kwu49daHdAR zju&e2wqNsE`IVq7+%L+t7~ii^2#Jur&y61qs7iT#J9f!;^qPOHkDn7r(j7Z? 
zGP^kxb2aAW?F*V(8t@ml`#zlne-BxT@sGIs=j`t;2wm3h$Y=u-;#g<+#}#mGd2QUh zjz{;mGDKa2=D`+TdITq0`D`B!ayCi$`jMu_^cp(b(41FGRJ);E;9k1MG9E7e^gp7O zlc80nAwsd54-L&9vY;+Ud?pSIB)3e|=UvWEMlvP0!2}Bqg2uvVVJEZO(ZGocsHl z_t|I;x&T|s^D&uIYqtX83|WrR(V6MJBL8*CS8q@pxzAmy^o{1PSO2-Prr<->B=4*F zbbo~WALR&m&%mk8`~RQ(za*jO0zc0LE1VChHDNv;3>oNf(%?)~>7AU+oZ_Vm*kEj3 zmOb_2$5yf`vi%&das6cZPuK5nT>nIYqTZuIJ^M-6yC%tY{ADU{kVUwS^WC->oYg_ z{Hts!QR-L}DnBkNo?oY;dK2S{hq-SFQ}Ku-YKl1YMP;))rn*Z6tebV`UHylH5;CHs zWyNIaUe;8%V{fB#y-?WiDMDCA^x0aR1DS}M#NNw_x=hj!8Vd606OQLGr+{I&PSMPS zWAm4X=7@2II!$Mnedj}mmhr+e!>?33nq9vmdN}{Ngv8!qT&Rccbw0y<@)Q;$&e;|A zF{tH=pU^c=AG&Ve>b$Ttx^j|Rp936KqE_BeDh$OmPt@|!(P<|ZU>)j2|7jrs=_idf zThkpaa{lX1_P_NIkjsspV0ryn@t-37C0AC8K-~ZHCj%ziXQRt6>{&ivsN5HY*w+IH zu1LeUTgLx9fm{IKxS75Ljf(EORqEU7*FwUeaUor{#f3fs=-gxf5UYQcC=ebxsvIPM zAFExZOjYi_V0u~eM)%|N?a*u}zwPHSER{`=Q-CD>-}2KI21IZ<&7vNVR@Eo))Ao}g zp_;lT#l%xj5YhLDI=00P>TU9D+*7*Ui3NzipOj0XrlKak9K7KC5?D}~@;q&s9RCen zySz7Mn!?3>4~}TAJ3yjrtg|N7t_3(`|KoD10%=ZQ_)qh*?S%nm4Y`O@#RAD$#kFsv zG`|;pZ`jl3F{H8up8iaSN2F z7hvkn>~GPylYa;1fD+&qn(w?Y+q;Cbr=Gt`=aUgK$-_Ut^9_T08TRx}aP)>nlBXmJay6P?ycb zGfL?i1q;cSU#f62f=jH$)}+S2=~rIdG)O)FpAaUHk{6ChW6C~Mtvh6>_w5)36&PwP zdafW9mM*R@ZJtNk7We-6>6A3@_|3zU+L(dOn8d>GrJoHmv$MYqzc@8B!;w$KuTsKB z3bXH~U@$eBqI^Y}jmAiA+}W*)~(5{nwxFN@q5?@k|f_E|cM?mf3LOKEouuhsxj7n-dzX2U^6t!|AqXX{yJk&~(?EY?hE{^2%^%>j=S z>*LM{;G3(50$$M)>$WZ(P0RC@NR|JTsF8W-TZqOVyVzeeTGQVcZnfPb1?)aIlQM49 znD1tBCi$_&fi~=4ja(}DDeX$lb-{nEcHs4y)q!`MNA!sQLju24SFRe|%2a49$baP- z@FvD+xl5<=P$(Yqr;%jb=B8&z?R(eNSr4zF+8As)j-Z|eB{GE7n&3^z;$)7>t;;r6 zCB^!?RsQTZT)GW=R;D8nr(Qhm_wB{Hw)7_#u6-XcZIM~qiZm-e{2xRI{Ok20CY!Bu z``&3ZNY20j+rRx)3j1ioX=ZXVcYK^#507b=S?*5LR96?Clf3GW#$xlt&{|}KO2SPu zi*E_IZ{M=HXCh;wdd8dC=AN;+cEgj43D^Sb2I0Pd17+QRJryMsvsku&*4HeQ5MIsh z&&i%uTAAfLb@9>4Tyyhf^Myv__MC2K+3ncN!NG7oC4%nFb438yQ4BkW}Pdwez>n#TAAO%eW1G6qe zuh500o}M!v>Z={^pdNC#b;4{f%xUW*KJpPoQDUwWqf(?toW2A;&2}rH_c6ZWP|MR~SrR(8wwb`kZ};+)i1zI6~C6 zmi1dnRac<>AEbxniZcr9zCx{QnPsGFf+ugcynf0 
z>uSwNasivGEeAbO9NuSX(}l@<*ZcjJ-B}5-Z4ApZDf^P1?>-CL={J2=Lk|>cG|1o3 zCQ2((ubd5e(I7V0<#C*<%<{19_yp%Igrich-0Nb;w3akL@BDfyE9>>s<#>&8?oi( zpH)v0h{P?NJm=701fOAW^?R=!8qbr-koL+0suuV)%^wpaoW-qR36|)41 zWz$YAZI%Wi#%d14x`*L|lcHdan8&d4mCkzpGgq|BCfSseU}DLAl6lygdvz$1BF4$M z6~YyLNxSg{`jh;=;c&jbBUjz!wXlHGbDMWLPkPS5f&Ge@fcAJeD~0TqU@%S5f;k1H zq#I<*0U3!Sy_w0KDTRfjCM@b&i%0|yPs7%AYkBg9prJ@V7anJ$*;{=}akgZ_=!=tR zU(kU>IfZoOIGpY>IL_!X<7mYmvGJYPj<X0++ed@JRrWgP}9rpPQOLi7-hOUxLeEC*`fL0IQ;-+nH(XKy-7XOgU9gWOr*P?G> zDmHNEv@hRUcX}l%=jvWeCO@J%r*Pnpfx8JXVF7~I>cF9I9Kcdv7$|~Frsqp`tE#QS z-HnZ_R38iF85FnLl^WPd34QhzaMR-p|06R%cY?Up64;Gtmvx;q|8_sWT>2r(kgIaJ z_~+fT;Jst~wd&rtDGmT~1yATYasB@%|6fb!gh7o@9`C>JgfjzV{h$Hv1A8)OBWrpw zD-To8dOQfy2zO|_^Z6vL-j}-_+%u==3nHZ9(cc~m=m?HS>%equ=*(cu;kWstRLaB@ zm}!EsC6$f3N6WDGkFU>E6pIuAFvW?y>EQCOd(M4M+0}_D+;wlXn58$|EJYk!vFLa9 z;CoagYL`1eTIAk!uK`)4OuLy-mFM;H;1y#b+FmzbDZ<354NzLK}Yw-wF9 znaugJ4p_Rhn@Ko55yfDJ$Js^AWg4084k+eymkB`2gt|w{Lan9xKOYt_rq$z#_=e}5 zZ#)ddz?;SQ&R+QKGDa)*z*1e^-LffUJl~sp+TEeCPSc&-Qr_#iE32z2UX%c45Qmc< z>)~w|XR>^Lwkf)fH=SWSd-kkRPFB`e!HgV$PZ7{BHYTsm6@2@qQmOpKXBrFdcOMD$ zjdj0ick$xI)$d*l8)hrzanU;Eo;U;NW-(0|%nV@Zi0$nrUTgF0MtAEC4Gry**K4j| z^7GqsI@;PEad`g-K)AW-kwR(HkY+@Tj%k;HTVk!rwY|_j;XPPIv3s3vUqAs0j8~oe z`N^1t`OAL4y z&=VP*0Nk@#^+3L-`pMC4t)Rz6eNG|tpC3GUz&BF*O5bmLMO;+$@z=-gM~)oX@+}P2 zvo9xD+t~Q5BXd<7D>tXnGB-7hktwr)XmHPtNOb4xDfV`Gf(O1O%PxGzy?hDoEk&UF zU3=laM!Kr?NO<{D1MiL(7JX=;`Y1=CS!49Za);B62|QkWW_p^Heh?|`aqgFwC~rUBMpU0mh$meI*?`TP?eN_PZ72 z`Q`Je_;exFnYLdxd{s0cg_U1Y+}gBac-BM-c^7VJTjOKNgnw(<)UuxC4-pB(AN#2s z%U#&2|8m9qQ?2K4!X6*9AC!cWDZw?<)6=snT}ErE+Fo8|3*uSUBGSm|JX^?6C%`u@ zm}mU})^`Tj>V+q~utIio>Ov5TnemH3@NR$HR)(^?2_)(FH%FSK?!T`$f$)8Pxh23 zOVAa$Tuq0VNQU38CZLvCEi;`*7=ea@V(U0H?jik^&%dKA`dY2qTB`wj3{DSR>?};e zr`knxaB<1_~?orE#dv2WgH{GmeeEOQ(n9|JKxCE5%+Zk# zpz6fc90cJftZ}NlFri{v+r`KsVH1^%%Ne{GsAfBicR7dh?bxhdEo|XTiTAFkj(@dy zc%1E55Cq?rEAX)vZ;2$kt(q9SUenFu&&r*C>m=3HWi`ze=NfLK@IZh4bT8@lluCHK_2Qdu5(GBc=*vLtcN|pC4rp0 z)b)jloHskfxt_sw?>5e$@k~z<@>etJ85^uP7(QQvE3R4TmC9ZZ=Tz%3t;QEFyhznQ 
z!FnXhlSH(gD5HvbX7GzjbLgSmED{{wp~I*^i~aOu4Gjj4hhB!)z&<3-!;28G#<_Wd z9=sLyf%{&sW@TMmgUxJWv}yKxcxO)cweATn0n5CkH}YN%bFM~hNJMb|k~PXn!%WZK z>5EoT+mfokG5>D%8O2w>4<^Z#0xke{jfrPnjMX#H)T|j~bO#=n zP^a8cY^$r%+pZNFg(Sm#klR)UAd4Hmh|2^YT@?e3l>&w92qoAW3_(6dDa)R0E=pKD zGGr2Fz;pcQub80&>dY&s0U1k(VI+V*isRzBP9m6KpY*$?j9|oI&|V91AYtfv9$H#& zXVo`IS(53qFlZxVsln!GaF^aKsc3((Gx=_u32Kh)Ucb*Dw4=OkZwM(2;V!Qi*zvHwsqH-pZVww&qYsl=btynb7Wo zqU!Lmj%(A9Cxx*~Z9S%lW0VUZPIYAXbf@Fp77S&1GbyjZ3bLG(Wo_8vFmjL~Abylv zptW-FD>Gf>6&yc5+F12`V%Qx97VpiDWIT|xH>)IEAG*TVs+>d$RuB>M=`XTR?)RL* z0s&|c`2&J2Dd{z1hcMd znUO^5zvB$GbX@IHIt;I`q^^_G3OiuZuh4Tjxjpw})pb2JZZ1 zfW+HsAkqwsibz^0gVVDsGl-Jx^ujGhPVL5)nYxF+>uGXfG~NKfsNnrTnKRd06n<0} z+yh-qM;a@On4{umqy{#^Di;V^Mz~)4%KmenljqAOlDxKVA65)Mw^9JNmWD-fSM9}V z+s0`A)nLE%;E|CW_M7glEuXTGxigBgn{Vh#lf0MiS7!5kP4b$T@~ z$XZ7XR#3jXLmsJ)Qz)=V_932gAEfBbee{H$5;w1*Dm}(K+BpVgvH^p{(k63uH*><> zEC{PO{y;HRrTucxmsI?j-9`Q&uddk zlmPk1XZm^z0`xQjfM^^tAV_K_5JLo~xI9*8bp{X`L(vr|?hmv)ZGqQ_&`_j~B*GH9rk|$NcmW zxOsJ9V4)0wVub{xqQ1+l4?bBmE0l%hl>cm3a_vWSg|=KkITQ1LQ!GoclzK$7cjZbA zCO9XL$e2APu2**Yedsz$?1uL@WRy`j17@siW!UlnV=00dUAYs$rS9fZ%Akj(5L(3#nIUSgQqi6-xqkbRluk9S7tmOq_ z|HrN*k^faqD+U8cL06f={+A4fnVvH*WI2x?ytOKd2Kv)B*U2sIcm1HMp7PDT8U=R8 zf|+N=@+p>kwWV=&=l8!K0J!S5QDNR0JrSdLQ>kA+WDJwzF^g6yUB1uqP12Ezx`|UR zSN!mC`acy35t_#e+-w5&Vm|4=5zZ_nN;{;h_$5v#RX1-}fU#qP0mIZm7}x(TCu2iU z`HjL( zQ&7D!4mG0uAEE%NPjddobKq9(eTL;9H?wzzfnvT#|GvsUe^5*b$d^~nl<9xIl$yBC z{@*;u{-cB7Jykdq3j3cg<?~)=(DhE0S(1FC039ZA@b$L(^F}m7U1X@Ak_moR7!|Y|yv6#)67+J+4<-Hh<&0 zi7jAl?gUAT|CYqKj>pC9oG3Klin5cL4&ydq3s#=689%rH?96&cx$V3W|h$%;=+ zlChC;_LY&4UFdBW(@KxaE~^l%yPK=|N-NR%KMgUzZo!y>iZL7D!vl)I7$8^Fy>ln% z@+le)OjK0VMh<}Hmpze7w3IwSu1}r-is~aSzDo%r4teYZbRXpA|1oOD962>C-kN|~ zTx>z1P}jA12pA$Lm75!*y-1o7!!e@bW{nd2|yDIMxm1I5|L23U$oLp|O#b;ddv$ zfC9An%;jGrUqc`S8!+)(f~4u;!-ut2Ja4jmdt+Mcw5Q5NrV!u)jqihaczNOVD0V!& z7msFb;arbw4i9%=qUe8XlDiCIYF)p2a`%GpsG%q_dwLBnPw#S4;+#r7U&1-@TB|O_ z1y>^)BJ-PMgYynB%3yj9@qSCdVypB)C^Qi9PTx;c1fEE1ptuTy;*dYvPhc0RY5GOxDZKxKiSx+8PAajLPV`n=G6PS!F06P1iW 
z!PJcch9-1&YQy1kkx|gE1D&0ZC%z>vA6_?t2)^ZJn+W@Oa?mL1V+r)^UR#AjR#oed zTtVrDg@sC#UyWGV!y3p$;@LB}iVEL%C*NH-5nhjJN4PyNrne#x9#vRW#^ry$S)AtL z5EOL`lk3tgZEt;?oCAc&F9sCl*S;4!rg5;n*mFqwVg4urBgTyF_$@z;&XC(cp0jCb zX*ua6-kcA*P*uqXayl;wc%`c{0gqTPiEwF~=H})dhJ0Q)K$|nGMrzv+q$Ju~m9;nL z8dA_k#>DiXnCRx|tiA{o;uhbx8O%(JuT8B4iP_oW9!eEWJtsH3G5~Ec4t9d&fIE!Tb!uoddYOSqxr-5#Wda$BN10mH?xmOBT?r!pZ4ie7&QNtWYcMcac1KR3 z2!RB#tN@s%on}2JoI?BfVIYpO20}HHkqx&JhftBN*=29mO_mvndCTap zkC9D>Q1fZJszyhT98qp;1seg8`bW++i$B%pKSu3*LM=umG{SZ0QxLZjFBF}Vv&Lk# zy}eEB>XI;vt*(~mXaX)9LNkWsYhnih*hTY1%$tb6J^Zgs2F;`<_j4Qd>~V~ z3wl09J{8g7X^rBUoSpqippd6M5>C_#wNJ|idP|1mFTrh^yuoe?M_&zEg`f-?=>hd{mMUPXygZr^e;jBNmt#y1ucFTQa|Ih6z?3ucO8XBTB{ z&IIX)ORv5&@S4LRTX|!ZN%X!y4)+#VgagbbIal=&X|8Bsel1H#RlXf+~6M$Y_4oJqQU-OtRTaYOOu<>@D%{Gv-wJc~f3RSHX z#w7km9bc>f-<(i!^X%E*BH;`2SHQikzRP>YGWU!2ROGGa^n?xar>S=^qvC4ep%5k& z1{URLe6NECI<(F!17kGnHrt)|*>%3pYT@H@*c=Jhs1@4px4SJf8ON>NA&fdl?rHHg zx3ApB(-$VW57QXkM;h}}H_5DIscs}!ZZz9(4$klzZM=jJRP-M;%}_}c26=&3M2~-4 zt5}mZ%ECL2+$9r?wqYD7Zv75VUajmxLGK~jRl(gQL78>EBD+MNo#o`U`W@h;n>@9XfSIC}+OJz~8dgX1v~Y20DKL__H^29Y&?NTTdugi$yvh+@Hjre9TjUj)D9_TH zG#)$C1uAR_&54d%_N>iMQMd#$2FC>6r*xSICon$X&Q*UcUc!A-<8`{flDu{+qq&t{ zbq$5C5zJP;>wz>fMQMgttpoprhM_Rg*j2&@cAA+lV+3l6BnOThqr48dR_9iJ>e@nO zA(glat;vj)mEuogaIIc&blb-C^$e+juM;gT2iuJYwp-$1aGmG(cE3;rJk~*9R+7^H zB91tQvklI80akyr7u^v|_9)jh2(wx)<5MMu!=gdvo08O}a&g8 zHZ>=Ir=L})Xg=s`eS3UAn6`^F;X;))`I?^4Nf&92n|+LG2b+%{5qpO3BM z->ZLc6dMIZQ1S)Yc`66MgpSA6acgvVR#P30gUV6Fb5XUrTY^6^W^(8D0^Aq<7t!Oo ztym2PDocHF2ww_PNV(BU;V^%OzY@SFvTF&gl2UI4Z#5;7l2$WvXP>dz{g~kq?YL&8 zFW&r^RoOBnuhXRoJRRuR^%GW~(W8d==&8)WZK-}3Qd9LR<<9a<%ZvR3Fs2;3n9lqQ zpzdQQ;~gsXyz@plarJhy#UWN&vwS}RN#vM*(L;U-FKP+D`j~J%b>sAxfJs*NAJqdm zn%8)XmUA3$v$$Q_`BJQba!bPQ=6JmQro;4J*~liRoT!Di8dnABp+q&t-f>a4J_dms zD%NWSH_41HL|ZWM^A&f8Se!&xtAkTyWWSMkRlRZj=W2zKS7a)k;b z5u5q`vaVoDFp*K1y9Qt*i|SPM2p4xuXQQ;Kv_<0KrADM+xtRrtz|rY_|e8o?}}8UpCjB zXF5WWQtDK(SjR^E{(Zj8ljlUIayodPF(J#!4~S0rF-E>Uf(4I0AMs26rdTiJct$xvSlQoX_o-96iVoDIM7NV|R zl|rJ*1L2pT5qpBcUE?T^R>qM$`!w* 
zDEytD3Qh!2|zlbZkW$oST+&+Etf zV)!Dj$O5(GuR+5?2XvGbh%v>=gc725f*t#FD2|K!$6A(dGsIBt!0UyBpm)+^qr}dH z*407@5tJMd#>n$4;6{VsE2BSoJ?3}iuG4Ds1m^Je0u z+i`PJ$!yiSGDgy2><7sDA?L^yuG0ecIQRSpkWIR;@RRBW5nrdn2$tV;oj*VdqMoat zMsh0mQxN#oMqIfE)h#>v`er|i!Bn$KA23v9TJ>R#-?IccfUP=3ZxxtFYN{a*-J8jh zN?lDm6JH=EjqFKe5}G>Q8TAmR4%jRw%v#c|Dkb_Hp2!}72)pcTZ76AWxVmg3_&I=Xh7wz0 zijk^4j(23Hg0&b=#I&zPKSP8{}Ll9t6L%id1e<&9Di77~G zuNuA(mwD}@=yLw2geOPoH}zUIvASmTvRk?I-x%L<3#k_)5sXyZ%>!GZ^dXGhR$iSH;U&L3ED>6(^@{bs6g=`Nt6@s2V?#p5mBgJ(I>8^rfELp-htJwM@||k$SUP7 z<5HTSLjIuGQr9?;mjL7uk`h{C!#w|@OI+KC49!Ft^AGgac1sMj1djaScA4*UGM#ZW zkd?BnYCeThgYywpaij}91^b1|fUQ$%D-o1JLDt~YCv2x!Se>!b^_eI(YE(X5xEnjQJot2#ol{_ zt>PvBk7Rd&y(S5X{sIeCqQfw~f_I5%fZmQ_-u|1ca>k=ZgO@slLK*_ zRqd6#Hbl&#scZRSNL0$Jj#ygG0>c{jV7+5Pz5I$XJixcS!mdcl`_q`Ff5S_7rt z!V%QyXAg&0`#!9MKjnmCK)87ll;uwIiLC6{h{G%(NmRbNSu7XxyX)rz0?&t{kpeGB z&&=QU6SqVX*Lg<{I~6msV328*2U-mu>)I)CdE*mhhZSLs4De$$gWd+2`BxlPO*okPp!(19zXl; zqa+8?N?aLMW#s{q-YilfSjQPaQ{S5B@;ya5!_-zwT715dxkuX{({G~uMiAN4H_*ri zIoOXZl4^CA%kGX%J`D_K#noy%6rUYj>cs<%@Dq{`ROno>V}6Jjqg&OXuIorVTupKj z?mOsCToy=SanfWhNSc0*>wMNfxVv0IEAD=VlHj;D<>S{>dH?swMD7VnFdnM2IvSx$ z5^1?PQf7v-85%_%Qfaj7$**;L)cNWSLp-WeG(I!B7iSYg|4L?a@*){<1Q3oo>Fu8# zJ2KMQuesL^hr)>(Q3-+P#1QP;=*msr0RzXi`F@Sk`zkVc?{%FCtg*U*SrNQ`41$&%Zq$e!qXQ!sVmP$j+Q-tm@ zFl07ZIWa|{vKfh2udDliNXCWD$6tT;?#R5>=?`{kr076)SXow_k^vj8zoFrQ>}p(K zN($M8_@p6C3Sl^%y&+Y(EVYxsrP7pA&vir}z_o-F^Vwsca&0qEHY*l>s}>a(AT!E# z3*~D`^b)w77rk9Rew!J7)7w{!>Ruw`S15ug`9qy@{B=VuhZ8`IseO39XJPyM!iErJ z03DLSr9d5LUfSu}8TAv_Ww=amM7`B0x`)jeX{`{=+*OV^r@!)5u=)t2Ni#Hn`K)@Y zv!7sU?lpFHKnl;=D_B&mQ!l{Mnz!g_ObDrtf@q z$paDysbi=Z#_TNeK#esQ>}D4FJ?cZ@>?jfuct$`>CXNV;xdppv)l`_movVS1pOdI0viIv-=N^NEC1#$ zfiaE~lrA6xYSR6dW))goWO8HtuJnWa*6MTVu=KT8Bn_LWtRUv(Cy4* z{0dj{An$-5`}H_RjwdR=qC&>T%IcbrpX4413A8m1K9LN>HNG*`+28Jem-bRPsg4{y zI@c5{;`_0%QVun7oTt|5m#_J?*lP*OxL0!3adoQkeQ&>hNCrU( z)+qho#bk$cKyF$*^`8HDqJWZu&qMj37Xa`hnMi@uh%47lv$8G$VPVpE$yCxy;q2)> ztpHAO+j%MT`qjU8DL2Ri+9Z>=N2%gI0OM&*3aPK@J3Ku-G3DjbHWF>NYnPQDEkcuV{ps+0r_0~ao 
zY?yD!*~<|63)0hTfu!5QQv}_k0INkxeAsK1Fy^LW{3#(Wt_x%gk^PDnPVN>xfCJUM zy%{01Gd|iWU$uS~y_%oZaA)2=9*Dbh#Ul7`5#(x9dLH%!`fqIjP7%gAeGauq*e{l; z(^ww62T6H9$d{=S@6HGA>I7y`ym50iNU#{l8ug`FT(VI;QRCV|j4}3Ij?Bb=8V7)B zpFHiRhQ`Kv-V0!Mxqev!?t#4mEFC!=*3i(vZLdj7khlMc_zpfjC9IjL6}#LZu~#eD zmB=b6=m7d;`KjBvl7n}AA->A6VqI0!J4*>ke0Wh=1`L0wl-FN{SK~I2Xp*WKK^Sdk zotu}RX{He3ORbcD3-DFu7 zE&!&`z6hi|Xirg^sBt9Z3;HY?BF@iEPcIpetV%8dM5|O+;Kg6|`itF>2YJW5Z-2J} zRm7x=+h&D)Ugw*XadsZLqmG*RfgU-Jo(nfi^bwNCp;HW!bEM9+hV)B{(FByDZEoUv z;>;yAbCHWZmL(U}E+s!4JAPg9f7I3Z^|#V2G`Tj79fc!C9!%k%*>3HTk^*^k3Prg?* zDPUL}9&dOm2?TIhfrk-C)5nlNsmIQcCG6iLA}?>t!E9J9AOD_ z8LGZ|5kk`5zC9IEZ)*3Wd;ZL>W7yp^-^y!iVk0{pyLElVEUZw?WFeGQ@}hfRG)%L! zzR$-l)`bgx@nV{<Xb zLG1RZ>l{vRRwP{7y6C%i!S=l0)&M5WbQ+^veFK4Pe0V^kzyf-m!#$ z(X;Ytybzsg2{+Woy@{mdrdK2zXiv!q!CjaFR?y?{-Vrt@@+6oSMbo1V#|3pnEmww9 z_Ly9SYxYOvf62;3;0?}(*m0|b8pF9ed#>!?AB?8YZ$R30dqr>V2Fo)T3>GWX6W3H0?nnZZ3Qcz*&XSm7g5(g3_&%b^v)g|mprY#6> zl*v4+e;)HfeZ-2^EmFJCd1@G&-s^naknuRpo=f1Ar4pU5DJhunmS{oP5R2`SV$Qi; zcxXcZ-7klp;99v6jVp3{hiZ~p>#6sya$OqbW^?JkMldMo16fJgSA2D&kan5O#Uh4C zw*03;wQrs*@*yAZ&SR+e$rUfjV#~z)5!olc)62LJtji-rzX)QxNAIW6(0*EvMxLUp&B2!Ewo_!|SkNWR|v2%{|o@O$(+1FP6`Nlqw)IS*>rf~(#;K?FttZG?Vz%3E zrEvzpa0LQ~$d)SgXK57s*5T=R*E!Z+&wj*?jfcm^<(cWz4f<}Kr?jMte0o=0LubSR znp`Tq)J{d}@%g`<)tCEw#n+E~XBqbuM41w`!+QE`Q|a>~vyftIVUTTHCg)^atcnE& zUDhazTHz*MwQ_C-v=Evze}3LhVo|;zndF^4H`!*oYP2)6J94XVESoep6nxGl;_A&u zUq_#_OS^U+^<)t!>{U(j`VI_8qq1fu=aSc-mdzE?&2;s2RqPCmm-0scW;FLBO2AoW z!21t{eEyXx>sR!;Yh| zSqKr2XtErpE6CE+&K6&uP0on!xMAhi@$=_Jfk>$dp6+LTR5aFc6on0;Ux^*P6Ba9- z`2Ym<&psK?W*l z)au*#lhA4sFeWo!YG#;=Wa6D|>M0=h^$nxHxOjPVwFo-Se4J4!n}~4h2@$Luwm1bt z=b(Y+XY$kX%8A1^c2Ved7>2sc34I77mBy-NtQo9uJ z$aJ48l;ApC^Hpi}EB*fRF@m|J`NRg3%dW-CTZO!e3Xg11nR&=1Rtfpx+A||>P19T} z_6PB{kMcz5H9=II4f=U_L+iGkEXV>odO{wOzl)5VK+$f*jbh(t*n{rgy^BQ`*mws9 z4&HwD?Ae@R%T~=!u;BJfY4_c7&1v%=XB|WWeMgcj;29ZH-?I=fA7T#y@%7S?&``S7 zDMb!^#g6ZcBf-v5%MtH90a@K>@vReNajMo;az^hByRWWFErjKG_2arorpdnDtkHsN 
z^%+%jp%MeDsqA|MEzro4azQh7yLRzXj=9Z7$-IvZz9%}4iyYFm<3VD`7V+2XkvM!n$_rY_f&AcyLu9Fg?b9Rgi6;bk)8WKrc(O@Z0j+86fuddfhKZ zvb;y#BLj^O_Pd*%l^X)FrF&65+)vzKZ*YB8(QhlmPT@(&NQPBOxD3gDw`b1O(y|p~ zpXRIAvi%d(yRvNbswMRGgA8w$_7ty7qSr`sQJ-=eN3p>B`Pw|*mk#!RJ`Nc@xA#8J zsER-hg+kY}XEu8?DtT*JvzyJ={b>ipHN{BTBILPb`Z@e_THcEMIbW?ec#^=ewI13l zvc-=X#@e1@4e))RvfF_f$<|=ocpJ^`oz;SZxtH}P-)RK^6Hg&IdcYW)a{`bymhc1E zx3?=eZQtDA$;%0fXLwalJrP|CbTNLmaz%EOmv}?aheB$&4#wsPvt`h0jdEGqWbo5; zgZ^sl^WJM%>zyW``Ok%yRq7-XNu=p3#x$kxTh?}#9KAMAJoehwWkV3m%UU?_tEq;& z)n9;-g1PTvdhlQ`Tc=$7N88x`k|kEJjInn!Tyd+_q-Hucpjm#Ag|j8Rq@h{Aq7U{F zjVN|B(d$!V^qjl4c=+Q@**nKM0s>C~^U9fxU%&prany{Z{#*|p5_Am=5@DK}qK z9IA&AW3YhgsaUP1CB~dx>c1xe%gF_1N>|G4(rRsWCtI@grDTJ~QM$amylj49GbPvw zk4m<5z@Q^tXi0YAeC71*60u0w#sgeOISaC2hQF^S=)=*3Y`EyC5uvlDN9INa%G?%Hhz5N;!3TR5b!|{Hi z_Rc70hyuU^T@?_ya!@$j|F{(JTWB>>;5}LsyJTOXnk+L{_To4_J^9+zkSr3<xng=`OL))9P$xWuUXS3UN{;bEY5;U zsKAFy(;Hqub6f8)Lb1B=v5sT<%R{TGaX{uqz!P=r#`U^#R=Kh$yf9+}fIFR_eGipp zFOXq(znl|{UL1Tm!GaRLPPFYu{v><>zV;F9`8FqdsG6Xle}$d=Ox59g6e zy+~)YC}C|rmZ~qL^1PgdG__J=EGn$Cb#>JDLhvcy^@fzo$rr1^&dueYR=9tp;MYKK zu~&~#3gUS|Zc&Lq&nE*mK0dx@LUG0l()^1VX76%RQ4JSh`2wA#*-3wg9c-nnH#}hF z*xK}0p&H&)2Cq7pwa1pKZDG)o{Q1zz-{>he=CiIL&HGv9?DJ^s(jE#hI$k?4bCuiC zZd-R7etM`KG(-$<(zBdZE(5fQtmbYeBsA4^3=c~1?(A5b5RsV@YkIX6uf~kZ7j~KH z9B#jN?MXH#U^kQQB&GtxG*1)e?qmM!TdbU#l0uifNyHkR_DLYi27qCZuIBBGe}w=9 znvl-@p28&^c2oVU-m~vEp@*+tS51!>l0&j#=U(A!tN8*ajuOYIXZyxV5gWF&rAJ*B z+8f1Ya%XcXy(SaV04e}+D+c41h^bY$!{O{Rs^};8DjyIWIV+pkn%iZo)BE~1I~Avz zEa*i?HdjqgvdolLCY8wCu6Dg~2RcQ_OD_P|=R2?8VtdWOdC(jrLr`JK@>9z#ZO4^P zJ8~{Muf2WQVb~aqGz;jlbQIyc(8q>SDPJ^jee+}GcsokgZ-_7iEdKnRdNuj)7at~& zMUhgisa@&hsXl>$TsZk>c0&i-w49sn1i9{{xLR0zTbIfv zW=-}jQ5)RJcWdFmDfY}UJ0mX`Pb8?`;4bqL8Jz(fD<#Vc_*wbFT4HG6r%Q3x=bs;< zYA#jIVoxqun(4C2%KZ*aozE~oWNc==v>E*QmX&L_B?63}f=0Z8H+pv0Y~bc~kdNj< zHyn+-OYvr0s`JLZ{CJqeUOfTO+0r8fa-HqRvC_Z&tlSVNR<4&NK0-~gb&#<5sHWj& z;M?4Y1m|k;?2W?p{dU`na&ukpOi!2HB;bfYfyE?C*VwB#5A$=8)nun;93#Gt(%|5xGm@>me^IT(=$MpnPSa!_hThr@aBhUrvD4)%jPQ8PH-Ct(c 
zgxGrkWb6;g0#nMZ29_HZC8unF=wuwQ)wuYp{~$$#sT1vY)rE!NzvkisUnyTKY%9!2 zPr{lXpC`^Qv>8ov_maUk-Q32Cn|K8J8%MUPZN-#`4%oti_ygCZNZe|;>FMct%=iJg z>NSKocy z?>5X-y*%!?dCmCe9ZAy{-_dzZHiB0c8wTB$W(Zl|2=K6E>|8reDY>#NBqh2Kn7W7e!@A1OvY|>QI0u`%!A=O;{ zN7R<8SvnazdV-OTs?d)iAC^dLgh6hM%RrBFpW1})mjL-nz-`& z=HY2L?J*X%d9$4x&I0Xo2kYC~=)K2*4x(BY_$a|S zWydP&-IxQUdFEReW_un!Y?>FoAB4Jwd3L>IC$T0S`I~sAEw5Cs7ARP~<}IX|8N2ts z2b5=BOt%>XUIa+eNY~9d`ped zgxveokaL1(6y!uqqfL@u9uuzuZ}NvQ6tX;3%A&?-V2!(&USxDrX1r!5+o#ERQ?=~N z&$+^$Qh-NrDS-5=(V=<^J2L2-KxX)WM9|@OST?S6k^D~%kxxz@Z2GTI$apeZvq~o< z6pUD~qZ*qoci}ilNuH6=NBS(AmT*X0H{%Lg;*lMR5JkgI)^Wa2jhst4`ypVrD-1xo zDolwf7{3u~y*1e0cf<#H;j>M#NiPX#s+?~m4-c2$9XMURu_&>NBZ{ZF)31bwLDJy! z#WSSAIw#-W4AbJn0z;cUUI8Bcu1SU9kDdW`>MTA>wwKi-RfTO+s>Y*ED)WzVN-y%5 zcwhg?DS@JBT|hG|_0Yk9`ft3n(iHRtr#SsZ)Iv!c)}@DPV7hFtMQ3CQ%f27WUw|Hf z#0Js;1MgQ&wEunl28v5eZ$6I&h$CJ$?bkx+ zRg*xiszHN&wh?lFx5L}e9sDd6?jmCmAS-`^lC0qgT{_z@%B<&r{?hvw@8>jJU7IXt zD?dZY=N*>NI@BC>MXLco{QoFZ0;vWh%u){lqSyHXBHUA6$Nl*GO-f#Uts0BU#o$gZ zM2n^7DXZo^3dqwWpmFx6+gBCUGL5awovEoQ6%`fZl>UuQ-zN{(_z|py9mEIIWoM(d z+$$F~87+W1NS$F{)bQSGfJOW*lrrP{K?hI@efZV&nGYb$sWL%-B49)%KD~18eU4Wi z6%~`H&k|=Je9_GvF|c0CeImB010hv0bsK4Frg|G{YiNLh=7!5L!DrX%W|Z*eZd%qo zyw_i4=K&(5-H9+uik9V!)Q7)9O(nkdUWZx6e&i=pJ{>u>7j= zNP!?7`$=ZBgnv@Ua-Yk@xM z1?!)e*0)p%p|qGt*8}gAv-XB-rjyO!xM^2rXH6Scyjt$ZU){%gcb8@Y^b47mi)qb6 zVD57+A^G=(8i&_AnGNOT<*vN4Pz}XIBq5U6lbveNJ8oUd20-uyN^Qt&6^!NubUE6a zrAId)1+uq#(!NOom(e^o{i@CT$Ne}RKZIKVy5F6h$!JD;{l1d24br7m>xqc&*W&L{ z&|>5eMQ2D?4dMw9Xn!RQS|nqoxV*vW(~^a~*1aGE7%cOfVlb0(9IbBqV#n8XHpPr( zr32~j0Fm!-<$VW0{;3&>FLK0eEdoie^ljL_>NoRL?vgGQfo!-*DZgzlN)h3O0GWwC z%z5iwgZC|~`^{bfl!~wJoU>DrQE{`7)q(70RotZvkiiSf>2*d`?}x`c0d<4j@~|18 zvqM^Qym+Uzc&S#(*|`DQC;mVUwcaFYfF3#*;PQ75FHq;2ggpRoySm_4UbE4Gms1s9 zQ>rO2($;AgNyT4*F6a2VW}{R75<6jKFw$E4X+p&;{40-D>h*Jq`=Wvjyb2Y?t%%Da z!yo>luV{Q_6sJ^f@WtqJ@0h?(0s=A(2 z!DjpO7yYEsKWr1A{om1oO9E4{QcJ8C+vz~uA-G%RrfzJDUUr5L8((z3gZm&jaILjcBjVs9QX!vq*;_MjM2!WX)d>hpJ z*RDX*!yLYK>sn?i(!6qHF{0dLpw_5<!Kt^!ddLEd%3W>umkaI9!r7XNigCvj^#)(UaQjR( 
zl*pWd_?{!}URQLWAT2RJ4(*jv3i+jMv{PYx1r=3zw#%gNo6+sIynp@b?!Ld*X_$HD z$4(Q~Ol<~;Gjb~5b^)M>830-Z!}WMqP$w@^+57hKANw6zu|_Pc7P1cWKbgk#F-s$B zTT>n7A6V2Ki^}d-KK(6ss#r{tdinE%rIC(A}SY161aKK z+J}O4Yz02@Yhet}i*RZ0XXlN6F3f>@Hw)Pv@NVTL2GZGW#t74@(5OS74&-S6ph^Km zkX?|QeJN|T+gYieq<$h$ISev81+^inBn&|NWvuxjL)mT{cipcZ#++dx)%&MKRIJKr zyVCVt%z<|E`NdJ>=ZqS^t+~Xh%Gd_xbp(qfIQXmk(7W&jeMhXXW>y#$NNcp4Gc;c zTD-`c;@Q4e-ixRNY^x<}zxI&>koiKs{+7)D1(*XBHq<)Z!Q z^^rEe<5w+IDg9Y!MBsh%xn`8-pe6c^1Jw1sa(`3kH6G#j^GEbgwsy;5Bo~0EKMQT( zSt&h0X9Nv1N2KN|dTa?cm|J-?PQ1{`~rLkE58 z#5`tr`!1XXiCcT$Bd0vnzqV1VD=AjNn#M>P9d$IIM!@_8pdAc4jw63nZu{jQE5{!$ zw=JlK779oyzj5m>0nXawio7P~NbT*=W~_1@e&w*~aWyc|1d(B;?df^)1;_GuxeZXd zC|*cOP4&czNa?kj$VDi&LA)$DGA>pjRT|76^#o4WJ{i$&QN;s*n9vGbpsDWd{7moh zw0(U+LEleHgpAZ1*U=2_jT!`iNJW^*ww(iV8%8Y+V}Z4c`&n+Pd6Sw&#S-|7&;Bp% z&ENHMdD!zHl_wDgPVCa36T2EirL8lVcYpNcv(6x`02&Wwixg0}u<{3}SZ?_8ZU*@L zh2~+PwUHVVV%eO}CzAk%FvD8j+&&+|t#)7?`M)TtaS^KB=Y4|fr3Jp}{oX+|0?Bj|!y@!;@uVjYPXT|fW znnwCnX!@#+zG{?uei91HX*?#o^wh;`b2c0~^LsUE0U~grD74w1lA5RoB5B&f56^xi31I(_P=2K=XK@b_=e|Z?u zUfa2DtsHx5u_Vllet&&Dzzo8iwmXbCb%FlRRSp;=3QL1%_a0nY?7NZw7@`^n$f`x~ z$b6P0VR=0-HJ;Ik=CKp(@T_8YVC*Brn9d3G*GaYE+}q0hWn$hpd3*)W;$aeUviZ*>?po5Tt zL7Gg+Z^%_7V95N@i$9DKRTtACNfD-LG9PjU9p)p%&DXvzTy$Ug)Fj|YK1i{VnRU<5 zL}*d0qEPg(pnn_}6N;!D0H&Tqu_{Mw8HfaaRn6$0nsKs}PnQNo=F9S5Q@4uYmT}=D^egqK$1c-6J9mM4#SsQ=Sx@HwQmrpv^YR!kj7(4|{SN z;iIj{ZC#;biTCvGW%Z@xojAS z;R$+ed%Nkqcsz-cygse*3)q^A>kvRjLCGv}>p z@Y`)|;DmS=4+_hc{=)=Tji^DBw&8b$dxa506ny%$HKWf|MWNF=qoALP?=&F*AIarC z(6Dxp!eUwm z_=?omi1d*!d5)Pc@OO{V`%OSt%OvXRN@n{-ZA(rP>1L!K#%{m}WYQZ~df(2xsmJ&1 zC0+e()3gbWQbcwpMxQ#*SjV$!MC%4(WWp1*13&&)Xr-9TJX5d2(%(kSQsR;r!|4BV zU8%}4b-AI$xNa8to1Iys=oR;y)Pa4+5$PLs5{g$&X~R2^c&t7deOhXu`&Jw}xf%>U zMUbdN@f%-iE#l=+4qNz=p8PkNn3Kjky`o}zbRfF2rs^8w$Vr@!1HL0?HAv=YlLJg& zK4ljQ92Kt+wU|#Q=k(16b|PmGil}&rKOJoR23`pvf3_gTz0gtGmu~&&l>R zq~c~;R|$^Pv>4B2+puY15L*RtX~fcVJSek*>16`EiWTJT z=0Id2T*Jb0v!h*3#Z+#z6`iiWO*ie%Q;4yIg5Io5!ylQH{{ zJfjxHRGUC$tO{t;m|xRL*#)Pw>*RGN=V_d{>rqmgs|uVfM+oqr1^hjBCG(=S7O(?c 
z89>TeWpX`@}q{24L;I*fm|k8QECnM7gV+J}&7##HK;m|ZUd=!5gYUE1}td^Mog zZipV)We~mPQMc4hxhr8@R9P)%+LV{{?=l|+)(!EBY7s+smX>6v4pCftBU#Cf5ediZ zD7`&DJS;LxhR~*7vTw)O6}suxVBu&X!*@IY8FF|2635A&1*V)s|%h zO%?dD^8V4oXln?EHY?Bj#jPsfc%nbvm1pKHtPhx8$pjsJpix=^#DHC&)n|M8C+< z3tzX{?WV4&QC0^b$rm(DXqb)nZ`_s?yhp8?)HWp)-I4cre8soH!YI2i>jUHHo?6Bi zb-gqq?gC%xU?;UT^GR2&#Q&L|=&2(R9GX9`fRL{&oK$kPS$3r1nI>k0 z>eqZU!0M8NKV(hhO~{YsIV8lR-+GlXTPb^FJqu33juh>tNWe5p-$mkinX(@`slY_) z)AF)Q)6h1MtNN7Xx^CKtdwInmD9CZhe7J9O^vFdDXp2cykj<`*x=783HFdJlZu(3>k|vXNIP8~&Bi_*#%Za-Ib| zk56t^Q3u-DR;38sWOaVa}7~HlM*~ZAK{CK?+Lj5bsx}wWPuJ@>97VlZ8psPoM z+(TZLqT{DvZHifzxC*S%|0{7!ihZ>BUaY}F-j*5-1veOPf;Tn@v2V_l>@{g@Q4k2w zo5|^$+zN`9%T&Y<+4DBl3vd{2U)??DAu7G!K7D?|$*npJ&yPaK3Ta%GP+P~lJS$>h zumPj6EC!3fO}Cq9aUBo60PNx2wvr7o)E+7w0F@wgbZ1zywWr1YmRi=I)-O>D`&Fza zgrWJ{8L1DQ{#44kZ&z=T@3Z+vhQBh^>eSBMFmqFlG29o;N4|U^siP*3*p`Xfhgpec z9pP@$%p;%e{fggzti10cC6Fgv zjWR_4@|y9TQ_`peRRXp>$jpvRdHq!E!1j^C;0qIJz<#TPU!>Bf5!#g3yP5#%ExO_l zIT#<&#|ds$!w4KD|9c(!F@QI8x+!!<4>gNU5+4e9+nGjjEa8FIfM;6O0F4W2@k|)> ziq_iDbGYWgBfThq@2U6K#&JxgNDh(3;*fdO-SJP0vI_?s=psiZ7asIOlHh%dDB#at z^b?cCG{8uW8%Q7YShmMbx>3c#Se>mHx7*tj{1L^W_RD9C-grqpJETRa`YXeOuG;^3 zaD{!M>LNH^YCZk^mk5}cm>~;?9|7t5@R@_>srl=$AdHFSWnzratEGW!4CRfDYAR(G zm<5ocvyi7xpGwGDX9ft&m##@*Vq;4v8wDz!d;z7N5Iuk~ z%S-Eh^6F=UdSIP)!wSXi<+qRi!Kj0p$F0;xd3mS(-OmPQlqRbKm=TPfms|@br>0Yy z=&zjH?`h|1ZT-)$p7n>u6%4VvY-Ypfw%3d3xL!2aeUIxF*}a`+&5TLHE(@+SMl{xS zJAY1^f4`y2^$C`GfnUF{34EjlZgDz{?+%U@3lCEi3;wo$gx#_7_QKI|{lvxieI58hqDYzH zoBy+CVnA6{%L@E4Hdda$nc_%PV*AF?Jo3@>xNVIjmI|%Jt(k2|2 zs?MA?<_-V3xq+|W69W4?*;%HM1iHWb%|TAm=`>1-s_N=R4n+uo^QMJnL2mDDe3ikL zOwswq`S9RG{#?5v(;z$(#4;968XEn~D^acqFhXYe0?HzwY*-!SE2PSROaY>X|Teb>7&Q#?OdA9I^yQ~Peh@g z3mRLF(`#f|Nphi!r)F3z~6sH`W~t;;MZ z2$ltFc>|niW2JTwd5;Aq{6>{0afV z2Cb*EO%j%wbi!=o$z25Hxmx+8csGM|r2PwGH1oGK=VsbK4D5zGqfV4b6#p9na{%?Ti4%71_zvM=Reac#j=T98x&w`#Y)M9nkt@2iQDhlC);x1fb0#9CK*H)yE)23OsAU_`t&qYEhS)j*h=vZ zsK%Jp`yyPc%IBhHXng)-yH}P1A(EEb>*x>IAe~PukTKUw0MEW@irq>0T)_9IouvqU 
zDKq4|i(6*`%T!=Zr^9k>01aFFst|tjhw{)A^ALkFb7SGqX+6ORup31{WhKYhs*=Aw zY7`^2MBRxmGpfJ~a7EJ+;RcJK8VZU)iXTTd9ox`zR8fbKT=BPcX(nh0)OA*P3d#(q z+5*8rBDqS@&rn$c2DhdiCKN1f=+914`2X_-8jtF^t1$3b?|SAcw(;33PRNUETyx@d8kVGfy0Tewf=d<1pV-UpNK# zxlHku|5zZNX-FjgbZRdMZ53ejS_mzk(49J;GxX4b$Mww_=lE+mG< zd?{2?OeNJP7xz$fwp66eC7*~Qjev&sST=);mbx}*ejp(-{{*z(>e{BGn2O@KAada{ zn!)*2lhvB350Sqjr&Y=cyx|vDAd_rp@PCaH+wyCxeHRuq0p(l9;M*BuU3L=<%pa!G zQch_$^YYGa{rUMZu0kv#2$#b?CvH1ss$^@Y!(|yk$3!XZ)Qq;y{kTa|xD@WB$OX3Q zpx;Ea&TCHTWgX$+BBHJ3&SAsXW3OAy{L4A;;i{|nF~Cz>WV(&!;UQ~SsTU;lSoH!5 zl5u-9Vda@ zyQKN-{Gl8xsNQZpA=cxjQV>4MI!Ut{tyT~9qcN+(gPAhMzVM?w;Iz9p_T6TgmlENh z$-fWGyI3Gwk>aA(!hlBOO?KbUpNX%(clJ?eqrYU1jhY)DC$}!1g_B2Kn>J0ry|Ul^f3UgBulBT2@EfS_0Fj7ZMn%+rYJq;8Tu{b!|W4`tOgi6<_s_QOz)NEe?1etjUK5nnOTrD8h z7K7{N6IuP8ZyFYsC;t&tmlC#0I5I3jSZ` z*FXL{CqiYfhEp14`G-GnfxoioKfWO9MgUGbl^9)H|FH}I`u06M5G&?vGa4oS&Qt$= z_<0I-Xx%ahiT?vCCX0Cq&Bgm%*Qx%QoBnes?;?Pef67U$h5D~N?5{z@@HwFYN1T^C zp(W5G&VPIQ>7fzeTYwGwf4uR(zMbO-V2=O)FJra8^5TCDqVhcq6!q?2^8pUF9mXTd zUzAU`m^sN`ME7%A7UUUvd{1F$^@rI8GBHa!4v&ccwrZ;%(Ox?=Z!M%A#pX&$uo1?A zmP!BsEfdzxELW87siU~Hjl*a2In5Rs;K*o>uxbzwP3Q+Aw!{uGa#9v*mNsW;KJo)ox8~ZV610=L3}7iR4ACrQg}Ls+)5) zoTf?>D$>BY5lWL%r9e+oq;_VSkrI#Ul!JvhEi_# zy#Ljqi=1#0p&9N$+2o%UuFJFiMe+9&iSD?ZrT z%ag}%u6}Gi^p?HL%F%8?a%sW?iYFICr;Y!#ia%`>84++Nm95zV&tQmxoAXt$l?R?( zdMO{t%NLq^h{>SVCV#STp%vumg64c{F@J_-wVhC|YM*PND3R-zf zN07EWWLI@RSN>M;N&MBktSqmJRZo?pLQ&5A&+D(q6GdhvCE+DoR}G7l(WUL$Jm0t~ z`k3^m;|OhvEvGfDZG;}q))JpmBA9eEY$2Zt*Br8MET&FchHLYXCf$$BFQF^%;0m=Q z8%Uc)KEHgpxa$*n6`BI~0T&GSV26q)ckpgvLZ)DI`k#HNjy9Mh0U z4K)aK>W7qO0?)C@>$&D^!bO9SatUf_UdiuiPp*#F#5b}^3vh(fq4qt~3NiEQrcuNw)DGK%k&_b<)n*LCJp#$WHi zwdC?LDYMcP8;lolOP`u!g{`6^f60N=&9B-CmQ}_q6J_c1;t%Xk!d}9%&o5XAmD*xO z;Du314QwcBY?|b=BqxZ3x!I7eg_09TH=EdmWru<;BQce!!N`LV)YcNSsdmG@eU(Rm znUMtacGnN49B`Z|Sh#sKs(Y~@*rRn5^<+DxdKU7@;i8TP9vI5*llXMD5__)SKD`gQ%cJ$Kk}<|^OxiQ;|qY_ zmZworGNMZ*j}^J2|n;Dy$E%Y7=rd*%j?Lc%eJ9s?{}DoCyv~J@tC+%Yn>7_;a zCI#U>87$=Pfg$pXhtnbl4Q>`?y5W4Zc^g3p0+U=4tqmPf(qXZVp^)fz{xI5rMtD_A 
zM(L+4=AU45v)6eL{8j_&wiN(xnpgAH@L#22-vhMDBK+<>2PCg|cA1Z~$U=n#462Gy zJ)69HxVp56D*3}#@l8_pxnBD`qr~oEKoFq)l!cE z1}oArujUJ0*P4cfqeS;nVo<A)!x{-Vk3e* zIp7=yk~eMtG*!otxZWcaOnB7+*t2U)Osg$u?i~|>7q6lCb#96r;WI3wrOkYI3GRsCAzNw2d zI+ch6Sv={RN(z%l3s^lDpp<4JR2%Raw5rHgB!sxx6+TWRM8iKBWdWz=!NI(1{8A~x zHgAmNfaa-uknTuT;Mx4Faoz2l#{zlWPA;%JIfZbDDbexOC5((IlW(VMvo1tnCSh{m z^AU2sI?VpaT5D6njB~?s1!_GUuHt93biw~UX(j3kdCJelZn$r!>0Fq~w1d?ydA}i( ze_1KOsdQW8%30S?(58Kg-tPcRnJu!Q2bIvVOdHfUvK+i43*yo6b4R?!Vox0zL>=! zaKTr8;juk*%Lo(>bHTOae6bMssf@Cb!ooc~ZirsbQ7%6Y-zllfnerGy-@n@_oMtX&4pxGRZfttQMw7 zx^%I;ZhU4kudU(n%5fnkD%5gp3tbw8{-;unPo85*f*BsPmVyvy^C8=n&#g8ZC55@u z5>EeV7pag?ZkNDNGuwoxvVHLR;rMgr=du!NnwVLLU5P5kOr=qmbx9m{$kMRVnLjgb zr1Q`AW0wc_C6J5d5|y@Xs+)3-x;*X%ui2V!1lH}`bd`rL*+FS=o zT}H2&$<^^|vkj9XTAehvuNb7bWJiiI>EdXDLr)5QB$OnNRUZTXA<9V@Pm4t@RRdze z^bunsL`>DtdEIY4#UYqp?cj^>Ce<{kvcaj%%+%+`95^taP;#$Uf)%guC#X7VZ=Bf2 zSB!hgp_y!8&@dGqb82~_O&R~#ymrfUZZM&$?Y_tazAXWHv(u!Le}6(s5V-6)X#FsI zsuwFLesH8Si|Ge-M9DLQm8~_x8ajj!vF?CJ&cz+t8ns^^CH+t)5c2o8lZ~}d@uYdwVsSYYh4II~<1% zr%=sX_q=9na8vq$lt0PK$vCT@rgMwmkfLE>QyFQBBa2{;7Eh18t|@T!Y%g-;{3PKWQWPeVr|5MWvHLCm8v#h+02vVCW#UUK2MAsNhT zZ(6*iM0Vi0Bc3_ z`B)i7Id4dKhn(;CG{V8c{I(J7*-VwRIAp5#PKBIwONgeHCNppdB~#CfR=-r9$u*YljrbF62}*Z;X?eYh3$<~nFTX@8hN(< z?Qx&SMR~mJ_rA|}j3sftT)VAJ=rdaHr`e#jR}2x~5l_spFEAJ{+ykZR!w8_p@NXj_ zi}?aGmVUVw{OTh>B@Y?g`k+Qbv1X$N+|Kj_5LJxRqc;V2DJv=UUv>QS3>6{)>e54_ zfDX4Sa3|K#%EiS+wdbvqeGTZ0T*<}M%q$kFdppFe2(a^&FTc%^GBYz9>#*4Zs#in+ zOEm$|!m1HhS5IC9c(Z<02qDyf93TBgZ(kjR%J3^7sx0%l-jMy^HXH?8T$tv%Fv^0T}fmO5>(V z6ciK@uY-Ilx36xEfEy@J=S(ui0g0>Ut3koR1$}%G_%B|>0NtN8kXEVr8fi+Z@pCD1^_(YLx?PIx|zl@ zki^CW1IWqNn)nVel=kw^r_=7a5B=8J^(LTH{}Jd{z+I!_CMs=OvfA6<{{X$362WMX zfr*K%k>5p7Nm&^eo&cs2&EDjNFy91WYxSsXQ6pz$}KMAL(QU1_t`gyqXYLlLD9s zI)l>(DAD>)3EsPC<6vPW&(tK;vqLgs>O-4iirLxN*dQ@`x8Ixl+Sly5S!_D=y#^VA znb#ixDGActwTH_~ry7yF+wNc30Z{H_=E3=AT2SrxsHXzBGhu-HYS9o({n_ZW3;C^7 zf$!S-t-0=P3Q&-d1~?S&Ei8r_wY=^RbVoHzimIyQkJ@jont+==^KtbEwe&JMon;>O zG9SL?`atW@A3r{tVCn!^^!^`Z5dD26q~V 
zTF)#Y!UGGw2@9ZauC^(zF@GCoOdC+$@NL-ydQ<4X^sI@v-X1nYsu^S92B#XGZ4XzQ zK`7WAc4M9|$>AdwX%%+B_@W4Cy6L%Vjb(DXPjdx?Uqea12+oacvSkhN?X*8B9;2%f zJZ9tfK8?$KQSdr|8bNF2ZkshyZW*|&MrWZhox@SW>-^Iz+~5}}PfLLtI6}0b%HXzf z8jnAl1N8e|>7Z>m#+?WYi5a+_8<@GsP3bh<-51d5=snM>?A-zVV`KA)U~~#I(pkx`AY6PLt9yIjh5ZJ&r@f33r(jQA_-IRwEmFKRO7l^E-zy(Mt?J ztQ1^60#Qa#>%*#+AP1!tS}SoWe}Fr1@86bc$IX;-7hZrfg5fOoUdowmJik&OioIZx zXdKL>v0#CMq?-R~)TCmFE8111H=EjsBV}jgCA3nRkmcAxR>u-VAaty)B=!$gIsCaudakZ#oC$he12pQeqS zr^J~bGHe~>@^Ha~&PV>)Ufb0&CG)InWJhZDPGUwZ2K6ZnEQ8{z`}4YmTFi-uPtc>8i7PyOrS5-N7$$9 z|BUxh&I|CXwjb-^cHdX|Npsc&fLnL(Mk5P@koSOc{`I$-(V;KwNIBjc_$%}{gN;NY zDFw=Cp7l?@d||{RUCqId!f7;xZaqoaM-}1-vxP-)4ET>3Q0k_}k{*@Q!|a!P?r^R! z7~ts)qCto=es=TTbu*h~p)Gx%|8n4z&m@-{dIVfea))nsx+z{%QW6(T=+>VbZJ;w0 zeJ+b|;j)Otrgrsh&ak)p^9>+@pXwID80rkcKfwX2>#b0f{8)zlS<>I2Zgu!0IuGKA zwRL=)2Afi%RT|&bAbh&8H5F+1-2A@QolnG0H)MMfag@P0r7D^C2p*X(Oa2Fe0Kw%~ zz~8C4fm~jGdi)hof;H4BKxl2JZAd1MSgS)IMlM%`(TPKwBK9ai+WPU58RzG=^@DM< zpv_J3cnHjG09y2ol^Cw)PP%gh;}rTdJ?jL2tc(?GbNu=WZt3IO&nfg#Z`St$u2~T` zSpf%h?fKw({`by_If1buj=-eL_05692w?M-d*A$3XNW z0PUHGz3pCUl&hzUSyL0ldlQzkv7QW+A#Lmta#_fe%%9n0sV}+h;04bB?N6rQcbcm! 
z0h9CQXnkX}FB+P3rOH`fDGs)0|rfBYyCxd z;S+&Y7@imShX3g5uvzstZ^$NBAE+ z4IGoNdBbdpe31gZ3?apVE;mDcA?FBV(druv$AbFm{a83 z9u}^MTw{&W03-DDtqE_MBe7MB;*oll_|a6VsCUD{w|D3nG0!+h2eQk<(KDrFtZ!%M zwX1b^z9C12Nh9?fpN1_Q#T<23wnDrlMo^q2Xp)m0bv^#5KLN6Hr&nXHn}1Igm~)IL zjGQ29QHhG^mv5YY__`-yId*eOI=nkV3sm?O$)Zuyf{!oYa&WES0q+(ZL) zW8Wc4zP`Vyt$!s3`G`O6y%a@J0UWYCM@I`BteAs-krukB+`vD;<#k=Tk#+}=+DIer zA3nbA7s1+enkkK|*%zoG`4A!wv8lK_?aJP}-M`;D#d55^|2%!WnG87|1h{fv#ha&p z+%;dre7_N4#GXv-F(X&bp9_=m#)>cSS$RmEzeZX2?n;ir?D{w>@9sNqmD8@qK&pkZ5r3{Bx+< zY;I6*oVcy&Qah*y)*r@-$2TX31hO`}=O6*FQ)=t$Q+laXmc-h7tsmG|3B~k?q4oa& z?7k0wZXW0^;)6rudjg*unmuR+G~h-=M2G?1hYk)7FMMe;-?kpYvJ{=)cVBFVq}umZ*GWmyB)2 z_U0g+-8N;*IJWFHdOZ2Mq3r~6q5JfaL}vpK(eq>a4y5I0c*lEtv2w2+q<1!qbeVAY zK(1my$&fI^f$-hm78b*2^_*|ug;nGC-m=EVSqJvOIFtsDl2M?giWw%&z37NL#ITXS zQtVI|P4&c*FWs{5Mq|i{?hwJvt56c`L0+AoE0S(b9npy$5;P33LmCrczIzBm&`WF) z6@*Xl`poew>?idHMW5fRWIQJT4kMLgTjk;unkVPPeU=B9Q&efHc$xq%z< ziWtL1AlwNqc}kQud6N{uB{15AK|bLUCJNJAQ^LJCU0n{YNO-#Xwz`9@Xlr+|)R(0R zI%P8=`m{5d+{xY^Xk%+~!jWHqh^lB}tEbn=rc_1&pX`kZ(6BeLfpE~|$m7cNw@FtL zfnYWAmbUDYQ60WguXp=VtgdRxJlZ*M1K|?T*_2~pV-JL3QRk-;bWZuzR%^+i2_x;= zycba9GhBBp43|B~iwvL~a;37pmBQN(x@>!Vct)mk-Z@Z@PC@!$?kun-@XxKa+_~I zIC6sD1Gj6qYJcY;l^vl8_oEL&-nkZXi+eKp9*;IeqGEPxl)0slDYJ>u@hO2hKh{Wd zZ!l~*Ac;3+$FykiF?_``g<_tCg}$KB_$B^+e* zHv#ElsV>x(p6Y|jCJpp{4bYjp=*WEcM%O=!-`gaQRAC6@A{NPk&CNDrQ#z+aG~o0$ z{8MMSHrIW%G|(ajr@%MD7vug<=(nzy`4l&a2p>&mYS^&$!lSG+mc1fbI0sO+tS4u4sw^ z=ti00FgKK8^84ZIeU1>^J+aIQ4%21CV%m@yjX)MaVV;ydyY^t@$?&2ap&oe~mImkt zhvGDm%ge{}(3Cugq?p747XhvpFO+LIdz(t3I_rMOALjuO!D6#4Lelmn#SZyzRHfM> zczBpu4E*2-^9DHt@UyZ-E3dApAUHU*H61GsDib)4>gIMl-5vFd^N#=vj6a1k1#5`X zpem`TiUn|nczCPK)+IwnGe10{QVqq36rq%bc*f6;e+9jm&Ua{?u+9KC)WO+VQXstP z|B?3@Nfi#=-KlgVU5D;YL0Y<`LAtw3q`OERE2)nu@HbuZCKv6 znKuE=i+5q0uU;9RVJ#VJLm#~YVmAcfAR%XTm;dJ+J-NnqSV%c)Es`Sds-}n@@@B7jK95#pPutK-Y?j0mYYqvE$^a=Ygydam4{eOx+A#zHTa^ zCw*}%ws6=Jh`A~riB1vz@|wUK01IHOp{AnB9p_OnNA&6(?La={_S8Qfu?vLKUKhXe zH{pHrMt8U1xu3wgbdE_Hb6z5RboUeeS1B3u>sx3=KAZJ!M^9dA 
z=ZShTvgX&&+FSXF!PD$dS7!IL*eQ1HvHw|YV8gaKAg*K*;7*G}3pG$DVYan*gdXf= zu0vsdM}l#-)DUzrQ-a}?uUtE~iTd9r{1THt$ru<4@;NLF-zKa;ch-%u-u4L-2Rv@k zYIXSUUtamt3b23HIr*P+^7Q2&U{{nopR>V&L(JT5BcDpn`GG5YTvtO35j1R1zcP6C zBa7m1B$e+9FxV5e=Ut~|;wwndx$dV0{4ld3ZE;%t8)kohCVzj@;$+NExBmI8?ZZEA z{jU=X1Qhtd9#V7csMGvgu=lsG1;QUPz@;(|JyZD?_3uyL`ZCh+X>Dc^*KPl=>v-n} z7}NoFbj{~~nu@<&!PAxT$6-J|&broXVya@oI5uhlV%a%WRqu(=f?#2wO|7ltow<3p z?f}w((H$^e3%2#@>Z-r3tqnY{dNw^dnb!y>yCSaDO3Ngwd6D@7@?NO`F(?m&^vq82E(#I3h_#^Ln z<9|L-pC)psO8$4CxM-i?msn^Lb4aI~GcI4RfhA1&k{MNpBIjS~#^A-=C$HL=XwTu) z17LG-3J36fzkD#pwOw~!nOa%J0l;ayE@bGUBLl8|KjL9K;Dfih%^H_pc!$=ola39A8DI5rBJJ+GNcBfhH!|4{ej zMRuO}kEsOgn=VYSFhzf*9bn>4KSMS8Zq6bOUv)Cj)3dDM?T?Dj`Zrqih7(r+Ih&7j z;Ef5h2;dL!kxz~cG5j}{d?8zomFyCp<~Fr!dKJE z%F-b{rbxwf8RZY`AFG#W?;UM+SdkGM)8;}18xx;psuubQQ)ccbQK+xJS zMlQFk^fKTupxBPQx9gt<{w`g^8?Vh^>ZN(VhuPQfPqGjIIaB^aE#^%q;9rzZ{ ztC_&67EeTU-zq0Wb6Hts`tf!t;XLfsOI{&r++eH8ZpT^0t?0pOS<1F*w4ROk#_!e7 zjk(`CUEyvUC1id3&d8nt-6)D~DZLY`s*!1-h2<91?<5O8Z;Zxf|Y@R!PD3uGZ;w3eRAXd>0OjJiW$SZgJ^e|eYJx)YNy+c@DElT zi#Qk&As^)y6li#|5U&ey1G7zat!#=_wM=iP>NdQt&FUM*g&Rai!Bns79wUyJ;z*mB zsY=sOB;y@p^!|qh1x8f>CkBBbe9#pJ5|inShr5b$~<`Gii!-y8G!Tn|3|7!g6R>j?dS?JhY0m#X)wc zY`qs|ocVn{DX$DL1!wxHseb0HW4&XE8YzE%I`X1)VQD*BtoHV2FMQ9dH=*W(TeOi} z)6PGaJ<5gA*n&26m4xTu3^2pwLX@62yi3<&yz2hQp716p=*xKZ2Q;+;(@f6@87c}1 z)7? 
zIW5=(aRajssd%u~aADs9^@F*QFS-Ewocm&)$?IB*8Ey^FO8KeqaDp=5b`l7P&9l}{ zc&ozWY62=n#hhI!^XOgq0yxDmtJ$gqN`_H`!8-Y|aJX|8<+)?88&w*mcekW^{cNik zY(G#!Be+KP(?=o0%vz!{GPA+**xUSxj?&5ejO( z;e5Tg>x#U0gxD-Ve^zgC+D281F%}sfa=%#kx|ENHWn)CBR^GhDIv=@IR7pFPc4qZ* z#VBey4ef`G))@o>N4IPNKY07!9?|N_;nt#noxUK#0>Z;TzeiER=oJBjn(zF7Jr^T@ zeU}}QiIzOQ2M!13E8vseW9i+e9L$X1R}~hT*95sdawG7UB}KR9!U>hx799Jln^W%U zCijn2XY>@HvE;eibI|vW34;e%$7|k#OEZ>|2jf`6&MG{rpg27OPGUe=ymkqMH_irfR0RHXg#XIhIIbDT^`iCpoQ+cx9aK zcP#^KTc~M^IjNIq+jf+dx7l7-bhdAEi`yXl`-n%j6$3r+2g+Dm4{@M9XaEsbSK6%?`1!GnLLbi&x=|A5ba7fru^6CQlE0&THGmU zM%BhdlvFSkF^+N!u&4&HWGirT&|=oEyBO1(T&P~gvshv;3M-ryJG>$Hy1mR#k&m*- z_5RM9DO>rtb%5beA(NAosl9;S@-g0R%Iz}^ydEklQQE?KydCYg`%unVMl+%3M3ny3 zq@J~k+U%U<)Zp$f|K&elM!G!3I7=L>AwY~_L=F`{$M#`jE_+d6_2y<<(OI?~h4F26 z2wLYXYlMYlw%HB%Irdp;5kxX)nF>57Q}x3dH-0D}f80trPe1i5j5Bht6<|F+S znqdvWOam?fq6HIh2qV*a-fwk)<|^7)4 za80*n$mGo%a@eM9_D*C}lyljCv`m$qKjjxZN<#qoq^t{;b`tD(#aSznA-pA1*O*jo zuz3L^@<>e))8i9p*|J zULV;O^y^n_;PW{7pldP^D|*+@_j#=1@~eX(S4_bY_)iep)2mGpgFh?xQwD}jO=C8s z>s5>kMNx*8)u&eRRvNlUd3O2?ap!_aqdV>zh^DXKoQM|JqDJZ-v*)NL+0*ZT#CY;w z_7!#SF~5t!Xv-un6m`t3b0N#@ZV=`P*|xB8F7^QwH%mX89R=gr#XauRAAON5SG`6K z^*H8(HG>Ya%$p7eJucsvY3rEs^%rU#ab=I-9A-v7>v~j64VmW2vyZ&Mit|A`t1hHN zMXR=3oNMBr9{2<9^GOyVdog}h5~U!~wYFwk|9Rxgo{vFSd0Z(StIW3>c)@qMsf%YK zA47U44NLVYI2W0JkBj5dfG9l2_y@iO88yO+5KYTschHc;`?XdHy-6_L<;=^qggJCa zTonyGMUFiwqFnRMT=n=nozg493>1eE5G4G0mg@3As8Iaicq$s@4-cC{l_gHPF&+Z8 zizx{4>PTuQ0~|HsX2`eP58A4_u;7IddzpZe!f|ot{?g{Yb zX$AA!19THZ6-F)jQz~bs_h5%2(=CNyi>VU<=V!=&!qN0N(81>0U(%SM-xe1YNdvg7 zGdIcjB=KMo4a-J9vv==rA{1;-h;?#*d}R#(6egr`QBhr;7KjNpHHMNeiW!Mam+Qxw z?$;5EgIEKQ1u+{8GCA6B2=ew#?2&h4JhYxyfc&fI-bZa-Go{0_EZP}O<2Be)Jg=Hz zrdsy;I)h*lKN-cEJ&3}sK5QWrCZ)~8k%3knIs2HA8+v;%_VDu5b&I-LIH8vC zXkwKUJ$dU?$yaf#Kes5Lf)Ne{Tm2N}s&~c*{&H8+(gG&6FoY&YYW$H(@*@G;(47<5 zkOSgSq`bgL>5q#I4x*ZSdwVe%8RYG?3sN}gEI}Jik0mNf-0O3TO1Y`V64hUyzh@D` z9W=1S*irI91P@9(4P&3ZNYRSo*v-|WA7l0Mun-(7aAU{~Lm}%CxyaLEF!Gf1;he!7 
zD`aD#XV2cdu1rV6o5s70I2dg{ON@9%1;&OBS}?u8_ww{S{;>?OG+8LO71)Ssgovy@ zeJ6dalbgP){|}8BOaC)eGn{k!FZM?WjOw(Q;NAn1 zoGna-^g#1H@11Xt*A_$~$l+SRju)EZdlqkHaCvq5-M5nM6x@Gn(tbKix}q?itvbN0 zssUqy!Ze~5?k?kG0Cd{f*})ip{e+$T7NUo~1UfLHR_gE(eBRr)Qq~a&g*d<_h z1QE;VPoF*|Cs8FaeRfItD=CqZYm>b7=VEA<@B=a|fJc!q45o1rn4TLu+xqsJhKkt+uQ;J41i3xBNQm%g8O*W7Icef(lt5H-5y96*BB;XlR7_ zh3#OwJp5c){o^C~13}&pI6c&Kjj?Asc$}%S>*{_6s23lf+U}oTWlw$jFNDJ8fz6qK zUXZWgt8VcNF$0y6wXhF*(cC@y$qN7WqKAi;Vi7{3X@N~)IL3#Qb$53}g!8j69)t_I z`u)oWBmkr_#rbqk0Nb&cPSd!h8t6qeiy-6C&$V_@+F21Q_~3`i2f)kg>+7NrsZvwH zioYJ{O9(UKN@eJ)Z|$+H8Y|35X5ZJCA!ML87Y2V_sr=KGO40}Q#_t7%5EP5FllgRm z#4T;T)v116;*u%?Z16y;T5=x%W<~Pv{Yx?Gki?voC{e5tXWl^KQ#qYJYYJnD4GHXV z{WG0?@^0p8tZj>AeQsg)V6$I}k>R(q;wPTJIdGp@AZJJplO|b8AIU6^689&iIXzw& zw*UUCW_lHp@P7Cjj`wrsFKYn#3-BHcQfdw~dU{aaB2bT_FQ4hX#vi##_7{q-f$2~a zg;~-Sw4l_(pS*h1n8m@+hg&irb5|WQX>w`<^QjceS_EaD)v3*A_@LFu;0UiMcHs3F z`(M2rNLBy3WJ{-q;@NwWrA*VKN)O>95yNnAB3<(w2&ig*ukDP|>zh6UXQ!)Ga9b%! zqs#cY-SwnCG!aJX7N(8k`1W3uzSB;Z*N4-B#&mRf*?NfK&#ti-g1}-4nr!@<0n(d# z7D!sRhTR?a@YV4XL#8tjb<=Ohy$h^KmhFhtErM2)O0Q@G5T}9lQbFLizNfbzWyFoHcB2#l3qf+PXuM<#UM zUtclg$mD-3@Bcd1J0tnn4QRX$)&Y8=gf@GCs;QBtlvF6b?}7a33LyOYg_sNItU$?} z1GMqzaGA7OS}DoC0jhKwBrZQO*1ZLNZUI@)ekNe7@XlLRH$%f5ZeRdvt~Tz1qKu5p zDWHX?I5$6^*9{J40|r#ah>?0s>=TB00b-diOIvPs8F0W1qV+deTNUNyHUb+o$L`+q zB!JZ1mUzkOs?G04oSp};;Znma=Ehy{a_;6Kp(JpKD})z&IYKM*t_UV|34l>~Yl>Zm z*M9c><}y7H%o`6_AVtKL<*?rYlrzi&iK9)-w$?L9u-4J?ZH{(#9D~M3Ai2-YI{$=4 zE$uM`kX`ya(fQ~B*~o#S#zugWl%0{$Zt<)fMQ4Q{&o6ZUpmv#+krlxKGznSedpzSy z-op21M{sfW>bS*OT5q}AU?TL{19(c2K0eR|#l^9uUiW*afW}}SFp%?eK3vBB6Ag-p z$MwCo*S$j}U(qd~)EfgZ))KCGKDXb|cpWD%-}NAvZYSIySbJ$a(SQ=q1^12Dd%}zs z`k7iSU>g9b(!RNBb3>rGV*7CCQmNe&hS65vV~4n9%d3Nwdx$zrn6eD8wa#WYl{wk1 z_w(z)zXx)*fNV+4g0c54wGaJEZ}epGSnp0fXJ_XdV>8+zLB0_qayZ6Bz+y+i7Gc+k zSPc!@dPBkHd?-}LSHTsgN6-T3g`2$Wjby5WFE0TIZ9H491yJf8TUelnsB|j8m81G1 zk@)e$v&|PM<$aC;v`Ge#%aIE_K~s=rfE;5j4E~HyM8smpZ{bJttR3CFLlZLH-VIEa zl3<40>*6G9(@ljG3LM|g4tpBG@-^YE_2&Q|Ka2oY#p==32sQ@bC1xFQfsw8`o6d3o 
z`8%PmiKRk`12HK*%PicmO<{PBYcLXwIIjf!(^)(Ilk*|d0St%~*EpJt#7X@m+F|vl z@e=U6Xmx8LY(4OOc|Ig$5(p=m0JlWZR7Ax2ssNlgYx++VU+^fQ|E9(&~W-m zmeykV>I%iJtnK?oYw|w_MEBTSZnKsm8H=1!KQ1sm^YOPSTwL0Y^K$bl_y6GRb{OMb zmA@TFa#eEny|rF!9MgV)j@p;D=+#)zu{Gv*XP7#uwr&A{?Ro7;j7!_E5J9~zVB_=q zb3mqeqw#Ig;>zH(oH#jxs0yWN__+g|e)TG9S6QWBYf;TkmP>R?D9-yqL}4T>stL zyd+k$?j+g+$Lfw(d*3FlN=&n7(tz$8VW{OAN**|0f(27$CExRj6bnwOlHaRWy4Q$p z7tewIEOiL_s#CDk_9lYLv^25gN!^EDtoe2sqxva%He|+}2^Q(PHh=@?%?vQ>%bH^Y zKP|td-YnOIB(O~$1Xh#r;FK`;#|Ea0<8czT#%(4t()-8fG{p zP@VBU$IkX}bZ|)gf~Z*~|GxjTUFUDr9)4D=HaA2?){_8KP6NROL9G_XrBJVCGC3pj!vv z_LRT$4pd3aN;6!9%*@PL(=|#`|EMOOM)5CkO^pP9&7EN+G*7h!a$)OKMEq%Wv+XUW z(U#}q(C~!*6;d8H<4w9TR`aYs-+^byCIr6DH}+5(>_Xg*1jQX$>ZG+;_vyMP;v$CG zNbFu|}n3EMPGH3F0CPxly|V96W9pBZ53au(tVB(OauB03=Vm z1!!h4Q>ji%rb#&b*nWgXSy`RIdY~dQS3PYOY<>eMApI14;-oU8LF*kn5h6RLT0Cy0 z{h;li$uPL-ezHgByDP)!S`ici<~QnFw_N99?1DgpXe8V;^R?D&`UFH4+fc5+wA`H= z3p~I2awdE1?pr!x5d_yKX1W#UpCLHVpN{$6O%HV^#Y7o2eQK;liKryRz(bbF@c}0^ zlP(|0!U|_fQ?kT~g|U_U9XX$Km2i!DjIYy>Sv z07efnfO`FV@(=uZ>fHtmYY5Z5oKTYMv#eb->S793vTnYZIv*9c) zzF6@?{{hrA%r~6@&E2q-K<{zXjs<}5OMQEF6c6wj`B<#VLUEZ0jPc9@C{`%k4yvbr zYAoNxEdZo((Fs4lLx9g73y`Xr;Q;oAEmQM(H23LuN*Ev1wMD><4xom!!uB*h-0Yp6 zkIVG~^w|%TZ366f*TY`Ci~P`Q%YgR$Phdgb8myQB)aG7?&4EOh>2PMQkAEc%AF+L! 
zaHk?O)H2bSCzO!Ij1TA36inh79Ar7h2x4_vUXuhNnbUp`GqdxPO4^;}2E`+_YB`qJ z_s<^YbJyp|GoJVStaHR93kwd(MBeImH=&C#Ul|mhX1Qy9!+<$)+yj zwZQwm5)bFMFVQVdlMXW|GyEm)?{1xXerAT4yD87V#FGda&Ixe*@OeDKcBgS=`7cZS zGExPW-Umr5&lnj&wJry)#_v}gZQEegz4Ux;M5TY#`R(`>{3xSVozybi8o`2v-E;q{ z>mrX5j$eNYc1H4VVgP%WAtQ`;6dJoAHP$ov*JSgHy0QClPOY2(*4Z2@mULDIepGO< z8EB2{y8xAtpWi&;qHo}HJ)AjLmYgaPLc+GuKx`%UDR(Lyma{Y{O^iRS=bxG4AFJQC zhVZj_0&6&^qD|hjmp1hEPg1zPJP*(>(O^2ck zS%lq`73x1Tq`%JN?{7{6_?Pm0)v0d+vIN#?+{{XK9xlA{ql9EI@R6vxiVv$S=T4Zx zM05U77h@eK!$&;JSTT<`oB4OO=sh`hFH2s3RjQsiru;^bEGD5dj2Is##~Ez8?fF4d z(_%MJv$RE=m4tFy`VUCrKTG~wn*PKibqx3s9AZ+Mg7TxFB)=esmF4YZ0@Q4Bek0tp zdSUKo!8f-nf~rILLd9O1jT{ApsL(2N$HE;fEaE1v1vp#PQrnLYW0B|KuUHmYoYZ|Y z_hNz4?C!2vK~WJ)cK{bBCr#1dIqkZ~zrOkRA32^f``B?Lh9aS4ycNgh?TNRkozvm( zTnTKYl=M{tYB~p_%;Al9c+i46>&AhCaIQ6g3cUkbQk*{nNh4$r0RRJm&X>rsF-3ve z`fgD@y^I-R*-yd-v~em7js#Zmov@mWdDuJguW3x>bBAlsWy3h!x4Bt>@Ro>t#h&x| zlWgDbFa#KymRw4P1%3-)gYX9ba_994HmaXEZv3DUb}?yt+;tZDF3npdpt|R@=d%tq z?ms&wf!nDE!Ws3FwAokejKPND*Y=HzJs9INGc-UwQ}kn2WDTr0$8*F5GN0_Zk6EMn zHia^|^@h0bz}i~~15tI*Tl0dKedPe3qlwfch9C;Yqa|bSV)Af*pOc^@4%m{*h+XT` z!}5JjqTGML2-iU3!|J8%0vcOzr0+M zr!^4~38-5j!AJMcuC*rXj7aL&K!f2;7c&FP;0?L~6iS&&g5RFRM>^=-O5?r6^@gX) z6&dpr6J_A6!lFvt5bGr&wp<+oK#F3LARr4XYi|FubzAcSOr z`APUC9@1co5N~uXEK*fWgpHV4o(1=^pnKZS13Mr1zf@g;9S8wG0hppD>UqJd;9xM-#$Te1`NG`6t(a4B&k=`mGh_cjWD_ zKO+mDpu*imt)Dh(F!;h1{A9YYtaYO!1G*Wgs4wbatwdp_U&(*?h}+sr)tqI6*iH4J zt?`hG=&uj>+Z(AT#i3}*!Bj*AN?xnWBfE)Xb#^PBRHqBy4+m~ULWjy)!Yr$a?k`# z9BS?f`Q;!}O7z7u_)++>^onu3WWIpHXm=##<+@0rXZ@H)4jg$-lRo-$2mb3Oq|YPt#T2K%8h<#43hdOX#KuH6mxCLVWEDC z?(soNmh$xe4#JPdCbr3uobRTTJ!8(yRuu3jMExJD9C`b+TOK&H$0+^11F;qAX7tNv z>|>tA_jmU^my+c4qVgjPz?RJhg#3QKW^Zqzb0F?`Z2=pru_RTEo7A|(>YL7od#hxN61yK*jYx)%qNSP>2dkPwWZiZSIZDc3 zI#!i+H+klXV^=@@v=8|vgAVt)KxbwfBPjfnIj?CN_vgp1Dd zTvKi!yF+~^4@CG@&Y)iC>g`2>8ti|&ShB1q0$l!h6i;aRg1K_cv)Xw}SitBnAi;oI z<&BG-=ot-GvmL8aS$0wkX+4VzQf`bHuw^8Ja_xs+fF|XoibcjHhV4ZL-vn!e##<^n 
zsZkBl3dU}%;>wis99C++;njTWc#THvoQ~u|%}HUqqOYoDot{_LQ4BYVzQ<3y^IfCl%0CNf~01=Yhd6vw5T5BDFKl(cw`k#dX zcFv#0=e`CU0r{-_IEOq2gBK`$eFcx0?WqBQum#ZY_`_^mcOYUfvRMFM!+sApXakBa!dnoCNhqJh**`qB-;)0l%WL=p2 zpn05u-5Ip!(9!bm1s)P?V+)D)~a_tIX|FUqF zXj$Z{{oc)tyB_V@3H}8IH$;x^+L{V6)_7mSsV_g!vO7(m6m3n24xTIcri{D0##n^o ztY)5k?->*7_o>e?+LuGSt4(G8| zD!+F&jpbS-Sok#c!oQl939H%ID#Ma{0nZ=o+HybrL*325t~vFT*A@u_1pUvK&-qI- zJqDiiew^gVgtrD3`!nTw2ps@T(zSUw*{Y#WG@Qs3NWm0Kw{mWSf_2!IoU?cUaIWTK zA10UGrZngU6RYag&0cA(R-O|Jp)-FtYSEkm&k|L zy0f_ye_W)N#I(YY3rkPZ1pBSZQqPxei@?U+1Vzb2YCj}MHO}>jzp&KN+AHbTHbyiDZ?z)L!W$PN{t^0KvKv*1XV#GgDnLm0X&|Qx?vk)iA+owO|4aXj^sH zG^|~8OqNOKT?9%Gz9v=H%e)%ZX0fTb^*E9O{h^hXKfm^$6(ZS$Nn{3_p$OMa z-PRmjR17T_msVfE6D%UmM4@?%JRH_UN7}_)*-<`f@+t3j5DB6GN^FoXL%&*&4DStyf_bW^RV+vh2*JY15tb7zjp&PhPVz){8s7YdGm51Yvo?g*6aaq!Euosd(pAc zZk>@)W3c>QNrD3|1XUrL!}HGhsT^PdWY{%&oq=Q_7{Q#(Mgs^;Yrtf?xb$>uwgrd~ zC;=30W^OKYy_GEyV=?kopk2%1=6nZYF;``}YA4+AUN6i6=pT8vnHWaM#)zPWVA|I~ zV3@R^0n|u9I826Zk+y^V`0)nlaIkOOb!*~XO892B(HM|(;v1VL-f@6RhFncIz6*FX zb`A@{tcD0cCpX_kdm0OQ$V10)ZOMH{z7|Gf5iUD9WYJ)My_}@AqrM|sBiGE$JniMS z#8XJd{v*@FjhSLic6~wlajUmwE01wVWGgGZoZ4d*`@i>8q7XkYserqXDdqK0g3G$C z7SZ=|uZR2D5!L$}#OupgbC2DxY%)5vwVf_gi*UiLSz-}2<|Xx1b>*@t?4u$?f~v_p zZZ+gB3Ul>9lQu9IyAr@F=7KROeU*?`QOMECQ58^6P<;yO>e$&A*h$|ak%xJjiG>p( z9@$8p;FqxaY7V9G5NblA_`Az9go1#1v7H?kO~!owML>cxq(O1D+8Gpdk-4I+T3UYM z&OuSa-Bms{r3{2!2=$`%sQB+*7GXTUvfZ617}A`17EkPAhq|}cC8YV2SIrG*q;Ag; zOA8Jx3wXxJIz&k-EC%BVwCo=?F?EUA9-0qxOi|_WxLw+BO21~jRp9-RmTnxe?Lt;z z1&lfbW-4_yN3Z)MVsS>;Snmt;G3guVZ>!Ro(N*t%bKWKuO*ziJ@4W*jNhE6rCi*$- zQ#^xLOOspNHzzY%U0B#ROz(GkMZa9yU0efEOk3P8HDI(o!4H2Y$sXD)ZI_sSb=ojD z#E{lhJfUStT=P!&oomnA)GwNfeq%#FZl=hFQf+LKt9Ov6wZ6u_Ya^SU)|U}x3qT0= z{^yJT1|za8&U5%@v@uJ^t*C-;pwhueBAT8-442Wv!Xj&>*p`3p0nc;&$%T45Kk?Jk zex~^tIo|6;(QEb!PzJ~hWD6Pv`GCALO*4qHNN0dfXiGph;-G>!lUSA=mN`Xxzte#| z2b4wPb$3FxLbjyfdo#p!1lu0YpOVDq7J$xtLC(i3?L(&nZin^M!NEV}(wJI)>4Ta;11}iVKe8>m+p)**^=$x9;?Y2W zJ1DxO`BFWZ*;p2I1;{5Ylk=lEDvcaUBC~ 
z&2SizU<~DS*WH34gTAYtE2=`1Y~mui>YT>81C>Iv;@JR-?EGhW8Wx|C&7Fp5erfuu z$}LZ>-O(R;yr_JaP9cd%?Z(nT$&mDYcHCjqX2IM|4m@9~GCg-cjq^NjfzfG-JfS+J zu#f)ScIX{bVQwk`A1lh+U{CLn=^tW??jqsGaaT%vqlp2*rGN4hQN*pDh6w-(c9ezjZe%u~8F6Vg~10nr+3b1$so1(RxpUAy{ zggY^pAmh)_It2K`gCM>;z3=Gl2vG+>`M84T@$M{D9O7$3`la92LoO#B!4k#T`}rC% zBbPtpf`!6JP`I05QwPpv>XAD&`vT|=^o|KT3Qyvi;cAG-4yBz5lbgvg;?S1Asml-{ zXj;c>kyg^W$_4px;5J?cWvEXkDB0KBe?;S`dD7_f55yQ&CRRL{G84%bK&Ao>RR|t5 zxpY-AxUnQvR_fz-5PRX^xHsWDVvV{r7qk=v*&C%YH?gX8+rpU>PG9>36~twgBnP_z zfDyHpy-9xJmuX*$RbZ}x=+n+gG%E9E}Wcq=uq&~%j`A$iZ4ww zgYDRVrqxwJ&w)O&ER%V0eMtK!G$k!p@m@QR{U%s_4@Mk7*5PYnl_r#moYVr|xP@v^#(e zFoyV&Gl~dAuxjrI9gD1d=XC!Cnc>?#=&>QvVP8|lvO)1d>?Q0VVtnH}{QWsOEz;$B z8ffpZ_3p62-jc$IpjDo^KX%a&1&UA!oR^Ws3H)LerZi?(M3fsGx zmb{Lfj4easMRP@)y&gJ(z`V`Obmvr|Yher*yxU*jezB@aq3d>cFwokb4rE4A2UrVf(3&AJb1u4B;!F8SCtyxvX!V@LwA3P_+uvXt2MSST(8ER^@C$qbvA#bpfmx8 z?Z<77KoHI+iMl9I8hhX`5kBzJI$9PVfgoP<04NROXY+YQQ@bzfrlmjVwKb>+6bQP@ zmG3X%`N3x)&=A@{yOV^}a&-~;==o)W^&CRMe@K$ZWRPOy=r05H=GlZo~Y zA6(Vr-M)T|CQ=FzouSMeOlD(3h+&_A42rm2I3UqbP&98r<`rr}*3p&gc3ZxaEa<$9 zzO>gN+FDv^0mk$~JUwg*EQ^cEDL>U6*g2_8Fnum1TEkId<(rE^V62>IdtP(@%Q)cG z4{?`Q@DzllEpVB*x4CdS3ODJm;5oeZWg91EY?}5wiy5OWAvUFXZ3RK!9h<&6!4n8B zu{!lx?x}+p$16A)Vd7Ky8z1Rwe|fzRb~U)XxEe8(lP3I!Z)<}BtQu<@`d7XhzJn6t z8}+d4X?4iLuP-8xS9Mo5Na>?|KUxtv>N!%M!F~_rOEFoBa27cN&N4M4O&4t!zAuSJ zFilG6vHjhzla<3Y?geE8^Wb-1L@4UWUXWASpk#zP+?Qt zOqEn)>?qp99c*OF;bHLgUmU)9=pzk|YMYw3D*yGZTq!Y$vK-8NfqcIJw@oWrATTB2(i)jPUu1K9BA>w zjqn_&`g|?O-2I0zx%-`0&|52s?FWuzPLags{RjG=>5nrAzL2l4NjfSy*5-zP@XE*q($~t zj%|KW=}gQ0orcp&QzP8!D>`0I1nOX>^AGDd&xgQ7^5+}+Ff_h6zUZ>FU2H$FcM5vm z`m@1S=YPc5!j!qSVLstNV+$v5BbzH{^yp%L!7+Z*0V{wGPmCpoVM8H?b9*Y-6aq&q z{3`Wal~P3;!M30iTk$F<8&iOecJOBlC?PY=Bx)wWAe`5z?hdj~en|eE94M~1DN;#9 zuurXXPYIug3JXkasI?IrMMdI!3>gOpe3_xJwK?kZafY``!xF;J^g-#{@<`)RO$PVj zuu<#@8F#s|F znFqK1^Td$j+k*VEw#R2d4@W{bybOj^e@mE0(qYdZKi|e&LLCHnUzV`(&AKYcTvoz9XCjQ^>jpIK&4Bu*QF?_)lPl18b>IU}Quf3JO0{|i%PPzPhX+Cs z{X$8xwAv-8x{fFFumcTrmje1GG+#s?_Ko@W#@35!h4IwA+wAk?8b98=>Og05jVIqd 
z7SsJgDTqE!kKqpq5K$6P($X!swc(KkcV!u;xljd56qQ@fvuc#!!@G=6@a60G{O z(}Ah!*!L}n91J=~sAEv8cvh2PErjOwlect%VJ^1)(vQKAwQLkvXLO=3R@_75LHzx?Udb({XnTf+31eaGeb)B-FJo3jR8h3ytNbX!R~sl2I?95%r;{ z-7QcGLONQ`C~-?@tB)E;wz5FEEDm)`;RexRNA6>=?K@fco_c{n>O!@k!6Kj8V&Oyl zX|PHQEM%G2I37$cGJAR4fk}9@+`FY+iXii7`cS&fF} zj-dfj^*AusL56`EDl)+|mYd#oHp1fn*qy4%rC)w9+}!6@6t>rt!8Ax)254mqQ9i~` zI|P>|deq*hH~UvV7?%)>{3}i+i}jWiQ2MleZ%FPvj;MmQ5maYD5kF-(D;=84~~fcd~$5HaLmiHU&lpI--nahsKR7j$rw-3R8gGbO9fNbk*qUW<8I{)Ky zeh<9ifC#eM=W7YizwhtQ+XV_^k^;;R7`t%7D?eUE5)M}gJfBtG??jtx@;sNeOa`;m zs+ns{yA6E;%CFJshO@jrH8tgfw0rYKMzc9q3V?DMC#cQTSJ)d?Tis%B>! zriSF$OS|m!(phzqcy^QKigT*>cIM%H^RT}d zN@k_#1wFS^M1=qz8Tkozv}Bq`6Y+%~snk(V)k zf0a&@%t6~rg&x|NU<6e1sOCL0XJy}4+i$M-&jky-OUj}9L_uAhr~3KMIAk_}5zLpH%4cdw7|L`?zLn?9 zbNZJb@jfm=%E+%yf)TU!{Q~@@HE`Mjv_UooYgh9zL5T28FNS_l+Y16wF^#c)z-rL!TGwe z3j+u+h};%LN;wS9_L~n-r^aNeUrn!C+-4zbUvEhY*p!o}tWpUL-0C`I#=Jf9+7Q&r z+B5!uN({4<6&hk`qA>76cF0Mum^H<1@OugwOo^ky!6d3fO+T?^tsGNX0VDRIP0*aQ zQ~7F%@U+1n-S7ml#C5bw~E~An80*y;H$^gd1n7s=F6*aFe41`}j^0X^5B{2N z(^~fIn}N5Ub!Y$kK!#TjxLf^Dhb=LIk-pEKz*_wT4Tpls-;jw*(4oBhUS9Vz0d}oO z%-Oz?Q_AAcoJ6MLV$_WU9*!wR=s4HmD)Zi-b7j%0kQn0q9BHi8&u6dqimv+7NN*$k zXlh8U(QcP{nEAdW43Var_b<4JElr0HRmv46Sj-*vp7}y*`?VlkcLY?Sm*DA%g$r%i zn^1m7#~Q=>Z}c2>t-mlB=rf~?*eZ}i`9>pniRd;@eLQO~i%awB0$1}O5+kI6-vB#RU z5;_~U#bqWEZ2wA}MnVg7#n)zGT=Sp{*6Z?}LwSw8S2@7y&NrVSb!Lz7S(H?IHF$|9 zScT96+yW!j8rA)4!<9F!WcLWfQ}6idE5qwj>267n!dExBXin|E z(oVgURb0zYiFa^6Y<1aU_4rFYz+)G!K22ZWWq~9`=c5vLDO=4%0QYOPFyH85rD{uM zud=r&{3Z~w`2htE<;~{lZ%ugw`UJ0vt?Sz2wmKFpNWUECs6B7bpw=s<-sR*Sfwi@> zwK!f;B;>KJtFB>bG_Tr9kMbG%a2xt}SIF;$8Cf=!UR50rgj?wdWU{Fxmem`LnC&*K zGa|23OzfK8pG98DWU;V-cM0Z;hr}F73f>6OG#ZEs(Cnpf(|Zc&6&JnP>>y}GtMs)i zVl6OMKjkfQXe{NLKU3Wts`=nXc#w_kR3M;c{6VPkO540rbNF`3^txT7wmsF6vgxBk zEwcHS&trL(z;wjsCF<0k;bA^;E!iUx-8RIodt`OZ_joP6)unN-!8$hnM1_z*^VEH731>Qq%BpNSRPNmv?J{=BN;yMsbhq>u1H@a<&*z0&d zycEKgDDN89I7xMbJwAAtJ?~uCcSsnGm0~W9W_fk0eER+3R%!U#Qd&MQp$UvyNq%=T zHnu=9pm0Y(Kmf=2>VWAVj>l90u-)q1aIeT=C?chyqF!%OsHQyzcAS96ZqpZeo7<$o 
zX9vqm-w-y^V7E5vXu5?*$P!YMq10FO=RV?A(!Az5e3hy2*%rF^4I3AN{<#8|7q`{M zq1F7F$7loAB)fIUm6Hbh+hw~SH993lh@A)_S{9U`UOe$gqPkJZ)5?89HtfdGl2_9B zJnOPtDbDA1ov-588NmzAU&bbq!Zv6-qdA={F5?)h_dof<7>C+CA>Pa2q7H z==15fN|9q}HV6#$u~wPgw0nj&wQpWU>R}g*?4zbvL34GnCN_Ckw&$tiLHPHh`uqE_ zrvrQL!TLiQwnQB!SVL@4;qqaK)hQ3GW|I~#*`O=hT3zYzMZxiF`|Fw``^x(*G4SET z52Q70#u`4W;#$_}CA4dPZ@BaZ{OHse=Zj-U0W>s`SYI zhr*9UqH`xH)D&xYO~~yMow3l;&vu@D_R)`0cd7xQem@7wJ@QyESYp$c{&5z8oy|`L z0`PZ4ISG!CwhyA-@A~sw=b*WCDiTAWeTq8*YC;;8@;pVSE%}!2UBsH-pXHk`zJjIy-pFFnj zLqq(3{1<>04H$0bw3rI)fBxoQ<+A^;KH~UsYSne=|7|r>zAgh0c)I$ztaD0e0swWH BEF=H` literal 0 HcmV?d00001 diff --git a/examples/aot/matmul_optimization_guide/fig/cachehit_N4096.png b/examples/aot/matmul_optimization_guide/fig/cachehit_N4096.png new file mode 100644 index 0000000000000000000000000000000000000000..7d67c6e04c4267c2e80a1a6cae0325ad06f7f58a GIT binary patch literal 59806 zcmaG|bzGC}*9T;RLqeq_1S~pK8iCP*Qo;b~5a}+7kpc=52B_o!sg2Q{4g{3$u7Pw) z*WkUu`1$xepVvR;fZg|XpL3n-#P@v9UZ^O^5rb*KI5;@O^7o`4;^2S?aBy%72+sn) zdCi(63;c)gBq^^>2>kOReEb{-hY?3!T0;G){?e$sb$mbUcyD#|YcuQutEE9!{cFzE zbz3=??VC0Q{ED$R>EZ9sc5e&BQ@+1!=!+}7%pgN_9(>qwo`)sI^!3f`_}OP zV>X=8rdFxfS1!8gj|Do3$$3+UOx|F9LB6^!HqTk_nK*qnC-~h>@xufs$>9>aL>?ME zLPlT6*K>u1g<5x2jm`vIb48>ezhy@dm3#Ri5Fb&xW5%84U&<@Wt(~1W*GZc$kU?;8 z@qfM;eg$#O?4Z|P??jU@`r@42>#vtjuL;RWxXy_%%lz-E|GhJu2N4pyJG#>wnxB2k$;NM6h&o$M?#=7Cm`}-0OJH0`Awg$zS;6d4Jzs%LBI- zomxY%`TIA2e#-C01x7mEi>^u`e@S!teYqV_wThz&p2KG`_Z0| zm43A6x%fg03N7N}FtD1+d(!-D@(x{0dbBpreO8>H zu7a*P**7kaU|u!Qh=eVQXM@(oyj(Ale<_pYp5J{i0R@TlWK^FGa>?E{G(O&(Le~P(YW~H&|u4+K1gB&>WB!G&xfw*NRZ) z`tnGS@SpmD%me2w`bcyJd;tRgjDL0TJv6*ftF?m-NvCc5O1C92P?a`<^zA^DqtIX5 z70L;Ch=E!<20$xXIJgF^b3w$KVTI*}xxE)nTxA}JknL(@m;&|M>^Sr{68QM^HaSOeXmZV8tAefp8_84RAAaJO z7|uN?9pGM+L_3sx`EoA`8qTlNrAfrUTXy!}w)GZp9Fp>mbbyv7Zi6H*UCO@NA=)rP zHi@P|uIkuQU{RF!HCvGi9Vope4eszVOhZ&PbJE)&I%bZzHf;{im9plgs}=9wLwNNt z)k?Y)mNy^2a!x%MV$CD^=J?BP`^iwf&wao<-BwE{9;(!Oj#3dY%1(139zf~#4tROr 
z3vX1&W-UA&FlH_;je349`(cNuzYu6vFi!Gw(4AORwokzLZ7&Y%q{vVLy}5z+t*v`X(vlSrn zTqY{jz4=YDvZM4sa7BZ$O>czEm{_P!sy9GQ=QAvsfY|RcNDM*w&FZl%eS-8aqn=?} z>%{Zh7(Xh(_{0ReTU7ELnkTx1cAS!qMC_)}|IhK$qZhzTj;B`Rl_uK-iPwcWAKmS!Bv1Vd%UXy-kFMaV4 znlGA8_tq-KOWeOBwIM&C;X&u8Cl99O=yxub&=k91R%SocRvJMgtSdCM6nyJ?1I^*p z>%4!vJuVsG;6LmcAOJ2coy5DuMb|R$)az4Mr_udJ0?w$pl8`*v;w8lQ2x~|4CYKF#zpAMHYTt1l0s3(xAd>8&#I3^DQ?7(P*z{3xCFx@m?N-m=|bhGqhw|_pw628`C{*tW_ z)yqGH`n4;cgmBpi_7qM-m)1kSkPxty#iJ*!ii(sZ!8XRmwP%BamOg*K#2>A&*KTH6 zRK!s?P+tzU4BTFkQla+%)h!n`$(KBFPvyta%Gs`LPh=a1g{<^hX#Fc`BTFENHQ~}y50$=J?9%+?AYN~0 zXhu=9%*52@uBffDC^k#<=(8;?HFezgfx9pMp@O%efC9@#EV3^F`^KW;*O{m4xjJ#c z)#EwR(Ae$T3#DCZpta5AhJ7bpsyQA$ek1T{ag`z0EYSyjw86Zi!yj7^Yg^u#UG*du z)&dF|KR}aWdy9*U2NVVL?Y=*4IqE;g9(j<7T}g11YJ1WpVpY?WFH>!otE7t3Q%CV9Sfd4n-U{ z=e}*2wQuXVx#if5l#Z>G9Vm&n8|n-@(mL0@h@jhBcHL`3Z~U^ZQ*}c!Pa@|&{medK z1b$a9sC+6hs`D7mcP|JWIZlABfJ$=jS@w;wKX~wYs@Q$w=5VP|3emCfGOA?k!-yQl z@c5JOjW_bcc~dkcTJrB=J?&Gfuxx14|&F9vnGsot!TT;Cw}AQdc@+pq7_tyGv8Hpl8A|8e)7ADfWO2i_m$D3?wg=|9a@%{Drq|u?P!<;`Oc@ zGPf}^3rZ2GSvfiVw{3YzagF3h$B4Ls?AP!vCY&D3(${$VMg?u4r^g!xSmi$N^^aA% z>+Hoo$DU5EOiSBpQ5b>a%?LnY<9V~XshA!^6@t<>C=q{8e;G)bWH6MwYI2sLN8wA! 
zczO3q)vRp|8ThwlPbDh4EeIpign+so=CMdk_&zWmYShr=h7RbW6ne zRTa&BWi8lvnd)7${r!^YHJvPjFAra~() zWN)P2j<(x)CtdPR(_8cWiu==a;k+Q;rfUgVxt$H*MT-}MTK)_l+eYUZ$vEP&RFC8w}K6pM@@XQcZUdg_)9#AwgWdfqSN(Ua|;?lf>)a?;v#;vU_4K=l?10;?LMF1CH zp%(8Gc_@yCW|?w3);C!}YFXe_Y;F|0=7Y9?WCkeo@_UzqqKZ9&ot4qqn=h0Gg0>dD z>XT|e2LzLp$;gj!no|h}|HJd?oPihf5b<6AE%2mpXdo@Sj6W6P=sgx3gry10S z@3f2Ad9?JZGu?}t&{e1e67Bz;>S;Cz%)D&OJ)8EQ5$`lJf-eE}NAuBr0BHD!*FXKJ zpCbjJg?^ECm%oDEsq1@N1$g8^pL00>NZfy}tC9_P$!U+-GTZ;UZ$1z-IS#mSasIs~ zkdi@xazS_XZhzRHQvAB_HgI3l;~O~t@i$*c85nS0i}ADNfBMf~8DJAoN9Qip;N$#N z`knsBq6sM3v1&tC;1A9H(jjE<#3^Fm;o$uG&7Z3S`W4j5c1u{4{k75`Qn7df_su`P zi1QzRgTQq`$FFyxEg6BhDfdzQx^bY1nS=+6Z!9!%f7d1z9cwp_ci%jD)& zaISgg=E!j(EO9Wr(TY%4{=`f&Tg(1f{fDFzAw-|Asv<*~QPs&!;hS!)m&s)7F6cZO zovZtNC(}qog#WHpqRU}g2n|wEU7IboI2+?l>%=e%rZ5v_;l64g2>6X0GTwBFzh&@C zy^wML`f^$yv8&balY<(A<5c30vQL)owFLUd(rOMc=(>dRX{hV)Ui^-ph>e$9(hMG7 zOEOj=rFL`){HiswqF2@GT5OX)(?k=yo6)2~@vOawA@`j!eg35Y(K`vD%okY50ih(z zE^VIqm+ex@FkT5C%72Wqu{1AuPs13%lX8^ykae8@?XHU&9e)9e~LI*U?8?+m;LU026!(v{+yJlR0rJ7b|lHF4C!5-vatQJkrC_h!x;e- z3e`&B8%)YjoOraE?mGGT#aG|lq$ITK%7{hc;WuV%nTUwUa+Icip4=7pZ^e`hBK zxg&<1S3ZbbDVt3wNJzM%Q|(oTJz71k#(rZK&jWPbP7nY-5If#$AC9%FwulhjelMQ5 zUuDurGF;t4l*Du}&FwQ^_leJ{Z$PtVA9_&Yb09!VN4GgUSm|Ccz5@I% ziu7`c^`J2=zjf~NaLF(_fB@$66SjC814-4AD=5dyGKw(jKr= zTgBdEvF^s_&Y_)9m3Io_OwqM_~bq5$|7!I?cFwMvc#kq4 zl$fos$7<+FdxB`SvHes-nAih#pQ5n+3>=l&PFFA$ADwp8i=m*P)6f861)FafVU_Q= zeiMr+mg>)Zt|ApEwUVN*NhJ2<<_P&U^m={j^IE>BKp=)b8hOHX@u#2Rl7XNN)YRTU zq0qe$A?Q$Mxl*q)&C2l5&;k;hCLo&5CTf!tBGo$RHq&ysCI_#!p`qccy$S_PPfyQV z2V_qJ&f>{QO|4Iq!c3IAEP4rMEEk3a-LMjL(9?@2rJ*3Pq$QRS^XwxIaCBl?C|xL! 
zl(I>&>XZu%Vene>TMeC|>Dd-eYfE=xr2|sjo#8o`Ej@~t!ZkqpL@SM0|~gQprP_v|VH@PXH%sTp2JLHfFXBJboH*+cB6A5qM0vGIepTkFSIM6FcjqvVZw%QM_orPk zOIuD87r<^99?!QcAA6-OKN)_qV0FhvpxdciahL8?1%Ad9)Xg7JR;A;c4m#az2BI0o zm58<&cB^H>@yxFjFO*5PhdPgJb*420#s*!Z_|(;@7EQbD@LxJGd0jY7x=vo)@zr7J zP2$kz%mt-m4M-YAK)Pl>D7a%0(V*5FzpXZHq#~LTA_2u@)^cYYsqrNQ(I7J=Rir|S ze0V+zWN7s`_*;IShXyZ%1_>kgxid(5Do*FE)%82J1Rlym*W|_u#Rm$z`Fo1W+lNt&DBY~@Ulb|$ zyvOUX@vdI*^a+kRGCVaPGu`v}ovf`BvfN!#iTPTGsVSF9(M49hc_$z z1p=+XlS+wV%K;LcT*+lPMG--}fYZGKKF^mk-Y9m_^6UJWD|rR2$Rw(|HzV6?D2q87y<$Lx#f&mvhIvy;HkP3 zJL{QJ+vve=*KgMEmS524Bc9gUTI;RyJ8T@w8gDD@-11D>$W7IoNRM`E6cP1mAP9O? zqMfI~U7c+jZ#q)r#DG#sHn}xYI#EqN*Kr<5g73iB$baRkWQ68hh9Gn8X-&U4#Id@3 z`X?o(m>Bi`@z}H72Wk7Bkgr#@2Ek4ewdjVvno)t7Y5sFHYl!GtsW+LgLr({ z8R7kySt)L9n59Q8b+V3Be2M)6#YHyXkqWmQA9E&hKAVjJlZZj!LdAoTS+Q9u>A=LD zai8OF$t+-6;D4CsLT36QzcySN`m#J5d$7~)$tB|>`7<8_UFC_SDH2T=UyCSc zi?bM?9|jpsY|f@9Dx;@1*Bti6z=Tk!ppF`~U}AN<>v?^t-!%yfiJ!fL^BeuYfY+a1 z^ONLB2ey!<@qZWm_AGyazrXMJ`va8PK*J+RM(E0~^Y)L5SLP~E|6LadvHj1Lb2|WY zB;suP{;#s^WF3}^K#06rrs{lxYW`mD*E_$!lP;XOYWP0^0Y=|z=)^O>xBx%$jaNX= zobqc}a_oOqsDvDaCw-z&$GiWr2rhmI&@<1yXG=c%yIJz<0ZtyuFbR}Q1cUtV8OV_k z>&iak%yQ#pG3<|;*H9IIa>}6~MMmp#2k=Ib423g9pgzjQvdgZD*85|jVnwxseay+Qjd{J z{?Y2XCMVTpgH0y~ydRf=^%xVL>*yhAHRzD8dZ`T=9jjDWsR?5spm1xvT>gbneFNQ; zd_;>9vl*Pb_tk)%>iMZEKk4FKii&#DdWnnbLAl~Ng4y*+7yJ-KUpJS$(OdhWzwG#T zGzJv@2@g1MTx8dS|5YKMJSTkMq_eOb-f8fkLQ&oWjLVNv==^^gSN%!j%i8Vb(SI8k z}DL>Ffz_=Likp%pgNRYg9(xIrj-Ne`5ZE*sdY-6Vv{&)3{ccmPD z4{n+Sm=SPz=Qm!IilPjS(h#I32}Wg?H%KN$hJ}X>np-aAS3VgM0vHBPP6hK*6BBSB zaz~hzhkg$qSsNlF{C_-lC*wVOSxo=P6XQV3_70KYB@6$md6kBTEz2IQndeEsz!R9#(5Svdlzm4?53`Qoo{Wonwr`WUJ<(KnBG zj|MOBs*13KPtx-)(tjBN3m7-hAE}wM-5Y>-4kX88JC2{*;C5_{ke6hnm2#VC$ ztI^hPJG=rE7L+dSvv|n+QKo-9lz&7e7BxGYUBHxS!swIrACh72V{b zsBtTM?t z6T9XMkp~*pm-xlRR*+}8X9->znm-GF<T9nn zdp~~*bM!8>+~DlUu{nU`mg4P;HFVU}R*7$(_miaTFsJ8~PVN-$j;+oyACmz3%3ner z+`I$3Bqj&{2yDmw1y{#gz-3^(fPjGQl{_~r_{qCFSz1!pCtrYR>1c}DvJ)P{F*KA~ zj32Ucr)|Hu_bBFd4-YRM-_~D26`RHOZrPSt_f*;SiLC-ET`{<%>tjSTN40yBMC9-& 
zoT!yc7J^Bvm4K2d+3nX!aZqmb2JG);kZW2bc(;1#j{zE;h8>rA0MP^DHzM7pNwPrj z%9`Kvt7y+H4)S7kHq&bNGM{q9}9TaLWA@CY2`OSyR2-%J~0f^lNzF!$O+8 zAR=YGBteFVV$H}5_j6`6-f?!~33C6B$1F9+C8o&E&ffM*oZ*O9?yh_W^t@VZgc&%B zS+xOomt5uI{G- zD+qJL5$K->?0Bc1ms9tmI&GVPdgc460#3a9oS!V~@52ZL#s8%W4uo&eg~eA&={AOj z6O~RDViLpx8f)(qQmw76m;9^EX*nMxV;|kX;$2B#aXbnBHGa+~WRqX({&+V-sWRtw zxcePw3nA(?^<1Lnrv?kmPz41W{x$$0$o({&nT&-3xcp(+8{MB6)E}GD!oqX=wy{S` z4}ggQaG8r3-O~JgUH}1UTjAw)CY5}=79p+tl4zp;Ooo6S6B~iq4hH)Bq5Ryz(kZG2 z?hXJ1wdekUSG;?y3IN&q^*5rZ_hK}{Do@}l0ARZfwKH!^pH1*=BcT(tAFEWM6|mhL zwi(Dp=MQGZ3p&)=x$SRS_?R!J?|pP#9x@#&PEM``I1J0*8yR+n7sf3S=LgGOkfV;x zoZkL>7>tpK*KV%pR*xD0Smy7SmkS>(nTcB|Dk^#>s--Ipb}Nf72glAIS`Iw{z{Fyw zStXz9u`&G8Q?UypcjXjZI%~?bM0pn2dQJEPF@EgJ59%t@3RvbMv)f*2g_%J*2Jd zsbc1nyC+CwAAsqUsQPB+u3k#qeP)1qIICv<6wvDaUYhfjqJ{?96PVBI=|F$K89hC{ zcT}^o_xc+#eD&_VzHGh4x-WOk+6FpPCu=x8=?3LkC4M%4<0)F~3uB9yV$JF{0WhxZP=mwT zlRBsye+999|K=O9!`RGGk44l8UW>%>*)B*N1S%YBPEJm-T{W`wNSm#N-fCkNrqKXW zvAGh}-3*9wPNs!UpqrGGRI%rd?ZIBV&q_ol++n&YO00!wChDOnX4tOAhV<^5ujoe0 z9WiNj@x{e<&}Z-^iv6nNql4`)*CKWoRb4RjZX=iy)!dAT#r~Wf^i)cO*h0ispN`$0 zcLw-jZ_xiEHjm#6mA0<$xKv~}b-(mmQ(WVm6-}OZf|?+BD%ZA5DH*H^bsvv7cIE-w zoJn^NZJ?tQOWL9j+k!HH5tB+#U+|7bmKNqr(Y)dT>!z9@PzbK>VajQWk-o}Muh!O9 z?_&VvJm693^%Ztyi&~%UTm&EWkRMr8ykS<}Uyy{7vOVaLgW|FI_V5^j-p5$$H#R@M zU7ZL97v&(xdYNrXEV^$pY|n3@(GoQ~DTBYzY+l(x~BQoQ%9CsYhf;xh~lP9oN0z}TWcc>^PK5NaS z*crySA%7t}3L3VLY7pweqa&vsIllz`eicKIKU(39aD=1?bUu?!b=dq~P8$G*CZ#R& zk{ag5)FrcJSZR%guUcgbQL=;jv=iaA1I>V^z**eiP+dLs;DCp;)*m1^{T z$bJ)aR1ASw0CJU@Bl+nvIf_3$3xmcNtIH&cbe7>*iJ7T03hH{ch-#n-q%j}1Ocivh3^szwt2DDs8J)5I`@+Kiy{G7cy8rVJc zwk9A6oNYeZQZLk^M7dO!T%Z9pDBIZnHq-o9Y+L@hwhNMUh#56#k_LVPO#_GnL8<;U zoyTS&*MxW&s@9vN-+v^mq|%wbj6@=tgQ*t>B&L%js~P9I<)FLsd{9C}B4S0Y$f!d( zUUPVEZ$fzU)|?vIriJHn`m!@hI3XMf3$U3fAG^KhIC1+|nezOke3*OE`O6<`QTv_K z1ig2}Olajd)7!5spdUDYmbFwWW%`PgWuqbp*g&8Y_uRRkHlQov=RBFkpzoD^k;exjc%ezV0ZJ3z zQy4o#|8bg%kaDUcq*~kN>9TeSTOrZR*|TR4uU-T1ksqS8i<-p!0ZxTCLzf7NlZORTIpJdR?^pwGX8O%T32oWs0Y~ 
zq#BuCZH~FbX`CD!o>Hv~+v@dJy%?{o$WTQ6LPNMus3lS5*)2UJ z4RMzT{z{C19AJiw&aEy)&PwT#qkUAM71Y*w&R8s@fS_45*a~CfS=!^xn4^ zF0y(D1Hj4r@q?=Ijov`zyWgKxZ4X%;xd7datl=upl2P~B`0X8^;~lXzH4xli2Bnmh zW?6I6$l6ZjhJCHxF3Q=n6oc-Hu6;V=x;dv>6NV?fyt9#bJhC&@Y;cr6zPg5~+isw> z+qIP7qu;;;GZfo-P0@-yQ`gFDs$KOtKJfPBgWa{C@I490+sk%7!`#o4*<|VE(33qN zjYE?EIVl{_lOxtr91(?gzp@&?Knif?4s=1Zsl88wt4P^bl=FHW=yux=8yxTV&$fzZgjB!N_xLKdJ8`Ba5oj2|LzENQ z8P&Jo;i*OyG_SL;g?c<*0)!WQHe$zjr+`Rng)r~QHRfARcRz37K25vZLKb@$I@4=wJpy0`yE`F5M*aswxI9DH(4EG+TFclCJ+I-LRZJDWgm8 zZ(jI?I>BOF3S&Lc{Q4^c}{_VZWfzck{tiVU)n-+xY){0D%Ui)Ihw@wL@Kn3K5w=WfY zbgu|#z=@-R5ItML+)xCvR^Ml>%csD9*`HNdT{P!7d_KZ_e`=`5(iMxozcFVEWFrh} zjol2ZfJ?Qj)@?mNTJWDR0QZ70JV`&jB@UjZHY6$0U#zte>Fx+M%JEgf zO{6?{2ub%c?Te(I!cVG; zjq1);0AUxYD{p`*Xkct&K`$?IT3N=uIsPG$%7gIOB15 zpNH507hwXqip4_U{6@Z|j!jCgLSrXC;%uUFcQv@RFek0FqZ-Y7VPmi+vg91wAYX6} z<-|jZD!iH;?=7Tjx1zvayJ>cPu5HCKzYnYU!C?CmkOOTx%^d9|9+w#i-d8hW!B@{X zZB7fh@wEoe^!JiqF5nxb3Vea!Jq-5QCGer!lGp%QUC7VGz^kv23wuvqN?hv=+owsQ zs0NTx%=k5rA2<|Hkg-k7TygFVNP)&vDpo9@^JHt-zMHqoml(c?4G2e|WqM~PPSe?X zW;o70zwTMF9U<`^^(gBY0*%mHqPr6TKxUSaYN^uZ7rtRef^KXp43w~HTFT3kS8 zDh^vhcWd^MHS0toRlAd+-faVWg^X+au{C?N2kpm)%APYH%-Uj0T*Vq$dTjhVwkFU4 zZ;Uc))dKw;`>RyceAB^FAV&|qgL$#JYWy9k;S;Yc2Iypn+h^~-IEVYfVfEg5^6G^o zL(N9DHAJqeL{<>7vJJaN)*3VbY`(KJIE5sz2n1$ww03u%U-^k}eM%-OW%_K}`D#}y zp#I^O7#Ur*=nNHy5z}j####JRd1Y?GC*(I;EN-Qp)EdDuCscyIJ`Dq-G<#*--ZaJ7 zLtXcD21; zM4v-uKa^7tK`d1{e)IJG2y)@6PhAb(H>`FV?bnUbu5FiUWdzoGK6 zMr5j?e4TNTUhYCAo&=N#G5-|ECmDlg8r0@^6VojDp_!S&szjE$E!7)B-bprZ9^8ca zA{aGr`y7CT~A4Z{q37Y_1#4(<^+3%#*)XdV{fH6M&|qkUCFe_aIt*ab%7y! 
zIa^5`xu%!sZM7VRCi$|T@Wg46#j?Z5uw*#~_-XC4z=Q#M@ANb)XhNdQJu>ENj=|C7 zeVJDkkLe$(?WPCgT^FUw%11I~f=MWp=4+q`54A5D!L^_Fj&|-tgTMrQmD^vL6E}}Z z+Vm9G(QzR_&9_4lUEUza0P7)TB4NYwjwj=ZK#y5>-y}&&g51Hz>DiW9LT8}gGvOKs znO!PQ)RspQfnnXmE_N<-fuS@ZS&W*PM9S-`sc^P?y*&c0u#5x5Ck?5WE)gD{o>;lS zlEPYmpUBa2Aer;S53#V}g8cWwrLh{rz(1U|YChpt=QEetA3$BkGu6@$C9fjdVuzeb3m@i5 zcDaqYHxgviw9%{6vb|Tc&+xpnMOs!ap06g8wDQ9L9-mE0q)Hm%Y>#fBhth`7!XChh z9Xe@7)%fR8LOPvd4pf~~5ZpujySh6%QSr*+*v-xRQyX|E{O?Fw34xz-Kf%Qsjt!Kz z(DY^JqKBR%OWFI74&r2q?5?{k)aS)mvlr#VXR^wzI`o>Q<&X2#b=?BTPR@2JGrU8g z*TiR2&gUC@+ev9z^V){2Yk9n8JXo>jprXdg2OyLzA_ohr*a!@sVgiZSdS=+k@s(*B z5FPRoP}ZCfWD?qE+sx;Q_S17^__mq>auCY4Ke^9$8%Qt!9#~+o|_~1WDgA_=4Z|-U|pJ7U@v0l$X?8(cl4=Z zP}WJlcIy$YimoQa?#K4(LC|kziVpVsYgX8Mc#q-!<(a*n;6(ECtdU)j>Vfh*qi%_$ zgnXNu4`)Aix6^}5cr_EkDL9OiWp4XNsvzkXE9=L|pRv*PMCw7;EHYLG2Hv~%`VDr5*$TVC9W}f4++I z-J6`MZS7-%2IhpPwVFPb;ZlL&K6~$c=FdoAb?QR5JUTYrJDekW3~Z1`O7XF|1FQ0^ z+lltMRctI>s)_rKcQIG+>jNnR{Amb9`YNTZ&#!yu~7AUqfZj4+A6r|CU*~( z3dGL$ICYMfo~h+Hug z2YDmJ)S>WVC}8Ic&4z-dV&ab9p68q;p5f8xO>RpCSnEnjrkZKK#`{Ke%4HSg?9s*&iekL%LhcG z`d~VvLa#ZPh>v;93!r2&k}FtfA@P_&ufRE*gfT%4Y+mZ1sU)f3o0{J_&A+%iD>22^ zwynZeJqv6(J#BV+)RbGx<45>fc27>-*s?k#1DVOqsWlktQJw>eubryHtb#x|F)_#A%!nv{QM8Oc;U zzSrh<=uM>)CD{G_DT$vW&}8CBuV^j$yMJZ>0S}A|O%mwWbp&77BAls+T`Ux~&1GrW zv`KF_6U1)89qsowyZ|P%6H#vT#E7i*f8fr5y~%@BgtT6lM1}qV;lg?G3u`keZ$EFG zqSB2_=pQk&j5&Tl#PGllq20wJ1r^F^r+S;{&8;*d*p+gsG_1?ZEbC? 
zIg{%bZv_ThY7;}*QXtU=yS;a(HbEq5cFnUt;Ka8nKk+F~ga}32LB?n9mPOS_XJb6fDC<2XF=H`4#^aUo?fJ1t4)FJZcMfP}L zd^uU!2NZtV@aqQ6PYS#V^Ji#3TJj`bZ2v*MS<}I-m8hoq5vYEkJ#ZF$fUa>>1#M8h zAYqeI57e6(;9ryq!WJ%wpkM`Q&Y$8yaL<|hU1+@_*gXdXEK(q><7WK{xeMS{qyswYD78Vy#X=yY5_alJG@34eW!%dhKz!#;mz612j zYD(rw8NC=Pe1n-ORe=`t1qUJCpRmm7^Kn%AdRz4QU(EdEhD-`ZA$N_zs&gL@%@5yLfGexgvivj@a)Jh+f4!8+NxwGy+mj>l@T z0I&PlCq?U{-B3=pWjWAJJ4 z?gL|zrQ{27f6z|+K;&yttev@G%#!?*R|eO=eo14AZd6&A@HeqVpqri!Xx{7u$PD)Q z>Y$9hGew54H8}f5{0pF@mY%4>HgJ8Q6KQiEv+1Tly<=sS?rAdc3lBTBCtyB-^YzD1 z${U1j#paRz62wv~0Gp5KRR@_C8vwARzMif8^K|@Z+c@G$Y#EY~%~U{$CB$I+g$AFm zPf=b@X3A9uRCNDFyZaw1`gz_Tg9<-ApSh%d1GAZiqY4eA@y=S103>DwB!0Ue_wOMX zg@+KLmn?KWUjE%W0EQ2+kR6~B)MA{9 z)YffmZL4Nxg(}W?0q&ol&4ARImgc<2Jp0nfA(+ZFb>qN(0|jmQcHv z%cri9OA=mQ-o$S^VknYb)!@B{lpFElIP=CSZ;pT+%j4A@);}CVVM7^F&pCaBcd%fU zQ_(wCjx{jpo%~*^=h^D2(^)*i#>Qsdf+cg^cMh)FSQ-0jljg?y>bVZkr5VpjzKUsg znrV4-uvS;+Iu}67EKuXINIC4fr2$NrzG2>2nPVVIr=nq{ z6w80G3#(Y#qtbVDDm)hKeE5XiNk7m7B_NYeVX}P*98clx=aD6fc&zVIb8azUCY>ShEo|}HS zZY@zp!0EE`H0oACUm8fXm?VwEtp)1?Yk3r98iqi7eR&o4XzD{2v`qXkUAFPb;h^Yv z^<5tez*yhSH0Ylic_^d(fsF`kdW?@kt<`fcvk zaJgv#cJz3C{IEtuxFV}MyDhuMJGiI}fS0{@2)0+oVAbI7KD!0tMX`3}oe8us^s*0z zj(gC$trg_d$^`w4}6ZX72Ap8?h_*mHm=@+)J_1qxq44l#P#qQ?CEYs z*vpsu5N9FP`^V3j6lo1Ed&+e044>p-M{H$DU3p4lf``7DxE+6aoHX(Gl6e zO06!F><^(g*r?68ttv6mlhzLX@N_&oVvlSCz&1Y2=idD%?T;KF@fk{tUpXBQeEZIK zh8U=SZZhTVq@2-acy{s1N$@L{R()B;K-(FZjw#&UezUUI;w~U1<*7cO zv3EX#UIa5s_j3T?)HMSy^*2A|PX6MTu^62o$2pk~xi6kej7_g^z4rdV{7ar#M!B*S z$yKi1#bpr_&z;p0WcH9@$?}Lrtbm>=RKDcP*Pt!iBMGXbvPNsg#z*r(*TF-S zWRq|_hXrqK2iuc5@?g2}0RjeZ^A8b{^@HK9R{!v=kfd`!4Pfr}?C8{6wy@Y|=X1}V z5rLjLTj#|$q$#1tf9X8vN*?LRLM?YyWg|CBa=j#fDS&b97xfMHnx@!3obuwl67k5& zDs|IkR+APaD=rQXx`wpKSY1g^*e0#=#=3|sEgBqzoX$nBuI>!S zA*G$!v1K9Rpz!_5PWsvHVcVkN(W}?Q0Y33^n80{~gG(VW3x?E0L%)^|_MxdE(?Sz?d(99EjSP9=esgzh4{hm;bjaGP6 z+_-w{V8_jEcKL&Tjmb1+g;{T}Mq(fluQs8evQyQTJeTjcc+3ySY7P@~E*`qtxzHCq zCnDGy2}99l*;#)f*-W@=cl`9aC5nrDq^qu_sVNXBR4 z)RP3B5T{w({Jl?)?E`R}J$mc~fwza>b8ooDjfv(v92YNIXQ?PBO{PSMI-rl#63xZk 
zn^qDXNm?K6HYM~Qnq>Elj+#{H-MD&EYiix|1HR-62}~`G&#G5txvOujX8K%z^ksu$ zDv#G}!p0^xD=B#>!Oo}Mdk?edogfJX*}7uJ6;ZU@galB%Qj|rJOT_ts!91h;8-MnCJN<^FN^HdG8gPT*Szq) z1`-y#@)#JC^iG%pxO(C@ngEt8!*{qi9@)jV@zpsAsQ(oy+l?(=RdJVD1$VW28SN{q!iD6v7+V<8A2Eoumfw-Awye<03jW1(QM zRVx!342zJMgk5@bclR_=c1?&STffj0vc6qp^C0c((VYE;4Fx$()!Q_20f21AF;~8< zD6$hgrpd45NASkIK6ZBLJSDo$Hs+Umy>a=3ok2w}wfJR>x1H!v#*!r}i5%3ZKh;R% z;0Sad+Fh+5nig4ySp@}K4eXMf`Z<#d zjchoS&aLm)38HEt@DKZ`k7!CB7!icm4l(SX6n~m$FSk=4-R>q=ZYVIR{aMo~Z}+0< z`;dyx{pc476z>~42UPOMBjx5qVXM=}dQ^2|&4v+sm#1<_npoC@-~Fn~Q`f2XwziHE z+gjV0I^C2IAn^z}fJ5N14|(#Z6p-J5AjLWF|%C-vzv*`qZi^4UKC8VSl=`Z?S5xX&M~!4=xWhdzeRJFHaVpBK*0 z1?&3ilyJqLN=1Yj4CnPh2gIke=9k)=MXGg=#s`W9>t*;Wp7qyu%73lja8(Qm>zwk) zB%L%+M-s|BtI^^wEh;)WI4ZzSwqX6-gn`f^`4_nY8jrJHGUOH0;{$^YlXH47dJqsotma;v&5C87D3apS-L5 zo9Z;b$s{K0z2WK6q#Fyk(?j3RLox2$deWqCeW1Set;Co*>8B$I7j^B*8p%rNK@$zH#3ygoWUX+|;-D^|gBvxD1V!Z8`q1te* z@OEi2d~y>=wSVB0z3@=*V@(?d+`V|phJX|guQn0BxGiFZaiv^~)fPwdx>y*~I~09y zOmz`0x#+}V+!U9s)UiFGXkPeAoq^Owds78?wA05fes_RuiIP-k6;iK{u1Ll&XS`oP z=~m|1>3e)Z(gSZFvB_@&fX*ZzqIj`7_~Orml+^y6-)ur@(oWoEKpFjZ7|1OkZ5Q4|fGt6U(7Tnm_2xuN| zZ9FW}oi}|^xwKhOC)?8(`*?6>A&*GzV%_8;XQWdsg+9j+I!~b|8@{3)2 zBuJkPxiR)<7-h&EfWf_;Fl8*n%0+m=e=7U|R)CNYUPISS{(3HntUOi_SA|R8sc%3N z0gmq^07(V|^vB13*EKYz`KqcCL56@Ig*swy`WE-K6mSp~dBw5OMO-zFc%w!<1?*qS zK8ddkAeaT7qNJ$2t9X?eJ^sE3y>={8nQ`#kl)@h9DGZa{xiyA36Es)7Fd7Rx@3 zdwVDa!)h)ha>FV$p!NrDswXkhE% z;o*;AlYu8d#`r?of+v}}j6wAr#Vgi&vpnbyp!bV6FZJ}m%fQU%+}Brf_ynv>2&8yj znm#tb!hk`8u`TRMc3oG6xX+w&6^jSw#*-soTwWQqP3o09Je7K2J-%f^0gsufCyGGR z?3EVMxisR_Up57?f>Y{-;UpB_c}$-GbZn_~GkS+Lv|Mh!oU__>@VIw6q8IT1V6-o( zTlBkuyAaO~KFw86_a%=Nz#gYRn#r{p*}XchDN9_g&{z1Or3&qkY~Mnw>#rE%@GA4 z8vdbM%fp>y$XNi@)Ev>`vahh^=#^>BE_|C zxr0qxB7c(O1y2>R9sgu`2MR1m!RMaP6O_mYEr-i8=ehqj{ zy-O84wfw!NaR;R}c)wx@K*q7^%xDjd|08@ug8t`pLiTb3qDD|`MnKg}&)T{Op=W@Wpuy&T{Nt%O#^mWmb!EPqnb`+_iz7kY29{%j2E9=-um`8?Zj%I3#U6@=|%ORTz~84 z(L9wjtv~I+8qoUD**8Mg{||?Q2J_{-B)!yhd4^MaNN|WO9uA8Q4^bB=u%AS=C9h>a 
zA@llUi9!7YhO1Sq_-)OTyXzCNEbP>;1{J0u^FAj7UjeXQWcNJ#ep!PlKy_|Q2hfDw zt~(RcG`Gt%m*7LA+cl%z&F>$?6jh-P{hZZPr3UprLInX!fQCu6`FqbULx57z-z19% zERGFq)__TGC4X=Yf>{B8&VQa*-HwXij7DjEY4ooO4r)cc^&|`f3D06E7qg@l#K`%%YW$SlZmoAD^6@ zMIh7={dFZM4^_etfeuoj1>d&RCGcS4KTR4IGQAE4#@4;=(620(-}~Q8`u~JqTOC^h#$1K0lfHrJQ}fdLB*6ApcX*> z^*V&xr&T#!XgQ2*TSq;u4HXR=>pq0IP#g;a=qCCa_hRKFyV*};12|1Dd+YOkb(JS; zi$>LpepfHY4?COugZF@7o(3<-foHKcc;n|2=9CyHBnkkZC6_TT0ML!}#2#m=v`JtS z2dW{!HHnXVUH`BNv_Mz9keE_T@YxP;Y{x$6+k-5l?VtS@0Jm?qZm0Ci*RKNJKgYFi z^n3Wz(q(YHcKjIAN^cP{tTo!?m;7L=AGa{BYD{_BQdetv$p z&0#vDwY+6NFN+$-a*qHK{O^axxJqN-B?NA7h+m+Qjc3>`RpoF%aPc%U-9ov-hcjjL z5C9>c0$fPi9J+v(Hf&ZO^`BWNl8pzitxkOJ*eNw2Ikyg3;^Y<{>f=wd-i}XVzp7U zjzka%`}g5R1T@CbYm3Rq)&&@10D1h==nAX(U7N(-2P;NREUQ4|kZ7#?0B<9Tn2SAY z_y{7+>Z7q1-zLZDa(uZ{W>m4WAq^PSrWT*YW-lY_!D0XFZvRt%`Qa-AbujDUD9wH1 zUu$|5I_6_EObA6G?_aw|i_U)%l$rrT!jGcA6-fcbE>@&5?o)D-{5~2q*UJzC=iq`m zK$wO-@bvjCD^ztV&%u71GGx1^5YC7CB2mKsh}3`CXtJ-TP+-iV^X@$M4k!i1UYS`# zey9MraWU56VaX6&9L|Q7>P#Yf9yW)E2aComApw8{0D=}a^VN1%Q61G{JJ4r2s}S#} zuWTgk%@T%q%9&5d95XPQS^7XO+P|jkoblIywh&@%COe1tVO^hM`|!A@u-_7|qhTjl zoM#t;APbgLS(4IdVXZ!aGuqB!Y^S5m7#J7;eQgxbc-kZ7`HsS1JD-x9JUU!rzPq7@JvfMrAv` zr3VD$B9wf#c^mxE0bet$LB}JO4zCe`hb1C)?#lS`yRZL>Da+UaaM6C}Wqyv6+()_w zI^*JnI2x>E4-B*DI#Z<%A7x%d;p6^{Yz(F2kOovmu=$&_f}WrS6$*{r7UJQgu2bT~ zlm$Ew42mhB>nmiR0~C+$@^iBX3u(ttFfQ0*`?g;@R)F*+kQ?1Mu5QOwO@Zjs7rLkp zWCXrFWe3P!2Ua1+6m~VOfCvMCy7!MA{we{f$YmO)!OACT3KC3F2V%?Th!~;MHs?1` z0Ejm8x8>R|iF+)mp*QqpMk6%-*N)9r7UftKZO|uY6|uhGv`Y22q@(z3wtWjDOf^w=8jH?~*6Dn&_NrM(Nxz@nLxZ)Rg)NC*t_Gk;4{)d2cb?Q< zcI*Hv0?LM9dLPfP&HP!>1PmJrARU^Q-R$*k;#9#OfP^66$ex0aUAN!P1I-jqqe*Nm zWJKtwsr)kw*yo9)!|bijjd#I@1E;r|0cAipKq{2R?|d0sz|!7c@X4S1P6Z&KyRJY%@5ZE+hsPx(8 zGVKIs4&od}D}ARTvOrnywp#D5!l4&JmRlzz6z z?dy7z@yXdl|1Y8p!hj{Sg0=FsHO&E$ECur~6OROp+qd2-DJtrZ5lGqX=rEc_L&opK zy$`4;r#*A88aL;9gw4%A$c@ z5DaXlb?QX8yYbfqk0@06091L}ZV=-sZW+u}ZCycL1NX;MiiNJvYrKJPplWlsDd#!g z^_vt8DKH<_j^+C;3eR|9|B+QRQP=G^f$Xex zeO@c#BX?amtWbzCl2ci`84x4*iZQ?^ 
z=vRt%`gcOey`l~Z3$S~+tj*JF5$-q`Lrv5dXA8m}Ka|vWqcD_Z8DD*)^i{M=eO$H2 z$HcX#xKN&^L!OVdT^o*sI#?(bI*MIBAKt(_JJ*~zMe-)pibKqbae(6txRxDsf+TMD zjP9?bGwFe7^OlkdWj#MzMiLzOdHrOl<(=cawV1lLbJ<=zeWntAPVeXZ^wqxPwB{Gt z%is?q4+V@bvuml8J^cohg%_L+W?Y-cdAcV0UQ=P@9!}p`yrAw(xtD_Z)pA zSf=%S(Tzw{jd^&m%r?&Ts6pn0FEWu|dPCZW;nJMNNdc<2CVP?KHTp2Qw($Jx!!`n8 zuZ_Yjd=s!Ro_MoendnUU^Jf~;QJW1ZJEW810(1kmV(Xxx6bxKadSZ+VT3W=c;;;lr zGJx)LksEP@m<^b!cxbY^sFHmXR5hsh(sM)atZ=>X-j%NlEj*a$xB_JyMK<64{uS%>ijuu+sNaJ3Y+j7x-q*1GuN4yHed(~Ia% zr#?xVUMz7rhW?a+N6;=Y=|UwpTq_99}s(I>DZ8G7*ggFREIKhef8`TRa^QM zv8%HSdgn@Q*fL3+l=9;T)rWh$x!l(TG2i}vsA<$VZJ~mFv7)6OwK9_Ij_)t$K*1Ic z%=CFC`q=jpYq37bw}%sWH8eF0{t#Nw8ZY+5C{i+NNR>lQkQZg<8hFtoi}Fq~Aht`f zmPj81VYC9Erjl4ex(`gUWS@l=ex%(W%W#eiA`wlC2)5(Y2vS04Ig`}t=#oVlC9Fx? z&-IuI#-0^^PBucc&$MeeHT`a!!k#r$%5RIZp4RXNb{AjgN{sPnhs2TpKl8xDiQKG3 z|9;qCMmKOOJ2ODCGOdJ8k?iJqr43zKeaS;dI~}SBM+nZtOg-$(?3IT(#Oaq$u(Caw z4^#av48L$Qxj_55 z7S2icR8P_7SFb6Pd_#K5s?7IxuCxtv7G$4P+fmYrb#)uxsHKX}Q(6Iz`jPI`Ty6_% zBKX`>L%1#}%ae0AmHxHsD>eVQcPNU2oUv+TbnAGSscHT*Q8Mz(ahVhiZkpSDZ@v}u znC1|HkCd&oc&;dtaB0>r1`OaJ-H)gj(a(KQmpg12<9dPh^7?pwc?MY9$FTuY?X>Ll zZ*(TP?hT%-9PxAI^E$Q>B&j}+JqN7DE8BvaIqLgoVIRxxN~3x1;Kvc>bgDX=@pUug zHyU~`EiB|Emk%E_nu}}b>au^@v8qa_)0##&wm&c9L{L_QLwzo)gwBCN8g-)o#0niN zC9is9*XqjX3KOmjgjv@(8`m@9Y^*X>ze`HwOo#E~cz9#(lDxvxM7wcf3{tGb8|`$! 
zp+V#ZF%izn_%(`dSceb>OFDV#5;c+k3my6eP2krb}D)~@|bTb&2kBcmb-lll#S3h_3 z#?WLf)>n-u3}X7j0{&?SX#;)ehxx@AY21wX?V7CY!*@$xa2!d`ft*=EyoJgV_0&=n zllUXtxURh{je<&xELD9x5m8HX*umAWpzUvnyr@UYQ28>K-`%r z&Nv7zCrY~T7O?q z#f4_;cK5OSLZ*^_@!~ZaRKX6An0)oab@{T0pacSH z+KY-gY8pN}=s58>C)r!01tB)2ZITC$p5^^QNGfNGyeKBBol<rzO)?ePpC2&y`T;kOR~Ln{hjzL~rIqWf!BbnH;Tr*R60^MjGj${{GsJp^#u zujm^~%d;EiJ5GxaiKXsqU@+)M{jn?0m3b2m@FxFQiI2cgnU#EZd8z35eGvfRO9wxn8|`bv&YN_!dA| zIEqk>jJwJ$cwJF#?d>E;n_gm7;Dt#|lo_#PdM}Tv>FYBJq$v$%)YEn=1BlOi!RjvL z6cjHH_6mWf6e?PgcO;SB&C4=Fhw^Jec zm5Yl_;bLwy#ml=}IaH&H(wLZ8ryuiYpRNX93Rwld-*l^A`;zHtC}2CoN_`_njMuwk zf2}2A=l%=;s+tLBC{H)p7Kz?YNfeZnBtmIv1?|4}9xRB6j$HSV&&||X8w|^Kr>oR|?M`!2WdNv}bZ|R1F^XAWWp#Cb4_pP0daCT|YbqD`h8Aj?5&iS$NNEl&YHNv?&&~v=+KU&Kml7{K zt^v8@hxA6~+R4%%9$s8;7{-NOii~39ymoa>^f@}ogn&L&yEJ9}nlG}RD-JS!{ycuG z`@c5@urG>afbiNXexm`gS>nP_$u(?%2^;EN+<;W`$Nw)^X9nB=1T>Jj#fIImvjDT?5Gocp}0E(TjIC9Lq9T&Zo@@R z?NV=&4#UjC-g_vb^{TCsTpy|UbCun$@H5cc&3~Q-y~PVNn5eAm>B6P>2%LNp6>@=& zju9>8prr8SUHi^Yvggh|KKgzqv(-OX${MV?V8c^@I(J8Vn`BwBQMIx_;nm^T)|cbs zSfluAe46i7a%(MtzU+KjDHT(H&*p!eZxu+DQlIlUJsDxnj-!xJ8;*~8r*r7%3P#Oe zdRXWH(#fBU3<93w&f3}(Yqt(uAwkkHHjHFsYAMBUD+ngg$6bX08Krd3Xr<_`L_szj zlAbE-NhZSZo{kQfeuKv}3{~n}s$FN3G_sQ~>O4YtVszFTwaL`=Xq_<1l>A;sR;Pg7&K|L=_&n8*lqbYQY{=0fb@9y1gNG(g`sGJh#ao_XL5ycp^$ z2Epski^(%}vK$nH2RwybIZ#WBS3-I6g>T!`KJ7pLK?^&CUk83LJ3B@P^x9IJ0J%C@ zGHF9-@jZGfRLP=?Q_83aKY{uI7)n?>0p>QP#Zv=vIVTLN%$U!h(O;79wQG zhDNqhb?xpF2reY&qZ^xf~=ytKNd7<)taWO*c^#t=fn60Z2Z z`wo#I{PO}5MBrY?$Anq_xEDHP7+&`(dwPKoI!hK)$MTh_>vVPmcTApO&GiElJl~Sg zXRXS43fH#*ju7-r1V#xrB@ z$`*MDt4mF_a>i++VrT9)OX%h?l9S7OyLsM21^=0K2b@*%(#1zHG*s@JPdt2GY>Z!J zo#X+(Gl^y5s?z|!(Eb_4cf!fYbeN7{DUsQ7-9)MF_wBCvbs@mh{&VD!FpHUyBZ7;b zW;!D#1^S+Vj9T_#%oiu)JHHo`by|_$d_^hRWs<3j#OM;w@j?DS*{K0}i>LG63CgX2 zACSHFzU*cBq*a7pW;AJaeD9Jk&)D85(~)FhxUd^cI$8BK6{{2c%WZGYB*vh@#WLbF zBr;i1o|~PDp?WDvi6UCT!Q!P(NVCZ*o~ZkBke0-+z8QW_HElf(;LxnPbzi5vdb!Bh zIRP`euq8R&j~@#vo`1EUEetYmw9#)i%<=LAjyYm;;uR5LPUbR7O3G!noM>ZX^%%)X 
zjC0?gvWs{_8%y^;9y!qa=tWU2)}Yd#$uKFZ>do0k*{#~`kJx4g3_MYKR)HYj#k*E~ zrE^Yl{g5Hio432ujq0~y05N)-j52Z5eCU4i;(_lD#c_eklM+Y>)p2jZqvM$}!^{-{ zfJYirpo8IfWPdk*mw(dTzn5!GNyKLE;0%8Hb%P-n%7aQ*r7w>NDQ%~#)<1UVRRTNDK1^yjl=qLf< zhjxP4#vguGlmV%t_)C!j&>{8zXwped^g8lzRv7{+*=}w0y*VC?caQ!uNQgt1pP_Do zmk3`z4n7QvGd!&#mu~#3N3}iWZ4i?gFsyBSOxE@S?pcCPaCCzo(^q9JxGzG49kqlp zX}qnoF=X^sjEY7xE`AiO++qj0EEWw+f}bluSQoFB+Uqsrh(;Cy`OgPbL9b+~+dp*2 z?99yY!={TS?`z8~=NA~K>mI(easrA||BNt!*BtVIGsS9K*Z9Mk9-z`D(Yc(DtZ_Wb z3CIfLtdfvEu`HFe`NF?5=Y}iwzJTsqjhr*s6v(q*-k-I>HG9F;`tH(X56oLp@oyUm z%_T4%Y*nmh(8vGxw)wq_iowAC;Xq9t{r#JNzW_c3h$b!FVLQL$@&D&WAoPG634uD3 z|9hH#|78y!Fkg@MVjQ0Q>+%2ov0M_!s7t{X%KvYOe*euoSwv|C~5B=X?qXi3SY++WeH(+L# zHtnrpJiR?)fG;%SlKC5)03*s0JPfx@ni=bkM7ZVVyoM zKN8XpRnaNqD2-Zw!#CeI_R2lN!1H6ZG;hHqO+pGUm~FTsV`XA5FgJmb<;_ZLNS?Gt z@nZHF&dxi^d)Bqp}K`96UaaBoc04Il+5NqY1}jWaN@!# z)Obr#@}%W*qw7n4%2@IWho*ttHqC4-d+y?S1}d{)r}JMI7|JEwg>lIOVudHLAKluq zr8Pou>jl;4+==g3t|oF7!hW&bW4Cxuv&qvvTs^#%WMyAKAo@Lv z^qKZ5~55Xm0%fFo>FD?Yw=dL;WCm|{UOo9>p|07umv|&_TW_xqDI_h7j zkQw(7QlAx@SR;CQhvK>M3+=+7`$etj*pmYVj^UFAcHaHM7tWb!oUQ?nZ%sTe`kuS8 zbZO}~1nmr#*fnkFy7k0tuV0n+f5>5;@XNiMcq^n+f-wV|DU5XIwR z{2iEBE^;cB|MAqwgej2$N*Rut3#JS%8Jo2xZQ%qfSjm*I5)~$QG8U6U5PB)b+q5+t&L_+Q?SDDglEOK$UEX6LgvJF>*xi2 z8+mgl!yV@H(yGbD!pN(8R5X^-3JfTB`BwDfODC-yKw_WtmysHHJZ8(GkKHy>=g9Z! 
z?OCImOA22o|Ey99!6@S{OW8=U_yuq4@$>+tPR-X;dtSJir}OC|p}v#8Ph=^jJ-Ep`n)1@VzH``}p2-SjF;lR$ zR1$ZyvTqAnW~)eh|mhHD!N9wXx<6~Rx)uB4|*~sWn|vvY!Zy4Eh^J; z;dMuM8vC1EteBowkTzpP@!QS1_+$c;N$~U0kmhNUdt%^U4)eaP&xK352Br)74oVEn zFRL#u4~Ptwze-z4P)flc!Zd0)kMP@8nO&`#oPMUbSk*j=`1Vmm#9WUi_7xZK*zdgU zYk2>M59_&)0!ntZ`e`@`YZt;;o(pD$$1NRgHd?iTas2)EY8u#Cq7E@CcALddm8L;w zsU&RytRW;yU2BT=^*(oVv(CIhLSfsT+_oaDcW3Vw3)>G^~|DOmf?qMAqC+DN}hF`YQ}b#Pq9ftC?7`jEN|7%!UXWj)+qaHW(}sf zJX5PoUQCu8*`@U2TS^`juDQ&6?9#bfBJb^#_xo)SUi#|R!S(OCFS`D)_7@i5?{z?g zNVVuIYfB&X|sy4wV&dkZ%)J*a%7G7$(FZMM0yh z97vTb(+lZN&`l&&mK6GB7mMFlX6HlifAx)qeEBE>E2-xyEIq(sh>3aKNX9ewC$0O6 z0EF&4xJ@*lu6dMS1L<}o^oBW)rmbMel7F=YrOk~%;KmZSBJ><+0GijaeG?1A3QoC~ z-9O21Z=A;~ZwqE3+Ac(O|H0KzLN{;j!uiUJ3Yam6TPny4nnd)0Gcr|orjD^Ymu=V8 zy7b~#qN`(~aoGrWxEa0t`M)9vV>;CI)XL`ke0fT|rVb-ewas4InBacAymXL+?;||4 z;Aqs(0WG(exXhKLAcJErf7U(|ei-DuJDMd++efUrIUFI#T8I9ocviJ_oPGtsB6Wu(Bb`;{~|R zCU+Fo${dekYcYC!;T2^#i}59z(XX#p-N(lB&(P1XvZsN%Z9b;nYJzMTKWI)7m!1+;pwi8Chu?DC1oRQC@mwC&uIU{)sti1Q zX}WAsTIlk%RnJyUvtOf67UYeiO0G3nwZsgpNKhK(nss>`xpUSVHm|*AhU*!+geAz@ zp-3DZSl*Z9iNECNW-^Uh6s3ZaWYU}+M{Sxl+8hw~X#DHh+HS-#Mk7S-5Lg?DQ@DrcGEi{+5KNAjkZ>ES~cwVY{+~XvUUp5z+NfQBlSFvx><#g_lpobq7A&oRa9dUqNpO+l(<1a zH|C(I>yx)*y{BiSWu6`kz6IajlnIEJ?kl)*?SxXk5gSkeg)N&JWPBMIFyoK{@{T@- z^#^}%bzpm!+o0}Vt}8Yu-{-7Wu>Yh)7#Ufo_w}da(kB+xefGF#&FZu`Kx^eIL4Z2G z=_JQwOs}AzxC!c)G=|1pQyx_2A9cY2@MoRi1KO*1R<-U1zF-tOXwnPB za+tVTvih%W|DPy$0~7l4vV5nIGJiHJ$e>Deq2|THwvc%TJ1ES7_?82Lw=4kD1`k$c ziChgo_>LE}C=2OWdRf1(<(MQBr|wg-eJEV1Y+jVjo3FCwMQ15# zvkrfi-i)C*Fa*t19)xes<{D)n-7cr&@mc zD|zZZRfLl~?Q(eWzeWce6`UGrO6hRcIok1+qkUf1#mOqan90px!WZ<(maXKCDQ<98 z2QM+}miAG<6=|4=5}ZR?+BM!_Do;UOwZ&i*-6`yEdxk^M3aopPQ#f> zl23a7N&mFqM|wJG1-K`ek|y^jn(Ug(A}53DAma?~=3H{6YCl}$exq_S!M$Z#X^FU| zEPE?2P&%pW4PaWe$hd1-4M52s3PME{mI-+RPVl2%bHoHaZT)alnm=1MX|fmmn4nXB z-=dYcxWKkj<%9v<+(lLlNBn&~FR(XCam)vSe$wv{;Exx=N-fI{^1#6=ZdbcGY?SUi#DL&rXF#z}YD8#K|+B z+|5KKoU|09E(Hbm+t;Bx?koCV`GyVCWH&g6oU~npZg)?OQ0N0q^G|{x7l$^Jv;w8- 
z+;^9QRWv!P`ZD6GQE0OvmZc27KA?9~Y80pZDf<}($AA0HR}M?p6l zsD1dLOn-5UUcpP-P;b^kkl?Er8VUZCLF9BX23~ANcD_8*^Ffu5TmBDG?<%TJIM?Bnp3Ah{Cy*7IC zO9{q~^K%h{;3f0$I1p863SiPfp&&ik_F6#dY-C}9tJ-d!*I6PCb`Iw;6N1p`{Lv_d ziXkdmD(-2M&;DM|Mbb$2*Y0~WOu*SQ%!{PtFjN@Cu-R|k28;q0gR3GA$r;5L?gJBg z0L&8xLY)i z5ZsJkym6w)Q9&7%c{!H%W2VY_{Y=4%OXAwFRpcSFUl|mqLre(os2;Ymr1-KHfLYOM z0|owXBk^nFuD0Pvm6iiM1X#f`~Dn9YJhY69eB0vg+&Ob)VK7Ksorne0}Kd@6L*7f z&Ua6KK>jY-LA!{cn+w3Xd%{)9Lg<3_3lEr>n3xa9IwTq_<(dIPdTL<~;(TLPWGvEM zKpXU0G(!kvf}gXh-E<7O08X}L0d%^UZNPXEKM2~-lZ4=CEs)b-n(G#!I7ACDDF*4i zUJnr*{b=vI91Z^k+R)e!&q_;gyi6u9EP*J zkHaSAO3@E>#tcrOFl#G_DmQ_LZ{@7&**CtzN?y8$CKP>ig-r7d>hsasVT&4{G8~vS zZ$w4%y?1Bgo8Ki!=pfj_&Q4&EuEvWNN7p0>bQ4_Ejm6e61SVMmR0%^y>7bObxPVZw zX-&f6YZ^NExme?XoW8txm^jc1c@Ty4J`y~uicCf?-WExRw^AD$MC*Sd=>KSKGZ+a_ zk3Pi6jW_~`(%+n`D=RgC7PA>UNi!e>p9}8w33;qQDNR5qL^4W&+Ag@@AqAX5VoPL* zQ$dJ3*gA$!I5M<4^vA3ep)woEtZk1U)%|k|^Tr^}1$jJ*`Ef_eZO9iCI8Hm)x^}Mu zYytz_Gpx4DaK!!eOQDf9{T}HEK+d9FJns@j_3T6(2Gr!G|3_yX_~$ z5LokJ2sar4{*XUBdaF%rF5KfiLzECdKhpbA2NZe;@}Po!$~~@^Hs`)DPoN)U=|}Tv zDBh3D|Bq&ZJ|lF)m}^1PkmzrN|B?q;D6D*W;M|4IUq?g|pMa#s_sUJ!|F&W4ae{RZ zA@RMy0D{{%e{c|L2Re*Z&;Hj@Rek;J4qRRSY#Zjx_cmUmf{tM7UZa@4inh|_sn?L#{ zJ#s+6a&lC9lKOY1V+_GuiWZ&|1FaU}jYkFpm(MYoa9^}+#n zSxF2FoG#>OT9^tPQO9Y0qzqDdI95kI7(Rrk5o^9YJmf+ z!8u~b{<@G}x^)Q`DOXbtBi3V_w(rS= z1DH^CPqOi2=m6ma`ESA*L|{9KYrMk3Lf+NocI28^&SXa|qz!t+GZAYssUNcz+Yq5l_FlnVc8vGz6bak`(oG>LEDZSo9*(|L@qFjng z(&iC=4K^KZF3H2x(u1pmW^N#U=(_unpz=*XJ#9$DTqOAlagVg52vgiy-)H?#Q_O^9 z0ygn+K%zVHuPee>4nUB5Y$P!W*g-7M0bd7FmH>HbP0>nA!iddmKmZs3S29p@x1%rY zHH5A8Xl6!NeVTZAfEa!B>52Jo1Mh=Ci_z@!mc{=)9K0~3IbRF#%}9s@>>m9Cg5?hI z?h?9m3Qflo7=%XHsiOw-VVCpJ6Y~z4n$RU)Nj%;f^k#s}L?K8|839HH&K06jS!0*3 zVJOVWp=a-^yrg{W_ASaJ9DYA2qW^bWhc*yG2ef`ok*_FvZykXrM4Dmnm*GAL;0rhM(Wt@Sx;k8+kZCBUq6A zBim!lX7lk8)c!A+L*O-lZr~9=cwCnH_qb?J|r+SRW@Lqn;1 z9-$wBC27CD(={^6G&bv3L}B78vnjA44vT;txEs88jQ@p_#z(B(bNN}MfbGFJy8wOT zErI39PX(i4b7^R$X`WoS^_!n-u9u^*X_}`UjKq)wlHx}H!w6CV&lM{8>VmG4H2DVFKT2ub~zDPrA}9(d~ibChh0*?c2eT~ 
zd|?J2o~d-tJ8GH}#j&?F#`ahNDekj|ZQs zW&nuyazfMx+Rja4-P%D9H-H(`p#&ZGYA*W+IPX{{$7H$r`HK z{c{-;>X@M9zsET<*3?XV`}VE8rDZ|E$nMF-H}rTOn)c|DSz~nq zFkLIq!UJ%fEY-4C19Uu=lLz?xw(;w4p5f9ovHX%a=WwZ0gg;E3-5)#P_*78ytTguY znQD7{f(vfxA;S~@yx>FpGupz>dP9z))kmUdS2x$RwEi^Y4NinQuE5Y}-;g*U|8r;< zd7dJ13H@LS<^dfcNzzfuCPjjm;Q%K?wSj4oeCn$eC6fjj*AG8YN-L5 ziHZcd025ukmrB`0!))jbA4Tg9&k>C=&L4=z))${_K>mq@d5Mh7@VX!YyxaxQYGt8- zfBcFmj2G7aMxODw?`i&;ZpR0c7EFljb5|kFF>EU|Qy6wT|E3nkYavH%%1U9ySF0>( z)v1qTcvjdseS%It)Ol)Hz`Wy7Oh`oOoE*Ys54Bk6+M@*G()55k`TZ#3y>+x^Ui$i@7SxJ7}dWm~n!Ws6GXEXV%j_awv!&`iQ zLLGh-4}Gsj!vyEb($c}2kfZBR9kOs~En9>16u|KNBci!X?*Wc~!isy;P?Dgac%U2x zLxqQE^*oe}cWz|TJeN8z_bP~5#n%&}ew2*cI#jS?tTP`BV*84!t{x|Swk%o{EfX}7 zD~w&=7538Cvm|}r{)14;4*vTGHDaRO@$SS!j>MdS)-~DL%S~-eFC6Y8=t|He03DU` zNie1#5z+i74VVUbh*B45kYY(r96uDFx3S^H05kd~f}%yIgmTgWhmAJW{ze40PyY?#@_aQRjTy;URm5kx#_hzUP=05n5~@OeRJNBz{^C z?krADkJDck2il3ggk>QwZxd3MFz7{s{61J-rOOchFv&k2m4VW1`neD>VYsSF{6?nB-;cCN^`5#BSD?LxvNPVLe}4?O|fUq5ZSjcrQ-xF_=+#DU?e^tY67 zP#y^zpT5dKzeiq(zXKgt1HK3`g8#73Qk|JlXJ_>%WA%D-Up5tT_CcLQaJu|JxM?*$ zt}6PZU({Y9W}%+mVBTFc)1YziSHR_7|GN@L%OZKDU%FZRILo~xl^r^dVkhmmsEsQ$ zIQ2C(SjYyqhAQ&rHBH&Md>3_Re%;U7oN=yIPoLc6V_+-r#-FHJxFGqPXq`}I=lM$*l#xos?#N;$9$^>ttB=htlPhr1L|XDu zTxTs?U-$yq_&QZ8{|GUl*qhh0&D8H|&4LV(IE+fE(d295Ov%A~C;~mG zLp++8AGAoKu;gp~+@X#D%0Gh`$kkTMuNS|L|slzwY5`Mg2|oi zL99XY0D-6P9^g})mgo=h{@R27AjeSAS2?)wFi%g<{Er_85n-z&0Ha0J6~EjuMEoig z8n)>M;5d@h@!KuzrI~<*1YK3*j{X2b(D11-d9Zpw#vnP$s~ajuNLiVlrgk$`-t9f8YSc z;Vl89V|rj=n|04E5FLI!bJVN|&|wmQb~QC0H`)Jb;_jA`0yR1rq!5{P_0l3(xrpLMj|9FXi!5P2{y(_#A zj-+t@{lkBpi%Q>__4wN0$iGsG-;MkK{tQ@W0OIlUhfb(}dno_=+uu(5WbY!b!y0?l zzl_G8k?9|wlwX0!2SrfKp#g7bR8&-;Bf^)(6H`<2=H`XoXUn0LN;lv0Vh2`ZMOd+W zn+wHXym-rZdbxOeI4deDs;{lh zUovluZ_^O%-%rO~3mGWoB6kWqxHjbXZ6=^c1mQ%!-ZWlkdcBqMtn~yH|9=GPToH^x!o8_TL*?+;XOcIpLbh) z0}SU3?>1KRIa9oWlz)ubb|v^8T7&ZC>uC3yZkXAs1{+?5rf)?&MIzEaQF6}1G_}_%lRuWWP1K& zdPqr`q*H4v$4HRcKok}TX3=kSExB4ZxG6Y)jh$r(%~}Q~&PJ!jJr)7GPjH-lHjBM- zGx!k4cD*Mw&f9ZEeD1L60dUj*f@kl};~MYc&fVsrk1RoqXYAO*YmdBv(PC>|;rIXy 
z?YgK10D#huu7FH@ZL0OAx?<1nTaDsm@e7kfQc6BY+A^(?P2)6X^22HV+9oHuMEp#tp6MtnQ%>KUqbLpL z5I3x*K~4~4BbRu+&e3GYmoY&2@UJTcS|skOt3T~GiyulNDYXIGjPHw!x!#1vE3;RN zJ&>G!5|ON1XKyr+0WMqy)NlaepcC8Fho#dD2|D^;lT>ma(vHO9v2*ItCOuxdRmyWM z7Q>%g2c1pwK#~p8vHRjgj7~--nwTGzj(G^TR;rK=Qml+d*f?bL@~=;s2RUkNQeh4$ zTclf3?%w&=e;on3^gv#myvJI$BaGa~64pVdR0ynL>sjWesrk|@S&d$ItwtP05) zu!jKg)LfuFo-q}9s5K5=5p&HTKAGUNHP#g8X+?aWNMQ(gu-UbKmY6U(0%)jvH1}*T z;aew%$LTAr4o>)O`U$dSiQ~Z3330y~nb3Hi>Fzf)Zn4S+)W~_B7Y2E&uWUt=Pb4H@ zwa>~Fp9=C+vc02TDT)YLofXmZmt1`6f=aYCi>Y4zK!)KZV;t9}7FKC{DqbU#LZDx} z#j}uVfP5!0Y(lS&%Fa^F9F1Ka3NAu(n4Mvnf3n zVt)Mi4Ipik8c6TkdK>W7)y<8JqrATU1<+}vn1w~glk&4CBO?Rn*JjK)t_4|GKLytH z4w;(yvR?3R_dLh>cqaAvuNrZ_1){gAcVLI-tB2{L`Xm%LU$*6pg$%IT|%j-9`Ynt;_~b3={3WTGTP7U5M&ja z!M(Mt!4v)NBLQKfD-vW)%-BkqL0j+{%J`U^x~?i&^p}!WO%2Z)$t_$4wJF$2segQ3 zNJ;+*CUd|$`e#4b@0jxcIwnpd@QTXHk{PnRp8@(YOt&|Sw_f=4Q6U4JaMX78YDjoB zT%AD}5UX>b@ZIxTLyub|)iXSM{1!v|5+F>Iqg`x!;gn|G}% z8H%jJW1qj^0|fY>>Vq9o?hy{ZcG*+(O)lg z*5hvXhnQD|w;x7hPS)uk!7}T8*aROMZAf`uaeixNM(qK4;AbuVd?Vu3pnkJLg_^I2 zBCgI+({mZSSez7g6UF`m&xP$KK1ApCsKUYHSY4DN;k3-_;^X>%DqwfPvhdv^hcm?oa%IoBS zE^-PU$(IpeTmU}1S^MksKnO+}^Y*g>SlymJrkr!CVDMxSqi|sP8cylRNI>Z567O(m z*Lx+=G$jNzBjhx5aU&jcUX=zrHw@ShglVITe(DFifg3Dcq!6>Y_tcavG9(<{0bqlh z8!wZ0{umu0r=<>qdqUcelx+_`)IH>iku5$bkcr6m6NI(aLtc|ztK!c#`IYGC&uY?| z^XF|`{VLz6@2~i+^ZOrh0}xx>!5i|u2{fkQdvN5(nub3Upu0x$c3}J#*B$MP-Amt) zymBD&v$h_TSjrArupR%<8T7OKQbnp&)_TL^X%otd%Zvhvoj2uAZE=(Z0p_l+7GZW$ z%lQlrDpHLSsl*jsoaau>lwPitG7F7FJl`H;d1cnTcPAOs4Kr_~CLIubtAM-P>%)9u z^R;~4gx7`{I|7)u12B9L*T_sbZ1|%hXj(kFThM?mvAGz%3OM0k`o{ zD)R`alaL_yq1CBMwG+D%r0y-MO)W*=JVewteqOo>$`8Hrb>fC!4op;pA~RK!YUmch z+QDR{nhw~Pu<4u}t^Nz`&?~!E%)xSu*O->~2+#PR7tM_?;niaJTG*r8e#uhxd1x^8 zT)C9+dD1coA{(G7PXq;4)W``q(U`o5e@4{Im>0GZ=zm4VKmyE zLap!U&MOny<{(hinchm$t@qXc{gt$h19itb=SO^a09zaxJsSxu*&yQaLGr13`Q>rF zf0HjOe3KF8!x1Iu`<(e?{_BaEAlmu8)2h1lMc{$B(uypR4T*k|~4@6*6g1<0hVj z{8nHFf?6VZz_A(k9|!{fP$>QA2NcnkYxj!&k<-)E&b?_x9|_LbsTv<`f6#J0SV~&B 
zb!N^Dyys&U*{d#c6B^5voV`JUkl)80!mrb#@X?6;5xUD5_DpYObS}{0+Se$n!(=8g zv%MdeV`3!YLqHlQEEtmzMnwuo``wxKy+D=?c#?nqCXpGhp^36$JsfUP z`>mj~Mx$)X_E07^I(B_qNU62lo{K(!qqYBS#%CH;u8jM?6?pF9xw`{~fGQIvb7vGl z0i-V1JDLuAq90{$vS{)gH87nKxkr~(R329~lf7%F)!;szWO3%JU{gsJOywyEkM8%X zhqX|Ul(ogZ43=XFS9SW9Vq0MZmBCi@fW4v7N;Y>b9mH6$_-ItREd7gBV~J7k(Wp>i zBWU-JF53V;^fo>sbeWeHhAPmMb&1Lu>u2ifv}8KkcX;R7Xjs%QY}rRVUY-fMM)$>l zfw|oTs7~uKhx~U*h2imC_9AqW{ubnzigfrxs8n3Xq`njEWYJd{8c4Gf=L|_-tKB>Tu|O_lCDs6ALqut7Z2} zFjv&+#kv30Lc4Kg&yMyr$@3Vawf#_rwI+TNVir->mlWIbzr{y!A|NQ96jv9QK`KBA zSy?f5Xq=ZOfAr`Pmq8(mLNc3zd&VDO6fkH55G;_0-UN31=cjFD9w+?I6~4_!iIXc` zOL~10knCV#ZorCRV2~0U>gUaV9(4LDtc(tt%h1IMb_6$Pum)pGHaaoaTE z!@_m#4_LkgcBlY28(-Yc8^Vq=JJsI;yHVhI|H)> zGf=Y$PvT!_sZLzYdB)MDvIeWVC@>KCV@m-$DJ9MS%a3%t`!Issc7UN(URkMz9Dvkm zv)R)}`NvZ*1Ph*(rW@WZ<0Tk+TgB;Zi-3>#eqDsUJqY__9a;cHsr47C5rleC7dhwa zo(UlsR+sY(1`-U5gKQROzeI4-d?xN7-{_128-+yTY*mo)Q zL&x_YKYkIXP3(X+bP53x7a#~)@34y6-OfjH$o@y(ss-d9hZUm89@0ozAAq_!SYN%c zB^wo$mQ$vn6+p`I6Dp?>3q>6Qr8(`D)%hPx5_9S+Pt$Co`ni=cJaH58>^CZ)kd6wt z4xkZ-#=-GirHhY*XetP!`^OL-IRmHYP*cmVqchGw)8w{ape54cqN2EgfdMpr0KK)e zu!sYcgo(D?{juHtaPh#Gfb?M`*h>y-j=qG#t*@~9<>c3^5C|HKG$|=5RaiJnSz20} zmNfeC%W1lgq`ntl|A#$?x!r2C;Tpai-p z%Gk9E?k)7=%M`}@YoP%i3}(1aPz+RtyJgH?LC}`T5&m#tVOx;+|M~<2-U8`q`dh_e z@)y8(IdvPx&BAwlPBK)x;&JAXKnG9R1BJX?5%DP8S5duqhq%;|3@Y|)_s$eH9Yy|p z`ask~c(2JT{1jN zk+I&=Mie0H|_DJ9@&y!$X4)@3Q;Z+%N*Vm7 z7url3<-cNArC3#$i_j8<|ovSwiiF=0uTADl_zs6*Sbm2Hw zomzmzCdcBc?vYZ0UfqK0GB{VcSL1 z?<5K3{L8c42iA+QKSDqj+KRrU_+UYOMUlwySKGD!UJ3C;C3?iUVvMFTGA&X7B!;=> zimn_$*H&jQ&sJmC_m?}RQ_g>Cwla?hou<$5Cc=*Z)WI&pQ&@m^WM@~e6m8*C4F^mWo`YM&3hRW+2Iht(YTG5fYG>X zcdEhOhy4uFaR`5Kh2!nb_2Cqt^=o!zp5-$M&LF`Iq#ZW#f+Isf?WBQh+83*Ro;Jw1 zyn0dHCdix1Eczq+okO9G#DZzR;kr;>^zvL$+XYR1Li9%wNct+Pi7p87ZaW9TaQl}DVOrFi9*P80_+tO6eP_avW3%YcHxhXOx9(J*j}+y0t8{tFYk+i zZ6y&~;?9WgXz@5E66tqIv;qkyhFMMyl_Vd6rna`0EC)o8&5;GZ5;ZSSztq7l3%ZJS$-YBv>Dghjl;P1{3K-DV@w0c zn)oY^9=t=(g?5zeZ=kW|E<4NK{u0tDi@!to?}BPyWQg6;h6E!+y7%NF1k3|Dgp#i8 
zDf~oJ`W|@%y%E)_?8qKNGZFn9kP^xPL4Htyv$$$Ud|@aU=A5)2bk0F^6JjJG5*7pA zlM>0F@Ha%ufmmu53!h;uqhE5MN-B6)JA#dENHDMW-S9eKJ?%{-_?y~O!7i(NjFR#Z zs`5v5ovX;p3lQWlWV`r|7%Q2}!d2DP1#TRtsAVo^4IB5k=EVMR{~@BokwJZ-Ed>j7 z%GUdRHPVGG@KI#Ujq&5|Twk1keJYW|NRb4BvWO6YRNcc~~c+E}(Sl3D9 z_z}FweL^4hWEYC0@?`!bx+lxtR)U``!=tn`e;z+M-53{SZzmwrug&KTYnk*xxJONK z{Z_#EEy%&7i91>`w5t9B_874~A#LB&HCONXiVKF7f9j(zd5&GN7|qV;NZiMNeg0mN z@&F}qTqq$f-f%_u)bRZjJ(~9XArShTk=J4XM3Oi~(0&hURvXP{)Lqn%qu>R^pRMO@ z#^|Ei6_TW2^T|`l8Z=wXyAsqm!ME>*?gPs&$VAH-##?<9fU~NF4DInZv_O~n9&~dS zQSqpK43Eg+fe#0oFt`#V92Mk4GI-(txq&Du*kn>Gg=}+Kd$bo1`Ef68PwJK@@nG`i z-vSOt+&)VONNGE<8}Wzf%{Nv)9OniLB=HBM%_>IZGK?IO5?p0!A{jNreMKi8wjVWp zJ?I&yxVPcirQ#oLG*6c=vE$p6$~l z#6G~_T*V@ejgDy&G&1qX$AvISX~exes&w=3q5JAUpO;8fa2cS8aeekJ?3^9DOQe{} zmTOx=1i`mzekCLo#wlzs(N{^QZR}H271iXHB$_(L8x#3Q5((7uypZ`=YygId>bx#V5EU{?qwP?@ok1WJkDBM(%YNvO#&;<9rV z0dlzl_sZnt$a~m9IS2*lWJ~~k` zji=w>`m4d)eWNghW~na6*MKqTbTh!fJ^ai$(g)NpT%1@zjl7HAYPO3lHX8`eJ9w@b zotpk4xBa>M#jhVs4v>yTGP@?&ol`E>sU1cTug(Fp>Nb1chn>Up#iu0;R``qNCaC8#Q=l~yB6PoU4 zzae(k>0P8`MfgFKWFL^DkFE<7hA_ZJmjw}wd16pR(0=pjLgG*wZyLK%m~WBy1F=!O zU(l;|qcDh39Wu;Q{JS7PBv+^Uj7Ge?+!<;Ll;_mGMmr3B|GO~Zj40H+TTJq@LkONj zo!Nrwo&5uCbiA0gt!(>o!tGv=gW8+vups7N(ZLM)c3d#xhTL)Ooj3`y`NQ3><=$pG zIZZxv79BCX6Z}z2T5FR7ZuB)ND@*s|>i?$&1}y`NmIt+k*t|pKz|wgY)aQEq^Lgda z^R?;FtHNR3bc1ASTGM0+-4lmBgRKt7B%u96SUXicKP zs5S(H_=5y{vpsmN2>>k4%M2JbPqQ^z)B3~(NXE}7R554__qlbVLk|kANXJycQw>lj*v3mDek%4r=5V2-^|AUNbG+* z3AQ+R#Y{Yte;SdN5qS;lS`qjq9>~DuVfSvLBj{kT(DbQ zm}rGaZKF^F*hz^GI`~p53C;=rK^yp6VXMy+nJ>)=V$=>fU1);52E4%lXIk^Kjf(!E zfI1MjHAzed1CRS|W9@V(K$puz+l0>25u*7ycd(fYg)K`v+#(H|&^C3GROI{@hn zLsk(Pcbh+wt$#hPxJr_zQ-pG)S;7wWNSWunH*~y@_avy zT}d=7BxqG`6&AbWoGTfG(7MgKV*Bk$rB4wutrCpy|J3-^O0&q8Y}d%19a@G|rEL#+Nc3n~3H zBvZcjgwiWUmRqquMiO~RjSY460{b{pW-oeH4mZyIonvSfj1sKvASgSojdk<3Ol)k) zKz0UTAc??OSpfJ5advSry`*!zrj6qU0U})v^}5p8GS#2Ed7=V_c9djurW&V>-1V{X z_dVw}IFP3VDeZn|OW1(+X9>$p{t_=GQig7Ji(y8Kq;uu8gYl&|#QHqs2(PN%g#mU` zCH!o%EuTGw74;nrC5dthIRb)>|H77{3zNBt&)V9WO9#*sbHJIc>f$u8E?D#cK$}73 
z?EHL5SMHqzfERBO5!CH>hSW}lWkO3S{!;gbW#hp|sWxx+{MIe0TUPqS@KDdDl1XlR zpdJ)(Meo080s;Ty((Jn0`;n;`pCD^}X`KlfWOtq7fD=DkBY$98&j?=5uB+3!>pW{{ z(1yyof7JsS#pN#P#v)NN1gz5-e!=(-*`AR#G)0GNy7CYIvtODv4i|JNP5v1?`|uMu zv=dfPQ1DDum7`jWq)J!%;U@*wAb`LkD(mF!>gpP?xaU_?z5)=fD+JQ+w(huKxrSo1X%h{D&2T`hHyM2Ml-^ z(V}1l7tC?GI>*xh&iFQOa5j>Pra>|wJ}67gp)7F3b$``k*U{^xwIHLL=VjrKcD-|@ z?4lx-)Sp(}Il+yn7ngJd80!Lf`hUYh00e05u873vH^|WSE(t=rL`oO=Cl??!l-+`m z^sU|Zayn+}xs+&-sDC}fQtUAv=Fb@{IGit;|Bi6P6 zy@|Z*zyfk+KSS@`K4s(s_c`@)%8`-5&F6n78vTKMi7MUoajDfFwMUBSnW^U@7{khq z>spA$A8js)4;wAc(YK|)toQNr_?w!+pF8;+kT3)&*n4LP#jEygTJLK>Q3E$2n1BlX zXFV`n9l$%1R`cCGj_BAqIAHew64?CHtiAk0D#N@9`KvwcJ%>@M*URm;G~e@ysKqGT z>m6~#$+v(0d_qQs?vM0Ao0p3Vg8CP@^v6xyMOoN)@-+8(9fXsSjO?AAC*VkSi!G0% zqod7$5xTnelpW`vdxdEMSRC6c;JY85yn^hrZkxYFKz}{tfOR^6{^N0OHuoKj9xa^* zAT&(Cc=*6T8q5;IbO+*Yl=TMzSlrm}nJNYg1XO`Ip8)K%F?=ZgUqkcc4Zv7ZXyzQO zV`VYdrH+RXBm>O5@%eclgFc6QPbn(n+o;j&C{U!=zAb!DsLa6qK_)-PoA8D8Q(pgL zQ~k6ix5u>L02wduAel_SI{KYi=qgo7y;OW7;mT7_OOzF7kbyZ|Pue*7jmhEL%a6l4 z&tKvf-)HRtJVfbf_e)6>nX9I_4Rw~)82ZBP368QKBn-AvM2`HIfAvWH`5xj#r2&du z%2=(r2eElQc$xOs<+U{JuXFEr7U9q9Kt9v5hB=_QVdNJdI~L?)3HsaIjY%EKMJaVQ zlO|skdS|a>(@`vthK0*_*WDw;&sqNZW;Pm5680VE2!htElxAJ*G~nk@BzYB|pP5Vm zp6B`6Akr$orY<9?R3nq`d=YP3JS_{g<2Wl4=hK7FsUHVaXw2M5uk%7n9 zeCVN}p^Ew)?d|PM#IW?>VueP>Q<$b%;J4C-Y1YfBhV{bW52Xd%p&wkftkQd4Sm=Im zllnd=#8F?;p|@8Q5tQl-85XI-9K@93nGzUcTwO+gh(#ZzlDq(YD&FS>q&fNmWO3gl z92|a-00G_1mz4yUuh;?8Q)MeAlJj8}ltP7FCW*$pl3mE?Jqb^8l{)RM58+8@V-=tJ zY}2E((2@i`uu2**GL1E^wpm5iGaokyT@bFug7EMr%aVF_N9&te(}pz-=b|Rbe@wOi zHvZsWuFA0Bu1>`?{^ALac(fX3GSDIX6iz;z@YU(czk%K}U|9lRN7QOaMfp zx@Djzs!ru4c??bCtkW*1bJMmB-gpQ&s2wpSkiN8{7xjLZEB!~Vesm#w>@k0^!SU-g zJ0YQ-$DqJ7uC7f_uFED7{1I^c zaP$_`_mhjhucC$=IG(|fsiFTqYE%8wgK96(qgIY_!OxxS2}H70Tez#59hI}L(<6Nn zKa^fQSJQq?{4Am}NSY45f2b!P`Pd=#o0s)zIQFHVj&`U0D|8+Y{=I0>a0>?UKJ)ys z8`D?9@c@$v4`!#JLXN}^g@J>M`#UExu^J)Tp<^C>(?^-8O+)j6a6H^cY2F6;aQw=- znX|8l zD`Y%0yJP&(3r$F$+w-D^hFMwjcYGMfIH3_h{;Cbs^CjRJAr}UbHpk%0C2##lE!WR1EW7h+R{)t?bt+GD9!R5I$k$T2$yRt&M 
zL=5uP+$lEahk|wJgUysnC?cB6$7l|0&#cND7b#?YM7x@|eO{kw=RYb-e_N%DU7fb* z_&w#baH_7Lx3zvn^QVFlzUh^~6>Md0Vf&s^Rvl#?=va`yh>k3Iam#xVi0hyQP%x&M2(e+)Sf?!5~Xby;m`7M@EB{H*26S{EsRvH zhe*psuHzNvtK!1)*%;n8A=-f)5s{G+UvRr<0=0vHbQsg*N$~q{e1;GXfWo)zE+G*V z|0-Rnu+xPracNHIbk1abeZAeXuf8)BC-n5Y;3GyxunD9U(g7(RPUpklh}hyFzSIWh z=Z%SZ0MI+E;O%g~_u||f`2wfOP%5|UT3`G(adO1HK#J2rpMqyT$NP0ul4Ej-^ud}I zqYn7Shim;jem7TbDVsdxE6^bkdIiN0RH|Oq!y_+`)hlwhIuf@zvZ$AhYxdYHpBuGD zM}g+Nar<5vbpzbJ`Cbd28x?Ck?~iDO!;%4mx-)uDHN4cQrog|4 z3;EX(5Kjwru&1>=Eej6wAN>CDBa8SfUEn~>D#gj^9=GQOOi$a%?o~o#QBm)SK~nOi zzy8**Em?;vO=UIO=y=g6cMIupwTFXEWW|_7=iRP`y;5F{J5?FNKAxh*Y5Xo90IKXT zph`egn#Xd$0*oKb^1ZNxltPLkA|f6f?6jj%J;)_85k6nT_u9;G3&h7-W{a1d-kz!4 z0h+}t0U8xAdFR0?z(j?HMFU&O0GQQyvRyecJ6G>i0>rjp4PfL{Mf3H^Hvk7BDwPnF z2zxm<;&*cdr92e!Y7+xGhEDAD%oN@}S+98?4jut22xzM1K!Xi{qM_NMz>axa&lMp` zxUcM_p;aqwh&_80YAQW%o$==TmTeA?AGkfAK0(g(5}NC#^B*F9vbFOqdLoHP*@W`m=JL#Z$7?F2O}G3TDUkIs9$6bf`` zNs0{{#3~J%$FTN;eJ5xnA3RvTIQMXa?fx1$;UJbJ{IAKQW+z3LC2vAeHoaLBzpU*2 zxP7_D9X$MzQ;6;$=VA8=p&eqsc8@Dq+iWI4R&1x{>%!bpYqRT^UPwlbtCHMz?~Cn~ zXP3(t8}|L5Ec-_vA0rykZ#Nc6t>u7WPiy_!I#m|&JM)dab$gH-efQ82)OmQ#BK(l+Xl&o_66(g-_=cg01~&@ zsKlH$FeFpx=riZZ&yQuOXv=jDtJ^uDlo%_^cQj`E?VHQJTA<^<-*so!v^SOt$EzLI zM`{0nTKI{G68~IeSX5YOtrIk1a$*AHMxXwPCgRgf?_STI1NpAt^7~rWD%BuJ#jfXl z)dc5-DYu##w98qc0yvvD>&>P7s!d+ub*6nNjb%6dzMPQsoFv!PW^8Jp%80qmTfdMe zQl6ihX2eSM;%w&@mLo6vHGL=%ZJ&$n$zSg*lxY2b9w#r}zpEYBmpPwL=}=wnS>$OQ zdsy7)_7E<;=T1#o$@_A@t2U{YllHU^2kJ)S!QbXH5Q7l6@!m|zaClu44<8hpwYiot zj4bWv7uU%&C>O!I?X&(b8rs^Vh!&tKi5h?*GZSzhO`M-K5X#zO=egZjZ4-t3aQ4GGyn=`8gayBGoG7b%Nmd++E#7 z#HvdI8U$1dP;qe?Xr^GoEA*0mFMYe@bwlgWXK;+^+O>gRZmb^zQKrCLPrpTBK=Xs0 zt-T_Y%6A%|l@sr#Rq3B}^XGrV1IN}eIWRdYg96upEl5o6TeoK~lJG9lVj!K|glz&$ zYuIr!V{W|>Xb~~}Q0m2GxQSAx;KRvYp?>W(#T6 z_a!naZgJSnYCW{uc8fdmHf;Oe`l)`8Ql42X`Z(yStU;P^x5DBQwnetkMx2AS1Msgo zotoRLn@hV3bj$PJ$_>3lM6Z-r|w6S+2w%S^OhBoOOHgCjBiV?giW*1Z8; zd{|o?pqMkYO8g|N&4DbQ7m2|on+@xr^Mm5lJP<+1BPFlUuS%^g zvl+_hQ%ZGF8#oi(0*);#Q9lmsAgho9NJ_3(^u%)zizID}^qd5^1HDhHiwz;JYYWH3 zeF`arB*I~)f 
zEyML$LFm8^A2DT?wmRuX<3jer;!G0d<;*(`mSZmC^lDw@Jpx^XcSw?_PY0WjidpQ? zWAyiz_(H)Vt+zp8Y2`P%=JuqeDKD%IwDrwK<}_a^UkDd*gbl`f=Gg7Z1v}p?8ay|; z?eA7OiC}y>;7CA-dzSSJyT|WF@+;PS=kdbFp78&~bzyDMcyUhq(&5)Js}d3Ut+EvF zSV_e$jAhM#m0{wZpHUC0AnMwdHO7fA^9F-O5TZTA4RK>Ti?6+$5LLu9){!nF-q!Z< z^*1439>6ehv5gpc2#xp=bRZfKHKIUw6&`C5dWQHO;vZHXszsQl$4>7%B=|Ymirl0Q zo7jZllRXPHKTOt)htjC*@^r7Y;|K9Wdu;OsHox=1a_kPG;o7&H=Cd&r0)gL;fkHy? za+Ji&)f?PH$ODt7oVk?mM?`K8VuPF#Jos)i_Pl$6VVD$Mf+wM(edH5plEt*sBGgPV z;*ttKc^v(~S;f)*y%i*NY=9OH=?SMg^wBt1$b{8g_*g$tSvZz4oBpeBt)L%P4h3giT>@gVq~=Ow<{m}{u?c9nlL{fv z&)lSXdlNx$&b?Jy*>__2OFI7%2h5o-3YDoh> z4i4{7M&gJien;CS9C!)^UwkxRBd=n4l>$pz+tee{zFPT zyAijjSVrkke$W!LD1*y91@98xFnUnN2-VY%uI3~<#djc|>Tzm1@-aJ#c7LP;8cg>p zD#KlaEI>dGym;V{eEdpmWRhINIkYV)8R)L4B)eqWckM2*uy44;6vkR>$<3jRPgnLO zUP=l^2ka*?b(k3lOq_-#hPj0W%_!u4!{x9KX{5!qoMBj@+Z9QVoWjrdti+CyLMVdd zB$anBUs-ikV+fS7VvnciEud8U&ClXJv!$}H)i;~)%upl`zAC!vZy4BjQg)*3;_2c? z@Pazush4Fqhi?0dw4B>}oo#RP%~+x}dB~QEs&{ddO%0KEnyeYos==@CEu2jp&$-!8a-Mkf`3iapn67T4i$HY=Hrh#_(j$c_? 
zd}^whPAeinc=0d)=;op|qFjl>3~!xPQhWVzQKIy6oCBAH2YLCH^NK3u5Tj$S)2cou z)P^cm>z}#r(mT8*JI|Z%F`i>8iSp#+5yR~1or+&KhKtbOu*Up~w75AZzrHRj6_|0{)=IhIPX{%SSFp!A@@mV`kh9hM| zwjM>egjq**9suHm+?%+W_S7pEo%5mD1R01&y(AmPN^@?X3439guq50jC%t#_kb93B zurF(cp^tj@%8rLyaJN3)wiUSEH7IsA<8R0M97G&rRHD>bmAvuDJckhDek+2J1g*Y3 zzRfN26mpU|cyg&DhEbv3&B|&u6+(74VrRnd%>~bJg#D3^0B=8VvUUs zM6Rr+Cz7oN{++M;@`v*n*1gzNADO03_R`ifr%ay_J5Q{(EIZrk&dg8p_O1J3b$O`J zto!M%pp~WDnQ79No*@~2-%`;oiPw^GYn<^E;83outg)77_vy%a)xxhLP zTmqB0i`I6OwxMI6%X}U9?mUUN9LBFp7bZ8VPkSvRPmo`dhoqG+)Jtlvv~^#IgKTjF zY;QAWmZ=(kI!xFc(|^-f68l$BW`BuaE&o}PSL%M% z!NNptUay{XxC+ZY{$4&QO0;8{qAYWTl~oSCk%XF??3XSFsS=*9*vsZ~*??Q_+Hb+p zMMm3)6Edj@g6z$GC*-HjCTvSQKC|kLDy`gHBVorV9=%wh(=u31Xq4{I3HmPX5YeD= z8EsZ2co)sj+2yBss0)A$6JZa`bGcL^ibjZ}mNJ*>`09HUxz+u{;nRtmyi8~ybZfkB zHB{tkxeY$qRoyEHe(OyY3saXeI*_J(XiW2?k?X=Rc4A}o2Evi2B10jW?&vJF;rp<& z0gmu($`j!TK4U$za?~GV zf-+j|B(B?Z5$VOfm<}9HRBw-xKR~hae zq_-z&J(OT6m%P=~{$^-D%art|_s=2m6f_~M_m9=;$TpqWsjBb&(Dg1^(nd%insozG zWUx9uFOg1)ZZ~eXQ+RWGtsT9UcaF+?_+{!PLf|3!)a>FCi?bvsBpK;8$A0$;4+1 zPQ71MyxdycoDWG~L*HE)TWyJERJ~%S{*Sh(Mr}8vp9Cw|lH`&pYo00l>F%3v6R;INa9x_&oP36#fZWJ4quQ~; z_LE->#(M>zjQDSPOxiUaNE?1$sCO;m_S8cc*tGm)qFbfI|JZU1rz&*T>r$}Vbsz=# za(4D<0VTfD6b#i5aU0QA9iujDC)DFANy{%)#58l)nKg7^E}qKz ze|vg8>ya2y%-=ePZLLE!uEhDJ+oGWz2crYO^a+UnnSx0P|L~am{q{(XuvcLD{LLry zH}=bvSI1?36kP%!$(vD0utbwYIyHm_m%^bPCJtzYz6iY^&rz`pHCWZ?@(5GIO{Q3R zp{JMeSho%sSG$sAV#k3(co0+ZHb9<((qtse+F6O-lqi&(d|Jc`cWmr*)~akKmD?)N zE6$MowCDPh49W6_1lOPvVGK1bw!$lNYW62=m$A|ET8W!T_dzNtjw2j4DK2&NsGnuW zq!mmTbz@V)`+Bb(CiXVo8LU zH^?mt+OH~!-hdy@JTE`4^Ef8*u()~?ilV14Nj0JMl-E`I+1T*U8RHk`l(g4Umu z;XJhe+?{`e+-W34xzS`oDm8HL;r2mQUL@F6zQWj9a6p+HGtFsPn%S_GcRQxTCq5I= zQ7BfJUU@IYQ(9~YX?&+%OCdaE>ZRoRfIr#8(AYF!=$8usK5KpCR3+|H=8~p!IAWqI)*MYLnnw zTBBOcwL5>9jfaPaM&{?rhhNdto@@*lr~xouN6O;1JQfo+Yg$s(*UAJ9ftJzWc1j5( zT|OF1DhdmnvXf!qfO5gNcEzop8WnW9p$g{?EyC_KUxa5DDG~CyRg5E)nR?bN-q@u+ zjZy zsGG3a)bX_b(JbfXs2V<$p+&GuFjmezZzVJ0Hq#O^ywAVHCRrE0yHBQ*Bm&(v&2zY) 
zMM$(Z<0Sudr^16yY+qpINY`SgLS}>L+a^&`oyozcCAgC~HMtH4r{$(M+wrb!7winN zcOWM+>91(qO+p*pJ!|R`>;5RIp z#0IoRPHv+td5Ie!kRocS+HyS=IuOLNw^&m>)6==r?ohj^W`BDI?Cy1@*0i7BO1^Qr zDAz&ySWr7w@RT|AS9hCBc0YUVRjMmJv*oXP;zmI&?(0hG$f1pdgnmy>QcV5BmzK+( zf`DI!lC9{OuRaT2>RHoj)ZH)ttQTOnZBco9?P>zA=|4SAZq+1A54_zfktSvt3LX^S z+Wt{pdy>CRoK;glSkZa2;$&jofEnHR^vI?jGnBB~qvo4hGESp=(f_yIzXjZtn>y9T zDW#UMVR8G5aL!kFV#QV#Kk;w)n}(N?E}Uh*WjuKYZQ8^DHHqlzSOOrgO=r$Yzy)H) z1Q#ul6^<9ZGrvpHq|3Xn;FK&t*jU1u_uB9iyGR`Y*^{4`nFnPfjbF+Kr3SPz;@r%J zPQ8}qhXN^^mj!J*k~QCaQ|760FA+x%ic|f6ib#|oTI>bN5xGW!0D(Y%vc2N^=3eF| z7&@zPpveRz{~`MO22=g}>;m@4!w#DZflR?U=zYS(;IjW~0xR(1? z>b1cYEk!ez+?TGC&B={&+6Hwr5fw!`VJ>SpsKA&^maHIP>+2^$)Ujwem+>>+E52EYkpQc3JNzu5J z!-o;KY_N7#lN_}?&BrSe?EQ895tiEvb$eIJlkS`MfBx&P5CRmj7q3n&2_6ndldR3z zsNU!lr)x6?5J&awgY4KTuH=Tp7-hLz27oxZ!@wQ^hk<>tm0ocRMX*+3M}MuRx!9`W z=i7&u)s|pJT>~b_^lz#7sxDA(+qm{hMdyvdZhmCLRSO(7%Nww=3Yx^W!G7&vgr-QGAgG(v_~wGkbk(e z!tWC!YzsiK3bvgJ=DLgbe2K}ii7%(kK7jx)9D$dOQ86K2RA<1DP6AgD1D{obD$x%YHDTwnHC-Db3q6t{^SaB!H&9eOZ2(&a^@=w*IMf%%7COpov-Y`$ z0uzI;icFkT!^t5{li}zqtB-dtiudxAzM=6|A;PiYAxC_F>A+ZEhl-Z7wlfp9${wQ1 zod`wOG5<`eF6Z4P2bwj>iTMQu+72}(A4`cUPiN37ieP1>@2!cVvJzuEIxNDGejb59 zG>ndt#N{_ES=g<8k!Sk5gNsYW8C>?Om&1s)&{oTM2O2mF>$EW8E{XiVwH^NC4nT?{ z?w89$Y@elN#<()`3nS`FKE03#u0-;YmX5&R{&4ULvZ^+;l&un6C@O4Z9eG~=7!f_Y zD<8;a)2^s5Ayf%Tt#ca5_OXO9aWSEUTSYUzqaVQ{_l>8Xx?v&J2XEu6ZSc3>)My{F z=Y8IUI>Kv>vwmbPrKx6Q=1c1MYG4#dxh=T)z1U@g+160tvCIS5Ul-J3_DRU4vwr^m zVOf1-u^{;h@9=5YGoTO!qLgHMFv9_vv1CR;Jen`>E5IbXS9MYv==OGN(7k%VG#qa& zm`Q-mnow^M>c*Mm9{&gIq!DH&W;yvB%Kq|HKk4Ur4-Kx5IkIbB`rO9#GzwT|&f?gD zuD93Q3o=sXqT`;PPEL8jEauYFN}I`zQvD;?$ELDVm<- z&||4DK_>gMu!rmWugZ*9RMxZDy#~I|0^ACWF_EBXIB<05q}C&r%VDM`w@0W(C~N2-HmQ!#gD%5yVPJH?_Vmr>(qp zgrjC&GkNs21`DPyg?H90^1o@q3{`GF(}G@GA5onv;S<07Eu|tdM@Z?4CbG7R-c)S$ zc5UUJ^***TMlIPWEGTU2px9>OEs}DoT|ZPW{v&ZTLC*l}wq~|5+wWW_t-`9<@ zRxpezl<#AF@7^e_LoOH9U8W(m2qTWz)NjEn{!0B77}9OhOE@RPFfmYY8C!PQxRb@{YFTB}^mzt1zA_wbkjPj}Uns~Rj5O`8Z?~hx`mKJ< zr>z9s`ldE_7_Ipx3*p-bW}XQi^dGpk4A@`Lsi@|tF}4qn^) 
zyu~f+X)aIV8ihd)pVkpEPsPUzbA7y?k6JC0dzJtZJY>i4C9-Ax-qpjL{znV$FMOtfV5lk4&iMN hYJLC5;c+^t;zRYG#d4x=nHz^B%iYDtxyBLx@IM-Y5H0`! literal 0 HcmV?d00001 diff --git a/examples/aot/matmul_optimization_guide/fig/flops_step1_baseline.png b/examples/aot/matmul_optimization_guide/fig/flops_step1_baseline.png new file mode 100644 index 0000000000000000000000000000000000000000..7428f65d5b3e637b6f30beae97ebf3d31bcc6f71 GIT binary patch literal 90408 zcmeFZc{tVW+6KHzQi{r)NNF;LQk0BQ5y?Dh7cZQpp=PBfkw`T1 z=VVn$Br0zbiDJ*DjrdK?q?Z`}b<#;r+eyvV%*oZ*!IY$A>||$c>tt!gQJD* zwWEiH1P_by?J;+9vU5BsAYk+FR~)u=xGL~c&-oqRWwYHm9Y+$04qWeBC(WqE?p&x#X|G8MP{-U$|%)E+q`vj6>~F8}}k=0EH4f4dqZ7UdMOvgv_AK`;HNwr$_N z``*ydm2=6e4xGun0g$&)9=<*8AhKIxzGSY&wl^5wa6=cH6s>9%g&YIOPX zdMYX^(Gw>I{I*GW$L;QWCT~5|7(V|!-it)~^X+YIYvSpmPf5Z1PMp{|_xp2ghE`5k zSQv?vTUh8{P;iv>xb>!^N00K)U39j$w|{OwxRFdIhsDNj-L-31cW-Z!#rtQhr5yOH z>r+EP6Xo|6lJOtP7Fl+-KDRf~*48%fzjV=Jz|o-MkJVzS=S06WpLQ;Z)cV5Bae4Vq z%YomAcC;-84@!|wu~BKV+4fb|Rot9IH2QROsI#!Lk`*F(?3|tJZgY5UI(_4^aRYd2EQqwOF6;F4w2`s#y0d4`UcY|5x~WNF-%88t*V5)G6W{$tfsMJ2*HPUAsn$XYR-`R|UH!8WX76HyOd9U3Bh`m;gmSArGcP@>u1}Yy&eQ2~DDVr_z`(%7^fX_m zm13;uj);Rw>q+i2om%tbRci`Pl?|aByf)F2*pc$d$p((-5#6Sz$;kuNJ`|!3Lz{Ms z-%uZ^#J3&k%wS%e8Kt79r!UHO9BJMA=80HItA~@5laZO(#w}a6bYvUsy32Y@rsUe^ zj@AC9hPG5aBv zTy6~8OFgDnY%N{xGS*RcZMDyMWpTzHU#{q0S26K|rP;ChrSYop$B#3dIM~_Q3$40n z_no~bTVVFKqw>s|GaK=I{^hRYU%yn`e13E8@TE%kpk@8-=!gyCV)M-+KJ`antv=uV26PT3fk&^y<6*sp019lV9b0mK65W z3ET8AoW4yPP*UQW_xr3x%+af~v>XzA);ZdF>#>oV3-j1_lkQw&>?}i1x<uu)mAZ}7Q1 zJ}5eYAg%e{d2{af08Z|?3m2HKjbpnimwt=g)#nuvVfOLy=`OgsZTG3`7yCUI7Z=St zQg>7?Po1A0X-jH##LhEomMGj;O_m-ri=?5+{Cre!|#i>CwZ731>rhvz%~vT7O9E4vWxwW`Ro#MZ;@vv>)c? 
z3OfC0r_|BW5fm1Fg}7^eI~vwe%N-!Ae)!2LH@fC%fwPK=cNCV8*G%ey8N2gMw=nT( zCrk?HIyyRrKYSSY>Xoej?c0Mj0b3^i{5c(c*dW(#Kt@|nkNL(6b)LE?e!X1Jl|@d; zD~(sKThqXOzp8WtFhF*?30}T;K>ty z*4PUV1rxOkHgsj_rOZ}Wk(&@`?aAl!kS93j-oAZX7qEqS0;%=#NNeK!x5u^#Nor2X zcW8DU@BdWlh!nXwzo6h)n@w_letxcb8=14pDO|i{ZTXKf&MouG(wxB5CsgN=uAKO$ z@ebi-i005!ZoVzwQtr+SaBLBF@VeKRP^G znw^qzTR!5Tas8d0ByVJs-Ip$18g?)f*`eZAYrm(#Cpl=ax=E7OeR*E!L$*N(;xlbo zLRB^AQbi4R%zpe!1s+Rriq28AV@fM-WTmGN${hJMVPzXvK{g>YKB_UOp5up{60li`vU% zy-^Gg5AS|%zmNEM_ocBC%L=dhSFenIcBJ7;>4q;od-m+|+UknvjcG1?^uXX?(v&l` zGtfB;HfuYAX?%E@M5p zB_+3IIi;^$*)Jm_6BZN0^NW7ht`L;F0UXAMBDQe~hL+J!a}HgIt;Rx4HVafSEGD{s z%r{k#H1GcSF%kI?m2d;;RYSvIT?jMJ+uu*`pKuro?4ekD_>e`v%qi``4fW?Z9s`{j zT0ojqw>haoDBULd*ZKPTHVro^DLoc*J-nWRLe2)MAyq49Hyb-UGG<6%;HFbIrq69M zmrqvx^{JHp(W6J&FRt5%cZoWF+lsS#K6&j(X=&+0LF4tSi=zT6~Ujf{?NNqaNp<+jje z=w(cV^1}D{V^I!@db=gu0|#nvhg43d>J%KrG6f(>TIQJW^gC88i+7bf{qQ|z^`W$V8T z+k1NAPm3h;33*BqGBbx_jIg}%{osfQdg6IZOiYM9Kq--N+quVfS=hQ;kO+g~;^MkW zLy{e`EgfY>Q#lSi@gEAmYkU8`wkeXgElW>?e>T$`l~~J)Q$`-S!szR(Afj-|jcwex zG1ah2!rI2h*vQD+b^OcT{rfkxwYAk%MpafySazhYJ9Ov}K*c_!c^s(Ic6PZVGi>zq zew*nJR=2clV?XJ{HT6PXULMC-H13oer=`fTV{)~(XjGCF81YGa_V1T%WT70Q^!@(k z5rv1x3Z1gq`E%#0gBf`hoM=h&i=#RA08>NXzI_dqT&?cTGl?|oN|?I4&WL>EYk$AD z1~}#1qN6ckjB;-QqP!F!%II zu5B)nJ9Fmpn}4?d?fde~`*XWhq9t9Yt^!WJZUPF%=jUZM2g@uKe&CMC= zRV?p|QG2*Hyt*;_E9lYDt2`ZnZcDTLnpwIB6lx1y$1gek>fVv1S918nOonDwFreh! 
ztq+!0R^%w;3=9ku9}2Dm*kGvm4nVbc&mIym1e5Bw!>EBpR$XWOHqxk9O7t?PGm`sh0;uFCJr~`i9fsWXJaKy9Wyu z)2r%g5~`1kpaDH6C#T?zX>*m_k-<>O`a+BM51w6XdH1fKz%=xSE(ESmqZB%NwDwTE zyD!dQqE3MsrNmWWz&x)tPwP+5cUV|hki1irTc{Bup6`jU<7?G5zmu}y> z#mU8WZs3pK5flLPPtS!;iHo-Z&H%D=c}NmaUuEVf`zrR1J@#gOqR70=jNh)h= zYwC)3z%jBf4Lv+O6dxU-L@-3wP!FSO86$znl!hR4^Idwu!mn4%H+3>Vr^u4=vWdyM z2ft>9e*UCZR#ql(3o7YoS5Ct%nq9%41=?vhdIvQH#>K@ol}-&)g(49gcvp}U6-($}xbh+q>PaFAME^=e;_g!T2 zUtcQpl8Q9GXd(3Kfp~%RcA>Z1uyNyM=Tdu^*U%5wbtC_ZHBP$1sf&?}hty>Kf~ zkQU#w5GDYzW(8H8&)K^JnN4~ zNNyC71d_lvJUo3eDk=)VZU4q?yYV2dxB{aN!OM|by-4j_o1Sy9*D^y=Ugo&Tdmrts#=-ABk zbZeQjRpGVIy2@Xi5om)ZuGBnA;l9xhwVS8s7drs&lDf|{sb1k{wvRkYSLfrV`p#wGH}N}|CR<6fCgmJEfp7xkzxX#KNlrQ-KJ$n6RiI= zPtGm;{(WB|llnl_b-P$uYmnRHJ!T)xo_uytOiavpE7ww9aq$_voZ%;lxeK3WhmvMo zj-CrS`)uOJXng|%AsqEPY$x&?|Fpy$JD=PYaqk|j#LBdqutoa@BHS-liAYH7M$=U5 zznPx;)Nyp`RJ3f{ixBc##c$54mw6#+5fw1Wu*$RMN-+{GXO7xKtFG*Uxjrwdu(?t* zWT*q_YJE9|lEiV4x+MM^twrLQyR3B*9FY}swoS}CBF`He8`nfXKY~u+6`iG0B6{yDjM~Z-?nc&6|_xJ;~eNJOAu>`G8xkqbv%I;J0^A z^eil`EhB)Q(7pzYp#@HJw?+lzTCtGk=vO$s>U}FE6c6sp4qoXY1H=GjJ?aw)-v0f$pSA-$rli^Ez zu)=NrcuQB5M>#Vo%lFS{kpBW~($v%Kc5AF)k3^kDwtK&jk3%2B^}($v>FLw`?kh80 z2{si|Yj3x5K3|H1L)JbF~E8<5*G-0SF@H*W+z7M)b|>TM(MgNFuidg2TlsSD`&f*hwgx3dG zypsyD0nMCZDZ0Rc)s>|h7yHrL=cT1VW;akRzD^9(9{6qOHjQ++*OWJmxmC^7TNThq z|}csEQe1A`9Imj^^sD zu+@jHy(P8+!jWcfHQ#S6cx8J5Db}p{`T5z;{Wd7dQI=JIWY2caG+?m$RHPf*rXi0n zYR+Us`5|RLOHtG#X}vJ;44elKp6QMgGPzan_ol9`&_YdR zbMVB(gv?yDmv|cM$QqKQ9DQNP<8;sFDE{lK%O|{F1YJ$Pm$oG-Daj8k$d+EL8Y5+C z+VWtEGG}{PW8=A42@lDT8q0NJ6$;_nmdJm=204|aA0q>wq7(Y$7=B8sPoxO(d&ey zuW)A!^A7g+C$u{cHP8WK^RP$vp=X=?TDOHHm8DxG=ss^()S|MxuD+h2s;*D;`?Q)F z8~d<}xul+|cZ)~;-D@gD0$&hvqFkv|Y5 zyWY^yut-asdB0cIPhiAlG)k-|1R`lZkPdQYiP`mgb5W{*+XEXqcsX9#<%RV#15HgP zS~gK{pzi40lwdYCHd(_BRcL>lZmv2-NqymyvS~KVP%$**0NN|*NqcJcHs%0p*lsbG zt;n+~-2$SbEW3_dSI2v;rcIPLDszSt?JAhWS14CtDYG@bIywukDi74ob9gQL;EOWH z(ESA954?A8W7yof)`o_Z)}vZ^2xm8qDYOKvksE;ZO6fe@!xX}R~d?R%?G_>J!xog`~ zdHTCE%B#&EgYPT+CVCTOU%3^Vu55Pjbvdf(uc!QxVb}CU1O=r4JMFOdB)9P&f4;Y% 
z-EPxf3k>Apxku5=L@;TZvuGH@llh{4^b}=8-5{Tn$WT_%aZguI=(tP1Q%~VcA%@9X3^Pznou}}d);y9#d-ak|7lsGF5u(uw#X}}dUtXH4{ zyUId6W3}v4XV6M?Hgl*iuf9K!Y#45iKJWhSYuMVL;cheeVP%gkNH#yieS6yb83g^8b338#WI6YpCL&BY_NAtVuo=63P zTo_nM1jrz_G&6Rh#S*+eVWcwA1sWc_j`2jv_NifK)?=1WljFEfI{$nyGOle=5r-B~ zcBfE>N9FQtuWii+0j&EQK%%U{5YF&(vg@$O0#>I+nId-s!Jo@Ga^#4tg$l|1>FzU# zB-!4%@jih_hO<-LbwgI_LpgbyNFvuhc~@0cnJ261jCQ0&n9t^x&3Ijim*7{hBL51B zIUjxaWmlKhw2zdQ7BlMFX@m;aaohEqHPZwIr+H`HHp3_q#` zzb3VeQn=yWgx@zGE?c%PQ>Z94u9cBU+qP}H8)x{QcK4~k6OP{kzeG!{EjupH4~7P0 zFLSlb>$j+Vi65OR$QkbI4GR2nmW?V({2s*#+RR^r)%jr$A9BU8A?fqDN8N=S0j90B z#MTffn3aI|s60Y;{ZjKwOUycqqb}jf2JHSLAI|Js9IJXawZNr z_szF&-l%6gQx8#QT#=R}(0!b^-mpn?26$P5V-_&11T~Jn8qkyL3D@bhLta^qdvZRU z3==tc3JnSxoh@j)gw0z?$Ui~^2`v}GG=Psi-03OqKEIt;Gjn_K0Xq3`9r{n7KFLad z1*-sND!b9@b=B_wQYBx znVOEacJ;@PSo18Eq>WH10?IanG$!)-88w;8>yt9Z{e}f*N^LHP4kK*jqTl%U@3#{Y z_P8$$M}yV!YdW@9xbDrHtzW);sj>csHzjD7`7n(K%YIkTZaR&AP%@4OpA}eqdmG0+ zoEe7sy+a=hu8O+;W+PZg#&NmVGrvBnJ)B%=kzDIX`cqh%LzdX|l9HZ2;)s%?1F%I~ z@w1JzCiQn-f(Z){up^)>@QzjHtcA{-FRl&_Jkzs?xj}RoGJDKdmlwKA9S>P{rYAeC z_4gaSc`TybeH^X+Q~3ztu7S&8%CC71&u4Sx<>isbmeVQ}ctDg@t;{}7%|f%uDpvJu z1@K23?BdAzE>K=3U>?Yec|X588XV}scz1NlRUSB$g}I&sF?f* zt9=<~K0Yseo#P%^ac^#{_jV+&7EcEWQ6&o&6eZpVY>aWW_qvD2tfI~wOn3(RgolS3 z%2Ev;$bR$#|G;r%pG^JI{Kou!ShiPaaR$c6x9u5s>q}KhT0c68|Ae5WuP@DJwH`NZ zVtDxQVQRA8;l?o`+C*yNj%}f>yKZ>gpJEYD=GHW@Q;$ zTI!~EaRo2)=z#x6z-<5*^lGpnRIbF-%#0I_(%!v$MeX`YtRmO$pl3;2cFh!o-T|Oo zjaS+K?BHfQ>2z-TgelNCbwv3CokH^+i*w_lpoyM>L7p0V4U*Z!$KpvS+X-Ks+ReSt z4HecZnTF6=GpZgHA-Xd6$$+MY^d!h+bP!f+~wa?IsZWZ!i5joeQ0fDXOe&Y`qi3zp84(*!w4cHqI3lJV1Kq58urn384v#x z$e$Ykia&#*VNfqP0DP3M$o$!?BUD_OEIR@ry#!bfu-CQ+ z>Qoq0r1`0fei6)%0RmWIwU0ea(aJc7EMLZ_wjN;FHro)Y!ya6xD;P>*^?h z>eUwts==<=fmbED{TPYn(^368E=tYGapn8N;;%+fKMHxg?ouRN5BR<1G7>{iRF04~ z9h#+v#M8HBZ45mZlG{C~wz7Cf3m9HTfmE;!4Gff$mG!^q_qwBl0X*v&$a<3>uf|I& zEB~?W1S=tnS{hI$16W>nXXnx2ZcfO)aXXx_H)4{j^F~ID0nbjcOh%3rhl+ zRN}mPzgRjbIk9camRdCZ{$tC|1qzSVc^H$Ucz*Ote?b&OWRuu?G2f(tcI)m_+QtVi 
z$!S7lBzQI?z+D8SM#3aq1p&`aTJ;w5gNO*(CNZC|^=5i37)v-A0>L3r+d#76vODhL z*<@K}VL3a2Ph+T@2Q6Y&nw7}Qu?%WH)tu7WwEBCQwwrr4AdJB#QkCZlE81kn<1?Z= z!VlUJN`@yJ)hY+jk;uYaHen?|0K2^5!^!l~IgFCvTKCQ`y`AtYF-7=+fR?E-2Gr70$CMC0^*<-9o<%F-nO2gxQONtfB(W`iAi#G-o$5V zbaXTctmZ=rkFuJp4}G7eq?n+Gs+on{!l3!X@_t2y`1H(7KwtU6f@>;2F4Gusw(~5! zeEv*z+qycA`IKkRI8U9@pQdMGx`oQxkA8v(Z%wc=C+j;q86j13O~fKoy+o6+>Fn7C z&YXBKX-MWq$T$;|lP^(z)PI(o1f04tJ+jvt4v!4=hzV%pHETB3)~zU{oS`boUjmAU zI2!xPU6m8BT`@9Bn2N1Z1C z|MT&Y4)<#Y&ENmnwvwB9ozNpRmBGcRX~8(~j5idoTzSeB5O#;>sCft-BVz!xi(pN2 zcnvBQs~C9Gf>#fmmY&yNlN_EbjazHp1aF6~hT>n!=prhs>FvKtDM+6Votpg|%<{ntv8gT`oqK`Fy+Vf$d8jPZp(Pp*Ld{Q5Njk$b zmho0YKW6#zrB(fft9;Dga5Ts}W99OSit3O;kxlhSJPpd6d{R;lfxrs^LHreELoQqO z#^N-0ZV)t(hvu>D}Y zF$`1+@$~pASftFFH%wALsK?5PvM|!Yn$52yeVuf}=!Kd6U0-E>* zQ0)V0Bf$EA-TQ1o;5WW`vmtD*G73kB=-i>u6a4nquV1&bvE6|`hf`2+JERXS?s6IQ zp-lX`PlXHa9U>f5W+bYoN#Et;X-^(@YQla>gp%wP+(>!Nav&YUjBHWLTI?3;IRQ?Q$x@bUAb_vU=7-U zV*+}ddW^1I@y9j>^#G0r95yK115p!ibOvPAwJ$F=dwF@mPa~JQ^Ppi!_Wm@+f`S4e zU~d%z5wzFVy~SDjo@vp2;O`kKwLiXaJ9^x@J0W?u1ylSe<6!AEK>wZ#%E|(>rGbGR zFNwgM5~Y1pYi~4+KJ>ULy#(l=XNC6)U*L;|sFwGYIiJ@aOQi9Q##9zpyO)IfJP3p_#%MBW?gIYz_?JVCFJFb2lP_H$qPVa1lmIPL^7J+RC9esXbVY756GQKg8EgWap!oPopGYbfU zmwL4ry)@)-Lio2|TlM(#+@9#TASFHAqc|mh{(LP&5Fv~9r!s~mU`+oq=|2*HoXK6L z7EO}RV5OncJfXHwPZHc=5xH!wspKhje~m72zu~6y`{`g!Oa!E#2jz?R*iTrFK<;f% zW>>il%mVJom(Y)qNJMYW^1X|f48=0@ihqaa=-6ri;nfLP12kuzAC@sQdp2_1)pbth z{S47HPA_@R{bom(;R{hP0pUTUaXd;epo94egi4m}|o z0b);r0o}4x<-R}%{)|`iC!f+Nwx8fIb>GqWadx6-bNaMA(zY)^qjwAQVSq-GH1EA@_vnFHZrv!%5eY)*z zjSKKu4KsUBsx42IF5kX=N$S}qXsr-6A=w4U?W(1kIwv57!T#WHdD0wQ>YE#pexryB;Zl`(2L^%&vlDVv2y!5-a?aW4!NL<6+A zcsK`EXqf(P1o#YXlUzh58 zUBc!GQrcyPxTX zKw#~oBMW);wK-fCy8P@i=_Y?cIrv4w(Zy>_M@P4viAnRTq!G-1bTsM!q$=1cx+evE zXl{(rUXFlhA8B3z&l+N875$*J^u&u7FYEwANo&!+)1eoWSEf={&}N>-8ibrGdHi@6 z(WHY&-HQqi;KkHA@Bo$(qVXlnC~$jAJt$7iB%U2)EGbNDDT5%G|D1(xVytPb6PDAr zH=bHCf?!~voDRZ6O7ezo$BzBwn;jKI3lZ^I4K~ZP>fRv=9Ye2`qV|)fQ`k`L5;{5zg)-1$Nm9nu7rQO*`ey2t 
zTwpMY$a<#XeCC?lWyBXjZowmkt~#c56$rVewRJnpXN$rVh#wLKV3f2R+q^2FKG0S-om)Rm}T^wF+UpA*>QT z<3PVmnvt+ZBmwcrRmKgCHXS;6F!-L(@#C5&lH^xOcG5q})0T}14xtH*42S1@V(Psb zG;Ru~CaN5UgU!nk*NoT9?!!g{PCNvJu!{N3g{kEFocEBZjQh6@(M~a3R5(`80-QC>}EWAi{F;PN7Ldh=taO4vT<2iPaL6iVQ0UgKEr>3Vn zuFO5);i_W@({el{xge|;umILwt{1VbmSd9q1`zV!ubh+a;RIN0dz*RkB7=fN5X&p(qqbxsEW)|m$)Md{`;pl#aHh(Ft5Zpi_F=3)Vy_1TX9vDS1CI|oUvd@v{BGoXAxb5 znTR^$R7gIE<@E(e&6}`L!Y4~pMF|3aosq^t;b!yEnWu{8=BegWz9AtYx+azA6*jFi zl3JKQwKmEPo);)xb7A=i;7SijpbS(K znmzi0koIfqs^MH~ezBX#LMNR@_E>*k6Svwo#+)F!C8tm=Y3BE5J>kJM@Byq!by`tF zqrX!B)ZJZ<)=d`jE6f~Plj(#FJgZZHV8?5iC1I^-?HQGh05y_l(|f#a{4`JGWzg+p z0cqa#5?z1P{lDK>6)00uR4kkdIP%vkZS)c<^DfAc*^nXOMOWA67BF;uAvl#|x^QD8 zW=}>7uiC|nij{poKOBoT^x&>)pYS+_tOT2yT*oXF0i&h|+%g}wEJ_0OFxh0NiRi4J zur@VK4mupvNcB|1Lp)9F3-5SN3QYQKOIiSVMcunpsVE3_5t)bCL#hJ7k9J?}*B^N7 z^$PRPD=X({r-?UHu(MD0I$1n5YwPR`AdnpUjvadyLSww@6C)!T(GTZ@k7C@xeR;A0 z%;D`+CR25F2Fw{Ggj=Pa{z&R;Z&#fTkkV;hyXiLbQ^Tj|rZBj>ZS)5fw--A)I1ui5 z)$&UgM~)t)x5fZQGaX_wq1qU?k>$t&BtSXcD`PRiuReWZ z0}45dIucM!7^;z-H&Ii!UvBBB{ab9mkDILiRUME(DWT&3vSZVBtM@UL^NbvYJzQNv z)6-DH>Osb7&vq$kAFhQw|FR#qhUaop2ZP#)L6bY|VwxQ;tIKYn?vvgeDtq9{I5wl+ zDjRhF&fCZ_!!MVVr)DDmcty($?^2VZ5C3pNR5StH8qTz&|8W!?LRD<-?9>?5KXKy3 zU%DMQYBRX+PFbzy*kz9H_IW{yp|{_@lsRYQdVsSexG{`Vkr~bt;+`(Il{x^gKU%E7 z_H6;TZ)fP=JflXW8}&3*N-(lAdf@+#d3tqXv}f9$Lq5I`IdLHQ7wZ&nl<82PR)@zjJmqq|O_qBB zxec-$*H>#9dIX?ZR_>N?{T=(kN|Lz&maMQxkAhIYx7a{xt{3TZ_rt1kD^7=lF6d(v zW4|f)bzr~;?>~r+gXA3+wq2H>^R~?R1nCaYIWm(FLPLi(HFJJt7e= z=_LW+n$x~;1=Z3Ydo`f3Fuz(YWK;JN+>`oY#mC2$gv);*NVSWtAN73K z!I;5F=rfqwpIvy89(0(g85V;O13%6~ht4K`ISVWx+{RA{jfCto3sx^ID(act%KfJ+ zJa?8APuOfNhMf?U$F*h1X*HqlJ#qQ4Qyy)b#ko8`bAw3Q+k6RE>4z-dPOGR}kX?g| zJLL~?RoPl>-J{TprzALU<8q)1>MD9uqBF-#Mo_z|2ee0cs9h|a3iNaH@(7>=8iuFP z)YOy^fw0$Pm7>=nF zhch3AZ7Cb9ea#w?1<>hnhRQ1`)!A099JBSY&TZUMQ%W65hf_-EPcHp*l}A>$^bH1)%PB}QLfxRHxt zuDA@NDGyC7CJPCBDY3}pS>QrOQLzlZSW)}+gfRi74ZhUG4|%*=*$i(}I{-RCr5yV# zC%xLC#m&tv3fn;HZZmkv+F;n_Y;jzgwJBPE2tBx{+tkz4Ea9sn%+KIvGyjnJXd-Eh 
z(H`g{Z{&Q~GWSP?^T=5j;XpISMBJZbPbHZxmz{krAnUBWtU4=3$6nbReMCxhxI@t#92d76 z6f2kY_n{#pr2liEUnU*`=Lfvz2SXxY$2bDL-WNvHqD5y=C47dJ`W>^w(S}UVt6(}X z-}(SEP+%>Y9yy%C#0LZfrlKPF1v4_fnG%JR)i&z(irmEKR2JHS!0+;S6 zODt>v+`=@Vf=|(&r>fl9mnwpDDi^6Sc}Vy9I1mwNyl7p?a_7;5i`A8Q8N9KZkj(aR z|G_9EF?WVyby`Y_1ZG#W|4ltS4#YGj_jG&AtDyG&Z{NQC)r)hDDegx^FwQC~DEP+5b0FdHu>Jy#mfI*?4H!mQ+3?!y;~WH; z8oxgoDh6mGnmw%my4Q9$>MPSC@63H<~ySr&FitcsY_1Pk<6te+*s zB}o62aceM#1%SB6#GN;BFi-CN28$VT?#^4Z>^w0;Bfo!N8aajmRN0w$pBK}8`_A0C z7(=TI`VD$2fV9BG)bRN>w<)Su@`{|{M$o#+F{4snuB0&XYMQ5MrJu(zlAGewpM4%~ z@R^gpUqr`q<_?Pwq>O;gn^9yj61LHEZN)WF>Fil5^d$kZoGLd~QBeqU%IRBM3B`@^ z%a*D0@HgU*1RoVrxdmmv4yI=67aPMw(rN)tI=X;z29Wp^tR$;2ZKn22QDwe-ty9a8 zQM+7k7n*z7VODkSXt`l3D-yj*kW#!vg?2+02o2)|uv-q8PQDB)mrO6pN61D3Z-i(H(T_31F!_Lx&gUda|lBSX-f4;BfLGqs`;s$m^lHlR0sCU_}aC5@JvnM zVQf7Cq$Hr=m7|EU7=pmScaQ-B1{_G$E#l$h;|s&29r_2%qipJG$-yBchC8OGsn>p1 zp)NjmU&uG_T!SACefBn}y<{W=%uk(>ky(!_u_rx}A3`Qu2Y^NE=!R zf`5J)|3?E#UX?JU16X3}4CB883b)Wsg=c?6>nH}USklK66*!omzN&rqB6LhfV8IAd#Z(rfhv*Y*~G ztQxS_k-We?fX#9Mw7EC}ZPW=c%|MwbZykgpD`@-Wg~Oi)ETHAzW{Eu#6SE4V8gPiP zPmlL;^tu0`xpU`^LdIiXaQwXNq#}#=8&MWR5#tAbHtT%rl*Jh2)n>Wiv$rCPNwEj&)qNx`vUo^%#5!m<7e0=1yyCYl}%?RuoyaAI-#H5G5B;uem_cxrACQ%kXK_)~$RI zqs5k;)WB!QE5Q9UM+iZTklbS5Vz2+=M?ahDaW`4ELL zT$cK2y5Pol*WAhx70Afn@{#zp|Zz%qPD@S*z>=30L z5)GR|A1Vaabqhg(A@=Ofk$Wqg+)MD zLJ!cLIY9%88FS{pFt<-I`Y!^m(|rDaM>Be8i`p% z%j#Z~rsX6%><7EEjYC5A`gKuAz9bSRN9-INWI&l-1ba$c{egi7VhjO$8UV`xY;rR2 zCE;bc1qG+Ex6oVF%?*NrhtT#AuLFM%Uuq!Y7zV#;6gpH#X?UO#l4=*u1%Ps3METnP zRR3eeO%bZ$y6uQ*&`O*~j;Kyc{3UG7TN7k6b~X7z=R^UUfTf8@ekc*G@8A2~y-S0c zZ$B*AY4Aq9pZPjanEW50-d_;)SS(a9G!MHmuP?xX@r%2JueDp^@9ICN-=VPqWR5-v zI7mZ`A$W6CmPx=7n9-!+&O<%Ll^O@AWeH~^$}PHhu(xM$$p~?uhNUI7m6cUTmL4-~ z4jtX2+;Cpvri5z1YvN&vYer&XU=*OndtegOrPO_F_X&p$$a;5kLHJ*|a5$|W?T;Tc z=6y$w(4m1rfXaaMX+vYlvTGOfXl%Zs8r-$7ph97|><+Pfct?;baJorhH-Z)0vhQKw z0nGMxSPsqsZ90`S<=F^>rXl zPNz1%$LJZ*sdTeArz+-z4x*jB2M&tlR?xFFGnz7dX-8z-@^~*MBw>Y2in>W$eZxr8 z;3J#<2o1UC(wGRz%~>6G>zv`;J2b38J`kchF_}nuh@tiK8`PzSZOdqZVYwfQ+LLp? 
z!uQ~az$nFAJ&l8rVDrm99I=c@)v#1@P+(v+>Lj>c&Bv$Q^8fW|aL>r5mx7C*Si{J) zdEOz}$Xaxm{{1;XSpZfE!@E`N|MgG=jY^~^(jLh5SiBDop@AI{UQ(FD=6Xg$R|?vSU-c=VBT@Up;Raz#hk@W)_mMF~C(F zK8)yGWYwh!^BsUGY06Sw0%(F*!Y6zreH@g5^L*r{5nuwD#S`$sVklA#f6K{94o2-D z*9^y7qhGwvKkJ|BKzKB8tpJGxafz6>#D~_^u~rky+z5>z0HrQOd-&Ub)>O&k!ZODk?>h3U^Ru7$t-&rkaPhwEeT zHvjoay-yhb{`=?u+izOmyHKSLVJa*_LK5#t>$s3Hglhc=~ zh4*Y$bi`Wku+Y@Gdu%5udqaDBJDe$*W7qzyfh)yDj~74r>?JR zcJiM;HZ({r&9j)F(bSB}T|Bn-aMYtF&upQ(tMGqb@c;5!e9yiX73%{LRkRT}U8ui> z-H+$Aa7suMb2)PETMI)&eOxBzay<*;^vd%u+~@tGVP6k}h<4gcql zgC+@hy>Jd5OmQ6~r2qQ)zr9WB|Nb7Yy00+ZLuZbh+eS-!D?Z+kd-lKL?w?RCB4+!$ zx|k&-BzP!&Fq8)xfC7Z=MjvE{fBh)z0E;9m`}8<41uodwd-7yij*YC zG@P{%49Q?mI})Lgd`?kO7S(p-5V8^=IDN$#-I8nkY=^?n!3KQ?_dyW_2Z~*8ak1un z74F)SnD3|H|8n}@QBQ92fq2i!$iTS4Y$BQ`gm9T+=rk}mSjNezaQnzz6sI$1Zq=+F zKYpBWoK3L4=2)hjTHioiK&OgRZYZ&O2Xv_1_Vy9!2bts*1F<5oznHYAGOsFNU?zUr}r&|b%3~eVq_LM3G=c{xSs}AEKE8DOVlSg$~w#} zEz^YqtwDScDnAeec?DP4*n{!K9dn!3jRdou%ifg1R*;{MZZm6pW8_LEuI9oI*#!Pk z!t|TwPNO5G7l-G~B_EeWz^F)R&3Y}!ZzT~t@G@Epp1I^)Z!*t1dCa%r3K|}9SHyB2 z#siW|l9Q7+`{?Nyi6XASWN3A30so?P4Gal6?c(ARJ}0kp(o*UL=Lslg;H(w=Mvfdc zdvkFU3A4;KL{ga0#%FnDWoD`?sJ;EK?cYG;i-UtI-Yp;?5gZWp5);$8I1yXoMj$!jat(s9(cQM+$|}+Xdj_{KL;Dtj z(;}!_%IjP7Y^X^D43!~ufBAALYZY*p+lB+=ExdOZahaF#a5m0pdtqbD#g1{E&V~$8 z1UOu}nZo_Nw(unrj2pOB9Q^GZ90D8e3{y0sA;497nm2YktEuwp-NhHV3+mKAARx^> z4w;9H=7d~~eq{jqj)Gq!Yz{0(&9+)a(*JXockHN0`I}X6F^(WHAr>3UmS_X_1%OfD zc594SZ9}~HV+YIm&t+VjoaAgHqqB?Kz zNVwVWKvBW2%RANWYZ=+> zxrXMHC+qOP@fkVT?{AK$q@p5B7NGi>dx#4Lpt&%C>t|lCj~gHuF|mm*p@7Z+@wyEz9^J-PtnRRc1w`<$rVROEziKcij1DsvX!U|Wot?b_)0&?; zkH+)Qa9cufQ`PO#o?TnENIyWg8B_cW@6MOinf2h=PR}9R$Bm;xbsyf5k6*rtC+k2Uwfwy?>Iv&>~J%o@+UT5S? 
z@%rb}qK&HpVavEdKxGN61UYvcKJ&kdhn}M1)}BI(eU`XU$s4zj5z`XjeYs=LM)uoy zHnksq3{xg1agn`v?CP=u0_4J_4NRp6rmlgACr$l4k}-8Ey5bD(>GC(>6*CPjhYJdv zziz(}6hSabGJ>K)LMlI~7vO#-gD8^e7nP8>WKmrz2$vN$=C(SI9C~fADP$)}K$pEN zr>=bxCB%A5gWLZDcLJN32)rW~mRuHTXkS9GTC5+Z9{kj-2NRwcXKU}iwleon(#zAw zBGXjzpLS3Nmsv2p+nZ69PP?F6>b;njhsznzN@j*J=}3?=GctHL*qE+;+cw0bXG}vv zanQ_={D1g*?|82F|Ns9I<$yv2()mEK4)xz3jF|T zNeXrk9jpheOK&s&A9a`F-9auLwvIVk=f&~6TQ+ZQq@K$4vQUPP8>fqLolc2^eNNvb zGIS9SK)z{gbr?DR{!%FsaA2nQxm@vGVxp+L-4M7mwy%Bdq@H`` z;q@uKscvc2Zu*LMgK}=_uTb(bky-`4K#+trFHOY-rfbyT_pe@6CtIM)IqFONjk0@c zZ*%Vc7hwa_RJR1GZLPTNLHW#haIPbad02sQ;@9rc>D$7)Zp$|4+=(k`6rE1Hs};?V zuz@4Pe$%gDLPMu@a9nVo%U#p$8wg;fwrDeh9xCnZ);G(j+8A!STLuW1ZC5hDBz)w7MVuT8`K?S(yS6i~Xb%@JuEyBd7;w_K#K@q=^}NoNZx+-} zo^*Ih_-j7Uhy~j<+}GA>A}oIu1%F;CNPrF!yo)WJbv~jr4?3Y&ZTrc#_V#PXxpEq* zY-%_&+;;U_(p3z!G}Wt2DcV^X2`zObY<$P`umkm#TD{6lZl$VivrP{~S(rX`3YD!9P_R z#-PxS-S+-(BYjyRwveyOM8bo@>?1F*v4vSvk@j`%nz=A`-MVn?*KgijLP!mES z)fqtkq$o)C)mh)-MR+Zdbu`B+tj;{539$>ZzP5j^ZN$KR2mSs12Ng9|B&5*L?!S;@ z1f{s0w(bf0<4r{>NF6NFYfj}ljGQFzqK@aH2L5o?_uOd`1lb*8DDi{I zX+j-4rt-7d=@B_^FYS!6ph{b|Y11a@U&VED#%Tx0wA=n`)`&Eb-fUl1A}B{qfdYVZ zT?6G^iY+Q8EQZI2*HWmof&rFe= zgXm!3p?`YzM2LBTM^Ef#ek8*NW=Qt;`;Tice6A}dH7fnpb>x));*<*$P789!*^6+# zB}`>cg;NU;HSF|FOMxK^ZFOqPmNDDZECFD`cEUh+`tkrMsX2!`Bb}rj#_ij&!~XRW zj2R*`O0Z~pdej7RK&qKyB}SV?N3$(#3Nl(@lhePZy(}njj9e~! 
zqI?;QQav+6_-`uPjbjQ53QnV5nt$32*2CaP=*pz7S8A$94H|FsgmTDJ9Jw0AgsGAP z@vigNxy;Evlzo67gU-!{O`1g5w3M!7Sazs$hc^Xy*$rhii&>M-_2bTtm zw-eE*fr8fj22OCH4Q(Y`z~JzK&G;8ATo{v5nx1KW^zh+UUAoj)gkDE6KvhWF`w=Qt zsc4195Thj7tkDR?b1FRh(|iA$46=5Svo&$2_%}a9jd`yH84Wecb2%J%k=lQGP2&)= z(`=irE*F%4@#So32Bw7QUjkwPV{58C}m@#x71k!6=7sK^P7@{&7(U(IS)NgT-`O_Zb zeH|JV&3SYo+G8*4WmQte^RSJORQ1Cj1l)an7uAst72>-XL!&gF89$L93p#*InOxuI zb$@^^G*tujl$$qiYW~E3HOkA8?dTlsRhrjXxlHvLjWE^7)hJ1T;;FYZD&HnVc=iwa zEOaRxA~Lo6aiuG9f96_30$V(fT2o$sGTT%HOd_2jtk#HL_xWVo9npx>LlMXLJB_v9 z(2FKG)Dbhv&DNgg=le@X9Xs;QqQ5InmhddqM>192V}o{bl*Gl+H6NDoo3jH&;2HwL zw{PFHmprDrLrJy~XYUmNZ)71_vJ-x*N_vrTzW!65EmbxN&_ZUPdU=b>**g@CGHI< zbZ0!eUEHI>kaMA~ zt77z(0J+X`;=oIjo4}spPpSD+TptOOyZU$;2)ldIs&9Vz^g*1led#TSs-|2ef! z4RV^WS4bExBjYi_v{=mbDfOoC*34yaI@6Z|ujItP?sHNDUE~eW-8daB3U>dPO zRS!DT6%eiXVnQFn?HpgP7l%}@m=5AW2v+}yd1!09FSJA8bZq>fGiZy)kxFKsS5Q!! zOuzU;dF`Zr{cQo9>Y<=|SVYsSLGut@AL3GJX5=it zj6R_no5+)LH63?`@8ZoXZk|qa0>5np{=ie{#)L)Jet`Bi)pi5lKdeK$l(-*%M5t(( zXvT;Jk;}w1q_as^cO5%HF{e$2?XJP)i+mdr*d=qW>S!ZO$Hg_@yv`s9Rjx2oDJ9q- zNEN@pOBPCaJ#NeE$BBJ8yMi8y5n2x@4o8w#?YKQ_F;eM{tzItpDo+R(vm6B0{TcS#uXx@1{v;Zo-(1k%Gj}Y&Ro>0cy@>NK@Zkp&!0Y3 zLr*qm(JHQeL>r=y6uDVVf4(W$(sIDC#f#&@3#gPzUpK3*n#^xN07Ai;lGqd%hs^E-;!Hn!k9bQ$({T2WnZT@=XuA%VRGr+adEoAg%yF5|8($!@( zIZH(su*~#6($>3IU%3tP5mQKxa^#{MOjFAo_JmYG>- zT(JX!x=J)o;d3{X>t>A!!2nyE>nN)t(|}kZw#Ue>Qh2p__*_fA2k7yqn5YEP%nKxo zOgW8cZ8!_H3s#=Z`!CZao*BL?tc;7*PE}(ujGzrE72%KCi!jrNyy3EC%cLNXe?YD! zs4iWH?4-fy0!(1ZMRkV+$Fltmp)nE5ET;Qc+Cssw14SdUw}(FuOnjH|6ng#IqfVFd zk7T_h?RMY#cD-2oqQ9f(Lrnof;FMFIf`!+E$AA6se76e=3nD#oSuZ;FzW|-jNT|jB z6a-`y>+UE)MXZshPMw-{AqElb%2liE9uL5)kk&{yM_C8*^+_VW&Suc?0Zh+lkJ;O! zHhU{m6|dGA^kB{WVsi^uOk4MH&x4>6@u!AK9!fIxk%qKLPj|)(eEac3c=fPW!U%9< z^4`6>)~cdZWZgyeCh2%~O2dWo&qG`KryygBz9$$rG~gDE<`^o-#fw}R?){mxtb*3? 
zmWn);bmD}u$6~|vU%kN_S#OV?YG}|DhApmQ~U_T_O4tX+@R4L9$1^NTF_TBP5UtSlQ1f zWgH$MR-JVfP`NBmC0W)feeDwKc+@j+YFvZc8B4b2z`S2-PX8g$?rUKbBI~XA@uod} z&%^FPZ}k;&S#n4S{eu(%p1yt^p0R?e4B2nTbDUKY=z7h2R@FD*3)Q#Vv=M8OPcBL9g+xT&xpKn9#7Tfz!LeJW5<0slqh2A z#W~XMT(@r{%WufxCK7X zMdy!=m-sMZTg&j1(dGL+R1bA-C?R*o3-G$ro#ZDCwq2O-rlJsqmn%UB$-s z7sr91zHrh&;LU;F9y^13Eh{3XY=W`Rx2*9#g(03duC={S! zT~kN07Sb8KQ4(Tja22;1s!BzRGXlCZtRfAOOmP za`CV}r`m~n1`)x*!FsEbx_%m%$Dg{s?^to=`N2mE>TGXJtQU20Y3a|ozkoO(DShN4 zQ_%de?1FvDO2557drN!sL=iC0zGwCNJbp_x@mdRuO~ihH{?QbUnEH(ySLf-AAtD?E zyzSX`g-xyfPtyw^$=*GbfZh8ne2>IO-rU+^6h>eQNXv0Xk;AY?RqJSKHrT9R{$cd! zUiei8s;&)ZT3wMTAF%t#e*ZzFCe(@?%%ZGj=c-fn^+>?qr$3z!VXt~fL2m^fAfkUH z)FKe;v8TtJ0FqD%|I#!i{r-gJO^=3$etHi$!Rp}nse;17%}fuOb$Ylgn!_F6R-2FZ z#ZI$P2-M>Lj%QBR-?FTNhE`k9{Y4LSW7si{C5;Hohby5 zz`26NT3n!?#y}`cK}{-Ho>$VKHGk@p&Y=OV81g`a3&j}?1=eN?+!-JD?)+QjtF-*z z29542rDf1}8U$v4(mOc-IhNFw=Dp_+X_JFFcRfV$$ioA~KFNJAZ-RBiN9cl0j$6VL zm>DJ&M(*+I7(VMEbor_bzkD&sL%DKUITe6lth%!J^UF=cZi)kelFVUHhd#(uNp)~- zQddCRhqk?|ALIa!b{Sdd>H?9ybN#o=&zH)JHZVToYj2G^ijoX@^rk>me4@Lh)FE~p z5`LdtG=rhoURYN2bg<$}Q%z>BOTn4n>kJNw3mW$}C5Ge}Mn|XJ&`LBe*SpP4SFs;R zSwzO*C}Oa8;TF(D7Kim-Wal!c$f_ID*qTtzP<;7JhGvXP%2pJq;EWpOzka<%%|bQn z@4z7g&UCF@Zh6B4KjrCHejV;#&3D7r{rfjTBed?=u_ofmOFJj8=y>4~`d`5n>`4D9 zh5Sjfb6Jz3yhg&9L9OrUuI<~kpOyjCEl=(pkhhoO(0rvAL!Zdl4W10h+()K6XHiJX zN1VM#xT)j%Q{4#_I}_hi7yh5ZrZ$F#BXb=)IP|)k)#BojXe3gii!v@Pt^A`XvyfwT zOLsf=ui{Z(ns?|P5#0p%04YknN5)AP8Y{9VplzmN%^{<)uom(o>;d^roUb6eFfM3M4uQLrD9ysr{PIf_4 z{(fx6#`Q3IPs=p2DRyLKZTE9E8vE*FXg@w^_UrHeug__@Cd%;&pRxb{@fm~v;xk5! zZRZf-D^wA&v^qKjF%}e|LG&$W7Z(izl9EiTtp7r&RR;5@DbR?=zuY5GD+;oa@nt&= zbg+jnF#g|*%Wgs~#qcx>L+lrUVDg%Xi&stE&q+HxeEJ@r?9NK-;AI@Q(Ll`R@n<`Hs(6xaYy)tz>PAtR2O8qo0#FkA|aqrLCi1zkWkv z>48&I1SZ2{-JVn4U)NTV&e|&%`&%z*z}yf{gXeV8iH%KtZSBJam3UdThwb{*QeK^1 zSTI3sCAk*{8b0)rZX#J@vP_4E3)#>$l=oa~?r;z554j7vC2Y)Tb4c`qFxLQjtG! zkAKP$N(E%c`_c7-s?foU=BYXQZbhLh2;%?td=*}({EP-I&E_l$P7tqFBp`}{h*mt+ zPBS0I|2OyeBkFcV5q&sC8Stdc!4tYv*h*6H)oa$MEB!IT<68#%(!h)ZaJQ09B<8q? 
zlMxB|FzSckSal(*Ku_?kVG9qgPfzDAdc4WhcU}t45OTJWNe{FlqG4ii{JM|^S zi?kwdP~YrO)fe7Re2^Z-FRCl@El!wiSQQ|!m%(;YK7s@YIdo`Qh!=Gv)NJddF2gZ_ zVrwO}MHz*IP&OD&c2ClwAujL1{xWInQ~_aETC2)WkjiTaYFs6`Y%*Dg48@Ax%#4bE zaNQZ{=}6Wi2ChS1wK6yf2_{o${+d3$ibAp_b$8Mmqpuq)Z!!N3pD)IL2ByLX5^?+s z#Lo~AJK??R(Mw3w<5-|J8q5P=K6OK<|`=2n6ITDo@4b9Y)mr zzkq(wNMO~re-e?s@)C_=a4kM1fcs6Ccl|2fI;ivG$B#v#PVCz@aLV|ip5?fSuxy-q zKr-)$H9*?TaMra<1tJ6w%H<$XxCyF?fa-IO=v_{DHEL3IN8M8Bgp@~*9JS1(zGxs! z#o>993==DrU|{|0+}M*9P>6eNd#aK_E>G}hZU@hC(r-ZOA+C=y{Df5Z+#{n3gb_MO zM_a^xyt6^mMlafi+DKcTblm1XYE|(w2HRo5AQ#1a5dmagE>lmO2M|(JNIiVe|5o-I z?B^^LxX2b+w`tQ*ln(^L)pPETnKY?6v*mxi6w9O z;bzw~nPks^*niC9edEc?{3tC4ozy``_M*Qy(!pknp^lq zn&0Q@KJ;h{y8I{6Zq(GNdUUlP_6$1FY1iJsK)pUMu@D)A%hQlKj4h~9r;h#i3mV5C z9=8e8RLoHrJb2I)x#B<)hbgmWY4O0FCzgzb(DQ%dMQ@N2j4Y04?2q_5(&41c)$5pe z1iCCUPkd)&P+KU4oJpO~v*5>ZUtAe@u75WTniw=MRFk2xD`hF{AVr@B9h9;V3?)-Y<~K4d-~8JG z0yNckw*9u3=}X5h2Wfb?czN=}Cr|kh^i({;!>?3h{CnIFnCR6(OA%LRBzyJL6Hud| z$X$cLgXvQU%0e^Gqi8!Ze?Y(w$7=)Zp_qqqw-K#XQ>Y!MC6$$Gk%0Lx5WyW3ner*~ z&z8(;;I39t&5%%+L5fhYqih?cKOdr_r)N`JqDBn!=A@)dJ!jBfTwAtv^Q_4IMjQ)P zv=nP|7tU-N)$Dqe(v*u4_RbDhrTdEp94XfS#=NNuvLXMP4}k)1#S>ApQPmb{C)=uZ z;{1UF(lhm!s-`^+zQ-!BrT}guvWMG6G)*w*r0eS9VmHO&7=)}?e^5Htj78*AN0>?o zO*MscT1Yl2O9Sm^mTYdIGkE4`+tzxanW+Tshys9?4C`H8z_F8e_E!HpPLaf(P6y(2MuxL>l%DXkd z-t`Y;nCR}^HO;?APV`b~!f4sGM_os_q}}CEhrC%cXMT6u?!5=qf^UNN(a5SN|GIRF z;78q1@(7qW$s_pIH9m7=w;F(+KWGP6e7ShT*%qKg1II8}VnesivwuA%$LvnC<>5t#Qfy4O8a}6L7&>Jqf@8W(Ho$bK ztC%5Z5oT0cwC+{+g~n6q`y*`1UHit=K;^~qo_FALTepIi94oC$@so#`JGuIaXc0BC z%sB;DX+9cYX6E?j=SVB7df5KfMATq=(&-O07#^OUtC`Lygi&5z@}3-8K2#rRJuO*y ziAW(iq`;@_XUpj&l}Gtjz9sr%%wPO-5$uVBB_jsR-{?NX&}GCGOP!tE2@%*&u-pBF zmc9B78y?DlP~Oa`@(gBFQK(4C{9jlesiawnjHjTdT|ct%#n{e*<5sfxgGqm-BWFfS zejKXK;rM6RP=j+aYpr}z?2gQ7fFUltRn;{EQ0!knIAHSecMa-q6P{;zO};;gg+xHz_k;(#d)6*Y+9 z{o$ff0Wn|w_uu-_@7}y=ZDcemq%P6Qzobk1_SLz`oUV#5R5`eFILhC89d0HC++(~r zVc^5i3*LHqUP7uGP&d9zj@||0lG;FZQjAyC4a;#!XoCp}i@UIM^k>uc_#7(@=~a>s zhNS~XHigtyCMH{V$2}g4{0X9>2{nr~#F{o-0aSNGHVB`OQ$3{=Zkr}H+g@9ek_M1T 
zZ{*G&g2A4Fsahu^qiaP6tY%I;KVP5KdZfEcLRap9{fxh{b|r5>{mi4CL+f>Q&YP6O z+63p;pMBq(xv_6GKUfPD&syTj{mtjv+S>Xpw~nEVCPf2l`$0=!Ktr|f&`svQ)k%z3 zu1KuAPGy9NYv#!DNK2w0{Za9WHI3l65zA`Fpc)e&QG~~J&7Jm-ujXgbg!)a0`)AlL zX(cBGdnTtGAq!N!z1l*wgL5%$Xh_B<6IcL&WO7y5QIx_kFPrhSg|#zfTuL|r$P#Y5 zJtmrnQE2FL{RZpPFFipB3CCO8Cc@S^Bn@(`^Dnseaf@39YWTQ(80?=o+zTNIJ*(Dn zoxfK}`R;=xC_Ez#9*#b-n2ev>_U-F*_*}J5ZhNj(_r+Jf+}OJQt1_G|n$q)R`y+AY zh6+$u+DyEJiEGib_BFK5mtL8Op-dej)p$@?8Fu{JX@$4d*O%0rb%yOp`w8b}w0^K< z;1nBwFfVyz9tJ>E4Ldv{S*^~;KPgj0pJ62?6TVd$nv^qO?N$e$Vytw;*Mpz1)9Y|} zcAc6v2SV1+bf#0{gR8(GWnBM)SO7QXWvhb{$PkB8YO@9iQOwo8e>7!flJAcX*3{L* zv8QCfX+-+Ts&I*dRytVy`cXXV@ZrPSi#MJLx-6)ZGtA?|W3#X{yglUfWQNzcQ3-$j z8#uqAcFBLr9lIaRwwBGSLO;swrRxNGSb^SX(KuJm%u0J&d&;=lRuQC;e$zZS0SS9m zk`Kpl&nW>78PUwq%JPw{1E_4-@+c6+5-4ci7{i{R9)udqi{0>mN~Ud{fOMoO!94QN z7|#AwI-{h6mQP8LP&Z3_=C2l@S(?862WJJw;8ougWoIiTMBO7^mXmJ z^Z`Kr5fSiWQaq5F34=u}yQxV$vKCJl)2&~Z+Bdoy6lT?V9SKl)3!Icy{`}d6<67I+J`>gST>H@UEpnQ&OvVe~ z7Z-f0A_yE(5WM9+s>rz_Ps#bfC|L1-B@Bf<7}%7waK*MUO>sdl!W|`VRo(t#jisT8 z(3mW#LsfraIe5$+t6qhgT!nsN8gWih=vr>==*g{kd@a*imLLL%p_d-xG1{tc--&NZ z&rFE^BpuG-;T;gTJ>`YXDT!L4Qwy;d&u<<3Iy;Y~cefr|^6PqM#tfji(mCFCM|OMT z*Gn@(?He2o3wufn1@n8GD8a-GO|?8mjWT=IVW!bk?x38)V0Fz`#^oZ;e96Q+_F7!j zN?LoJPz9HX3U@(cCi2QQ&hDVBYS*qB zQZAWvU$E%=Czsm`*|zXD{yN8z0fjbAAsKNCyIlPF^Vr^bHC|{WTK3KBFtRQj&PqZc zEeJNAKiM4O2o-co{GL%f`~OIPvf`%^A<2sJy4aW&oJphg;8@kKp^FVZ57D7n2Rii< zTGg=M=+LA^UjnA(GZG8+E!HuEXpqJe<`U_YaZwx!*LCWt(DiGMAD`nlP&(QgY!iE` zwx3X_Nz3j%1ZMw`Q^K_zy5dv;KkO!PPNTQHIVg$}0g-7Mf}L``0f@lUmZyratbwLqGMm)eZAqrcZAJ(%XJ!d`zT6qZdqx zQ>Z|6=yL+>;eQ|&1yhQf)A%2*Y-9i5wxg#1`G7Sh+=zdPv^PYrqmC&xSO4D|CJe15 z>=$XD^s=id@CrKae{1+9CM3vU7S2tr*zpr4Or`%2WsTuY$F`xC0<{`I>c|uq5z9Z| zr`GbOGs_E= z?NFf1RK`)LrV^p8p74ED;wQs35%arHH<^{cfXJt^ED!!hE~Rm#7Q?dr8L$?91I-ig zIgR{Q-)+o@5$5ds5Cp3Tb&zt2EeZ+LPzbZ2yT$E7)7Q%<7m=nor^JXvnS^-ef6SPU z1EeRVrPUniHRb>mu26XlKALtiKYS{6BP_G)QfgLS93Pi}x2e@&5y67DLbKO* zt@0x5!i~v;zX5(;vjam0PO)>_Zy9DIu2WGt7Ng4M+wf=U43y?V=fPBl 
z6iAI6ZdLp3iUxMogk``2YxR|?ps{bD3(*YtFaMlm%xGn?R%XCVEpw;dU8&b&24PTJ z{aSS&x)W;=`>SebbUZu<@A3T}fAARm6_c&8-VWoHdn8ZJ`ZB`CnywxHgYpnM|YA{v3b=B1_Wx%>nw)wIX4ejay zx$D!HW+JMt7cUf1n+pqw=GyJQ7FL}xG{3eRb6_I`?%)|w7@09EMj^X1D%lwRp5r5g zU1K?S!;$rm!W$MS4a2bBG4BO^z|QCJGkN{vgCAwTIhOvX8pq=^J!AV^NQGpcKu;d1X$`$VNrq4!tbAHo-oR>f|?$4egu7 zjHdLDq4I)3fUbbqxKVoB@~Ajlk0@x-teF-u?nvME2E!$HmHyB#X)CaW`-Z{S|6~^d z7WkP$h1^T>Ny?zdZMB#v^B`|ey|pfV3)U+2v7$_)C9Zu|{fqOS5A!QtM0dlGh{F(4 zCdi3ZBRAePY1&k*jEE<8HI!TXdY}EaXm$8TogJn9aI}q^!%otSxE*E_k@B6-fwAQ% zTK;s#t?D%(!d8=$#lHj#U^No*HPO-0rzZ3&OLyk5lkhs@kd4B8+TSyaSWE@9pbZk%WxeUjSg(1BCFe|eMs_6Ksb`s8S6 zgL`nu7x4(3vripcQv_l`9$c$c=b`b4>{FI-fMJuB=YyVxXc2a5OGrp-mZBOT3<2m= z+UWLti>_y_#>4Txh0R_g+RXkTa7;l|Pm;Hyl`zG8864qUy@aJO6abBJsWj;dW}UNW zL-@mPUAhdUY9}%cdW{>>pS+&x2;~wxXCKID@jcxhb6@C(ts78nn81+?wX2F+Kr?o_i%STs-Jq?#d}>dfHS3+; ztuaHUvV#&|(eEiEuP%Y*xH`+Riz~`b+uX!=h9>K&q_T|M=*w|0#T~m5)H=ky5)q3w z=S?(T$hl`e=x%Go_Sb2m(rs3kKPko=hv@my6u?vtt%uX8VG#1qTg@F8v+^6u-*Rkx z&av@hLulQs#g4G`Uct7M$j$y{7VM6Cy6*JqP>Bs#{v_ZUv$Y}g4a%G_2%ySz`TAbl zBeEqJYt^49DPQ3)#Ew-GD`>n2g__p>VY9EaQKB7s5Z2BlheQq>KE7Vuv6Y)8YakZs zJizHvC0yU0Nf)B^J%=5`ChX>vmGKi8oHUM}jPEpwNyBIyc$)zSB&UFo?p_*ukL#f! 
zr0DBchc{*&%9b`!s`*7+mo&NJ%l1!s)SUD)rWfrD|E4)|(ZqdUVul3yUGN(Wcs!oe zLx_YIFRHS0mi>5h)W2{d8gLnIRign6MN^xIa~adNhaICEAWWJL==9bs)zcbCT6|hzR<~~N>4k}(_fKjt|I_XT5Sp745(0+JaQWTB_U`h%YKvMc z)!cSZKd@vNlN4@K503y}yj#=E{nUlFihM=x2O-TO%kJa-C7kwv&?enn!-3BBJ2h-) zxzzr%hM(V_TMYN1@L>%$Z)-vGb>RHL6L($<6Ghk3$GqOPU(Na3D>s#!#2oYz*_?|D z?s{BG;@}(kyTy=iT4T(WT3LFcv!Y1d$--vp)KHn@1#9NCF#gfAbk(JVi}(9s8od56 z^R~4q?3#h`bt|RXO=}j`?kazG8=Qx?r+TB{9j9^&pz@tgZ&{9&vZbYE*PVgJ-xkrg z$Iyk&JFmI4XxXI6_!muS@Uo`Gfk%CaylMtbm_s*5OisE;13)lqS-)-D*Z(w|n*|G| zzrLt(3qM6STpwCx(u{-aNb7Mn+)wR?FW>oSwN|@d{O(%tC_d*s^AEN%3~n^65#fY*;|^LTC#P|`|GGaxm9AqkGD~L+KcyTSlcGz8Q}T2aXN}} zDs%VKxe%4-+b`kY2-1UCThyN494z~F6eYt688rY6oD-en6eTjVocENcw_OCE4CFy4 ziK{dNHEgep99;WSgADW9#mE6>K+w9z({s|ao9o8PYxpf=U=gV>;J^wiD48Z?I`{Fz zw`bTdLoGFZKIgAZ{+@=x;bhKVH+)dw(W9`Fm<$j0gS`-q(oAeaF#?HWy3#_@4XIrJ6AKY8q8qGA)GFOhrtl+P)7=gjT|&o>!Eo5 z6&;7yRJ-g+>xvmYrTz2Tctx@5yU*y?4F`{Q9xH0V@-6&4kGc!z+$d;~a30%X6a(q* zotxh0YC^4+au!3t?kK$Y(>)qve&%=HX004&Qc%a4?o)w4Q0 zz$eIvPq@_VS5a!Xs=5P>MK=DhXDWAH@l+RP`;VJDMUBz`qw&YpkBQ1OxT`)_3KYx=){`R zLjx5_am^8`F1}W32KuLBs^T5Gc0F_T^!J}N+Ux7jq`ouZ%wH@X7%4PlJ3*FV9Kf;x z?7lb1bA7K?E2|z%SO&o~BKEQONaLN?-p*Hj*zg|IzK?SD=(lTU8Hfa+n6)}_5?cze zJi7;;oWqEqMhl01pu`zq_JIuX2O|QbKnAGyMqj|f7bQ)&%~ykA#g}SL39dHo{QR)T z-tUZ>tOBPg_MmnynNQ?Tg0&Bc#2dr2_+!4Yem zYSPvMTJplX6Y-Iu{Zwxl8O@1T8w^E#r>poX9wqLtGW$pz-IPc4B>>LAYDI1zO-09L z;E2Rl;|s4t^#**_j@?oOnGi+@jD*}UMlafZ4{znFRB7SHLJQ2< zjp8j5oOQilY#XkbII=f6IUO2z{2M!s3(8mR5Ub-Tkl)4Rx*ML$S^#tQ+XX_MIhTXaCLp!3O+tv+swilrwiu zn!jG2H~G^$?38!VMr4Zl=!Ar^ysEhK5{Z1@D-y zxT6|&O4fQ6jztul41f{1o2QF{dt*UWcb7;eOQ`hUe2EC#- za`N*16xAh|&*#K{9kTWd@p13+#*;k!q)AmnLu0(^&P;!cipD+R<1i91ou7*>Edvmy ze7WB3mKcAaNKDQXq zTqNF83703&;HhLSL}8H`|KVa3-A)oIB&vB9MCa`I_VPTqO#97(^OJs{EcT?JvuLLl zySK&3xO$6a(NHp8g1+P+5!{l_T)vXWPg-!p^krwm)YQ~ydUF5+_nyh>?sxE!y8~9t~_cKGOL75LrbnmcH-8;+}<0@-5Z8ZPT<0+a%yriCg1X zv=CcwxQ=CxKo+khvsQLGdC%M(lP`_az?7ql9xpli=fr84Y(Ba^uI(Gt8OE=ojyvsA 
zt7Y2|){5$n^|n%4Fe0Ue)2XiNy0Y!y-zAu7$MAXjHC-Ir(tsokugeLh2P0f+ntU-u_L9*r74EI$2yQ>Iu%^|K=?b}RCi;7`J{YBuBFa1^=D$jI9=X8I8w001McHvpR(onEZah)U1_oN@4pX0w2bGp zy5g$$J;Y;P?({~D)ekl)-Qg1BF$y$q+wl97!#6!+7R9Ju)oS8zMhp|M|1SNSRHi!h z%#Sywr-A{p<4Kn8a+Z7{e9x3f0f6QotJn9QmDInPo^AsyGM#%>CJNkJh*L zm{Z$FzeK4P>HqAag#{rC2p4s6o(ceY^ngpbdp!1Wqy)8QJ^q}VSiHkMvDiN98Y|iS z00ifo&3y|qh^wtrO}^0FDX~Mg>WK?)BlXhef;=f)W+34`hYN>=4b_8eT9QG_kycK6 z5qH-lHBd9j`)IjzttF$mTbrmp;`DooOJf0Ae;Jp(_w1w7vgO=4)_kMGHWmbicI@!( z^Iu$@x@G+7sYAD2P?GgXrw+N{x<&J&hD;hq3ZL5 z8)3E`UY_`5y^m*k7Be_+f&e<@X0}(3<551sB=WSKJ00%7jJx|!k;Bw#CV8hd*CYkL zAb6vFarC&P;oqbGt-5}@YXUz3!3A=Pk{`{2w{v}haj@yUXO{;}o5}ZV2f*;ya@X}O zAmV522YNTe?CV72H5Cv^ zwyDTNPr?b^K)|2lAfzS|rvznZ>kb^p>`b70vg z3$hoB^FPZ*?=rchifW2d)K5|YMq+85eZ(BI<;@dAIkNeW~y~g{ciutYzQ6`vtsC96UvRN^&Y2_ch_W zTP{7ZnQgZ8o9aAN^~;M-jO?6f3qyBTqw%C*V)`Uv*I8A9ABP_1e?d>XaS^5o$Bdcx zb~3SK4`KbUn8}Nua>8)L)K|UMf#YOSur3GxitU!eeG2Ze7&)fC1rIlDP11J)EST=| zL&;EzS_yc6y!@-<+2!|Lc6Myn?w^zZ)r%B$qK8^jOUp2GjGPBn@!S8yj17za?vmFU z05^7w8AI)7Tb!?wL%sK3*R$g%as2ajfS|d8vflo4*hWWj6TMtCxl9R~N75fSH8^!gAE8pzZMi_N<{RcJh`7*7E^PNN}R3Rw;*w=`z^h!uOYZ zy`EjJd3G+gY#aCUlfDg3Rqd{NF{NK7zW3Cg4*?Z-FK0>KAtjvUY{z$fr?*k6b&8pJ zr(M)Y)ek2>M;9=YK@zB*cHUofXH?HO7r5QOrCnqN*T`u%3Jjki5^d(W;?s3)(@ii`_m8;|K z(r4@2Yd3B@J*IP9p89fH;Wafi&zzDM4*mgG(+OQ$Fy>Bs3bc&l%;lV|$QScu4n`y; zNDCWIX4wXN=g8GeAFk9Imq){wEqDI@f$lQ{5zPL#DKZGk%Zsxec#74udP)_$wn&mXClsx`?@q%}3-8yhE4Y&6K9De(j*A|J!972s+OUV|@!9&a z&P>&pDzE+pumSVWL5v({Xr&NW-rg6W*t0_hSAXRF?)U|svSmLXI>IG6l{p_ElqhC{ zUMP7m%aopbjcV0)EPf1&)Ex&H!k=aN1{@3ZQk-0+u1U9ZYj$&U(Vtx}!7No3=E{S| zxaTDPIjp9hOqJfXd|}_u+FX`zdh(dNOBiPqMd^7KojrX}Xvtccb>jRm1XVv!26;gX zyjR_oFlE5yxGBtx}J><%Chc{mVLefjPF#^8@=C2;MuIFDgKfXn&wP8 z1Ufasf~e;s&%dS0-ZRO~HjGvB{l^c>z!M3dEQtYW|x?K zfARMP7MRg7x&7h2cd6}5b3Oa!!K{!%P5d+h0M4aZhlA6bd{{pMZ|{Y*usET7&sb*KdXvNKl#%Sds94VZ8LpLMTUN zERoTD5^pe*;XjSNXH#N z+VGgkdk;$#vmByK8U~pW)RvR*>U-2DndG7R7Nm*Owk^QEB$SMMWFP~p_YT^UA3Hbh z=^DMI5APK$T(Be$a>OpvP@WN7$FTYH&j^o9Suyo}eBp(6U+zwvd-|(DaI1g6)r{Xr 
zAx7VVl7asZie>s1TubMYgkPmV)JAV_&E5I6pKWIfS@t_sFXtO3XFuQ&w(!#XclXa8 zC2MM3dG;%A?bP$S((al;Yq#o*a-~iGr@n=lziMh4lsu$jqSMDdP)-IE3Klj{LnF;P z<3otP@l}cXl=WaIMgt6FI#MLUvSY^8Rc|}8U&*~`FE!l)v*@c@js@YTuU})!!`#bH zXD%pX1R(pAem-b9dbB->l;<_0*j-!lZxwc$liZ+Qy;dE2elh!;ka_dQfX-zPR~BSS zIoQxJ>d&;$`T4i1-TDf=dkgQm$HaN1G}A06p44)G^#1x%-^etxQ;iP#yqj0|7=LQ< zHQD>&s`PF9_s{XF*~0(y5ihrtoG(JUq6UG$4ri_GFrG>q5GMNw(60(Pkg}VJQJY1x0Ru!uVet{jnBVm0E8NY!XQ?KD1wJgtLD*B0_Sg#z8!?8Q2Nn~6wi20 zA9kRnPtM@dPd7AnuDdzl_Ez2P*Os>EmrF1$hTGUvzz3T3 zt?21p9f2u|xlRD3WpZIt#^aPj9ycTPylyIUS(^IGPR7*YbAL9Q>DwYPQthV&~w2pW*RRxdCj}sG;o6SuKks1J7YI<#-&zqX} zw&}5Vlms6dsIHe19wm2&ix+YpO}yRf1TvYSLxyy~;E~MF_{%+3I_c6$aF-jcUqPQ9 z_t?kR1ppQOIi;5c=k!Fn=s)7W8Jtk*k`g6}d#Sa(U+Fg*GoILeIU)2pWy}^3w zj{jlUkt2JS3xiS^_qWMcHI!uh2 z-6dBJHaNH06{|Du3Cx$o_Wk{qWWH}^=6z1^$GZU4QPZ2N;v|7gjVSsq92KzO+HxmK zb;S&7arn06X%mEg(^iPg_BS){Vox|1{Z#;YO5G{J z4H)}i$4DnzUda^C4vy^!La6ZNzTjDip=9{Y{7c#JP#l871QdL&!9%wfKIz$ZNKRYYa-qhU{(mdw4_*Z@Jty>DQ+X z_X{}^U*6nr*s^8I@+Yx%i=r#P1MSm;Zav7^5rLjh{Tsi&b1XSlNbQ!r^#C~#ehFhPng zmDQSsLJ~9jCM(IO?ChFk%1qfTs#cIANoBvs!2Ctx@XZI@CpEW?9E9CLF=riVvitpR za@m=RP?6rA0qV%(AqF~_R&`>GvIw^*xRw;t=^B|XvCN}R4PU>m!UG6H5_53L%O=?o zW796IRNdDOM^SIW5btQJJ{QA9=F(X+h2s8Kw)4A?wJjpw&X=pORUoda31v?3ahA!t z*Gq@X@B6w%oU8YZEtiEuQP7SZt}T2>=-x&gJ#Z#+lKepa5N`LSZnyd!A}*aZ!vW@g z!Lsl29VPPUvq7RQilXS&he?=A9x@+?8R%`_X=?CoEBChJlE=%;NTC_;Uo42p1w=Yj zJgU5h` zm!nk}`UB>CzSMWKPs*>jtvmS4@2se=1rw5E&4fzDPOP#pZ*T)^qzzvZNi#l`l*9z9 zcI2x7MYoqUNl0=h3%~IEs}@n$-ni<@`Mz-=u@+Of>x@6Im z^7wZ-H`GN+fdHj! 
znO2z?^rIW(A`q{wh48oJgHF#*Pgj-N?Cjd{j=l?$TO^ySar^=2(#jK{SFfg5p6i=Q z*WYk@cFmQ20cr;au~W>KI6^$Y9?E!qoLZvk$k^2sqE(fN1xxt zjqK$nE9UuE!OW85ksPSbg=G;>=UxmC51+boeKmd{Z#VScRv084M%x^{)6s&*dY6)o z%h~&#PN<%X?}D!o>~4@ZuF@lYfp&oMl+!n!-5n^!O6s?_XUkYp2d<+cAN5U_X^!s_ zzdr7k%i(gHhq~lWH{Q`|aDC;_-I3In*%8xXNM*wbTMsKpn(*c{kZ039bbTStECkOe zY*fmFXWrGJ1mwg`fN591xv*F`5`>DcN@Pm<-mYMLHviV99i|vy4c(-1Q);PaL5S|67oLORq!td>qq@?oS4Af2bri0S6Ba$3x7IlReb*~ju8eoAqk557vpHEM^(-de zRz#L*aYW#a^Aen%qj{R5;ul6n}b zxKG~&;_NvyWpz#0co+?DoDuW$^IcT5nbJ>J??x)?`jeN=p)NS5a_iza`qPsm zAYJ$z#+x>~QvXY|`&h;JVlot>iUZ ztmW5)Kjbb?kGa+NLqBfagEQW$_v=Mp_eX#B*sK`*El!fkJfYIj$X*se@|}9hp?|!g zxxAQ-efBriOM_Oek}G#<9HR0^a3g_-QR&R!E^J&NPeS`C;n_EyoPV&qbV=c(C@Fm& z+3S9$LfK_-9od#tMu1mGz}}JMCFLPDxyq8ilmrx|72x>O$B)#pH&Kf0411pbHq&a; zB=)h8PLjN%nOf0ly9b6V2RF3d;I$ECM@|3-gjIS zq}nyUxn118zs^N3@OM2+UD1B zO`?EM@FeMpW7wfR4`SyYG*0|tBLyyc*rP8rm%GzJ=eg_lwseq6rIF4EeG8J=tugM8 z+~x_-tVC+XY9x~&Y-=mJM^tiQE#fJ_TAW{%1=q&WGLnqFYfHRDGibu|Os(B>*K)GN zh+m=B$IZAJ%Q328n~nV=GQ6zv$QL|h<6i6Lphe-iAVOWANbwCt{&-c5%J=PM9$BeP z0_$l9j?C-RFjJZdDbE_`e?rYOHG1;<(~`4%cYn3=!}e;lOPl}@WzJ5!e7OT5wtTq< zd-6uusJ)y~`s=%EJO{Hkn7uqOnR`9wmSQ{3OTu$?DvpES9Yx}~#(9Ln^7~OxJslbJ zvBnv%Hpigx#*wEfFUZz+?MoYlDsj<~os)c@<*{4zIrQD_D&Ud1k@0`}{P}G8{?Fc& ztDYw-!G`*4MA~m6@5?l5He)H*j5ObZqxOX~rb{QDi2BlXy+LqTJ zHjq}1962Y0g7hu8U?F_TUft%YR(xbL9#WmPg9ESojkLhIzf;FWQ|?d+KkL-fX9|wh z3peY8wv(P%IS0=!=N7mD=qrGROKMz`@B2m{73c@KnEbvO5$VLmA8zGmx0U00BZcO` zZEaadNmUNma7_7Nm@Up3h)Z1%6cYU#!@q|1e z_ZYf;itkO^QvN;Px7T^h7?OZtxaeaDkf(0>&eBI%F(rod+xzaj7EE~!v}IwYcht_l z3t8PAkJqziQ<4?O72oqKKS5et6Pv<|N%%TkxQ`Jv(Dcq{mnyKs0{7fb(GaDAdQc2X4MzAw@BAQ#Eab90tH~^xozA+d%x@`Y+OeEYy>yMQ z#UHRwt4NRcWb*GSn*@#(4j+jSGfxi z6?{Y&)vm)27oE4vq-PG*^MoMp zC4JMp1qClW`y8)$iZ-55BlH2AJbEsd&GML#64;gWS!?PA3P^ch?s{b z(0b_b;qNx}e4IhPcg$GiDdx4Gmngj^qc_Q-cTe-64-dh9s%iLUoDW~*+qsneC3iE} zvWxqP`}~#E6h)||suL{uN3}?8+r=8I)wbP%#Kv*qeHU*C+#rE@z2yp&?bjHI6t2SR zPdrj<(;l$BJT`K-I)W0ktBS1o~q; zYt6_I5b!(ndtGvCH|=f(u~v~Y7&LzVa?dOXFHsoZSZHqUON~dWF%?1N@AJ#66$$kR zH~Nm;DD&g7 
z2t)sS@07Epg+-U2l2E`a-&N_{lq^s8+P*Tc3hv;V$wr$k-D(XMm@Z*;(T-Ja9T-Q2 z3c>Z~iTFYrlG(~DugjHRGpPkix5UoO%|-B6aTGmBo-@+CB~eI{%id$}xW~Q= zFg%e#h4C2p=`{`T4^BC+BDld@=yRfhZzx<3Bfc+i7#}{2YC|!wb$0>&8-wP^vkbqf z+cRf2hu$3(U#Y)yQfOnR#h1#@tE@)uTD~+zEueYvn(NsJ?mQ}M2Dyg@G>=7WDr8y? zq_1pusmnu5ddk~K(Y)YsV(#n#9J!GbV^5SX5A%FyZW#rb5kf28uT!^f4sFb+qo8C~hH!1qGHG6CtuW(Y)rpRc_>{gAQs$@?!BY`gWiy473Wv4Gg#qCW{xzxfn+&ti zpl0?}a*U(2WU06nRn1v;vTyp8xbv<*B_m!Ae3x5b4B1hExi-WZDgC|PZf_g(tF9TVNH(Rfup-9<55 zTH6BPQXTa}_Iv|g-q^Bg<;pMcwByIg!c*d^^Ai3)<;s%rJW;9{xM6oEbz=TG(f+{V zL13zL6)@RS>fBTZ&u28-qk;zpRg;T)aRf-#BSb7wmRqwV_cGYVNu*?`}R>~Oke*(LUVIy`GKxRn*e);&i+=INDHGG|GrkY40 z0mqmT>5n?kWa-pWtqi3_M*tLu!M^+WpJt*B)X~=dtNeLdL27l9kxr;~AI{!Fkpu#{ zk{SA!B~i>Vw`W=om1GC0dzI7A=GN}HRpL4U<~Fw+Z~{qSM}7UFIQpOt+>C&^JiZ0ATs_WQeNP594{|m2H54R6;Tx~20G0I z^#5RxKoIs3FLc2|l##4C7IG-PXk#Oq?UJ%dk|e5Iy9-3COr@vd$`7oXcXql~ty)62 z9yeBrPb_U+)QV7ed`UMvIfGUC^8X|3tplpc+VBwf-@OhvGtc`z@9*=EnQ=Jmv-jF--FaQtU7F_l?7W=; zbjzWI5Ctjo3qWh2TR|8$I#$-~#9Ei&+3u`|5H_>-A|XjAn;#;VhDa}R*oDD!CqO16 zAApV@W=xvK4wD8b2oPZI{CwZTH#C$nVf3y+=}r}}jiJwY6%}@%oMCB~Vv+{!X<>lj zdQ-!}fqub#D<0)1=sx57(Usv6Nl@fMK^6c%4x78R^>ZxF2#NqZLjgi6mn|qTCnI|a z>5DwH3_+a5FDM9M3lZcKCjPZSpsGjd1lc(%v7l1}NQfW)N)SAyz;J94NVmQNwGjl6 znE=#}9Mk?rm4G|n1Y}7q2(vzI8Ni^37^KA8UH>OWBOYwt#7+=z1GrdXkc9xCyK-Ki z86s?I9)x=u3Dc}bsf?&2Zogpy+3;hZ>*6g???7!sgwj#P7Kuarl#dPpJ8_z7f5Qzv z3?^MOuNRA`hnAyWOLUE56HM$XL&Xx{!qttL-DF{rLAQ($SBSUMG7}2y5-3+FN18`{ z{*3x27q*{5I+U>s3Nf9a-{Tt=mI?S3u4$2_C)V zuO<|6VcIUV8}_Lv??9&10=0;GEAT-eM|WuRYa3IJEb6C~#KKkpB-l-WT7$zyFxy)D zJ)4QpIH<-bA&kg;*}fwn^yvU+g8*!5fNT;?%Y!~1iuKxoCV|@4P%nf4+#s;A1R|By zW?>?~%>t-_u|XRMP%|?iKiUesZaw}r$n5o@|8xnOWdL1Px;FTX9EP?W6!$a=rrC{B z2_V9X1Il{v9JFzN0kl$S(G$@g&oeA7gjbGGCg3h$m?r{^*Rj20MC7j^9_v&%_7F26 zK&&oGNmVjfUoL8(j1;oraENKgI!bg&fV~58I^CT+((o^6mO|i)ARXYlbzw@0Q4;o0 zXoOC48l_Gl;Lq-I_&#edoY5c_oe#j&V!#4mFtt^NCnsRb&?E3a!Ye^I-85wj9oZHr z@luOG^%Y(R-^8Vq6`=Gy@-{*Mj=Nca#)gsxjR<@?X{m><7bslWpzN1ahaM5*^)Wj2>CjQSL;h*kA^NgpnGyo9EWrq>dC&;sCfbC7eBKG=sSF>yE36ZC}Yr_c^`o) 
z!k__E0;v5h@Y8fVU`IX%Sn6-Wu_xN9YI( z8p`*6YY3|!Gn+!M49-n`X%(g2V{jUw2X$k?%6vm2w9}yxGlf9^AQ%^;(h9{Tl>C7> z17Ov^h!$vTp`2_LOd!ehY3R-%H0zP0c;}}jK^+z7i!>tOo|x8b;oy2xqus^Gn4-kQ zF=oLH5dDiH(jbyJ@N$Fz;jrNF{88zLef(9u-u!N#-R;4Bc7^!mjjl}eKpY-DAtCdbUqitB*5{@EyfGho-eb`s{{=U4QD4*u z-&x;-MQ2@9I)F*FEk=x_1YQ){>wqY}htfHyA0}+!Fq#2nUFhf}DC_itY5nE=p25z8 z?CaM0?`uDQeuqr|JU}WA${r;sL{HbV$=i6T(G2rpzB z@COQTf@nklU^0H2rsVlmM#hF67!VU*-$Z)x;^djT%aHy@K*;cE4w|NN9(MAiwdQ3* zl>c>6hWcbH4LZS;uK8xf@}LQLM7)d6picf7HKGDktpeL=h&|GD4>36Z6OCgoBS%l) z4P`_K{vZ!Q;0u?p_2x(#9>U6M?JPMD2Td5=(O^wLLf#B~@0-AN_7?5)nZo9fP;Zhi z+I0ejt7PB-1kFKt7ZOzE1NiMrO4#v{;0%%ZKx8~rV>{>tgs?zRgjj&o$XbKj(T!1A zhOI28tsP5kGs|Ok+KfAN+%XSVXIE4jSo1Vvt}&H03&x$Pp=9*Q_V5+}tO(p8Z{H&O z4k-N{v_devL?hZw&tUff)dTf25Bl)fFH=yp^y4x?I3b8J9V9v-8wh|v;VPYQ32qqaC=2E>J zIs$!{!*H#f<+f4oH%OBj_^{CIi0fuhp8*&(p#KzHpV5LW8v@gUi>0$j0Wi`GaDHUr zR6?UDwQSE8YSxL~^h{-72rE}H9%>L4Q0K-W1>NvSsZ^ZyM>F-j`SgjDsv z!K;S~^eXbZTVd7<;#HtF6Dpe>WcouF)aHvfo_q1dg3bsX44s6Gx)s3!05G8kX)jR1 zzAH8fK!p}LTbPJYGi4#_4RD@SP^l=48Gy)b5;7}}UMTSTQ6oI<7NDQ&_mJ-gh#a_n zU(_8&H4eb$$w8yu_v6P%2%>HRxW>B;T5Ry6)N1q1?d=&b>uunf5#$n3?bdoQtITS? 
zF9E^?+RK*(A^P$MijJ~c5>GKuO2!8&y-_nYos<(y7RL8;p=WRs3v`nJ2&4xXx*muR z0Ok@4C0c2?5%~aYW6Q5L<%kJlSHfFI70Fs4Zvf*QJsZ?)#)C`tGgw*s`Gw$=`Ck|+RT=En>`5e0sl@n|2mfaS*Gv#d~5HHGzy zMD$yPmx1MJhj0U>(SX=Ov`G|Uw&xkEch6NyX3w|8ctBANds~2!b@%WPI`S=+ z9|aMsKxMVKqCP9s9@X^$RR(2Lw83%JhPHOlev^NWF)M8q3P^OvDT zSf>Z{9f*|V5H}uzl)CA3Az*J&-=66f`mYu$!c2CRv+i6~Ll|oZuL9~y8|L+Eqj~75 z(CSu$LS_fC+yi^@Ly+HpG254P3cxkKM*RbTTTDY+$h%_rPio@&(B;7td+l&)$lU?b zCkf=-w`J{udulQ{j97N5v-y|&ZU@8x1Z2p9b}#ghKsg}c!U2a$_N{V&GZQVuXKBSt z?uaRcm=0Tz+kj!e4gliw+>N*J0S1(a`0s#F0S_1r@t(XKwu6p&`r6cdObpQLP?Ac3 z06S{|Y8P^FXbxb!n7*W1oFRKiCYw>I5~a>S7v^@{$%iSB2~d;A#lwS?LKdnuJ^>1T z0j{UeI0!^i)GIlUh5Nib`>nyR0$P4slTkgHS{~gT_8W^rVt26tg}CO_i|^1*LZvV` zO11Wl&PR?S?g|)fUFfG9B@^moLG*(-t#GbTa~{%X1uRDlLRi}O!SqH@CfkDKvB3&B zTgd+%v_72^pFq~4kjt{p_nIxtVNDSlAX5mZ0#(o)*!+iR;CH6W&;$BaV7k=+4Ok#E zAG856a#2cn<>>>pe)K5GIcP7!e$6;&FJhKZyB3QK0$Ym!dBA84J4_?nFJm5@`NadZ z=LNYh{^$Icgz9L3VscsBT$7>e1fenMELiDZGw3IFd^l|IdaRL_U5qxDCX%U|7Jju<908cKahFJ0tVM%c~66n*;;i2wv17M5#S${Q$3bo z;6zR#7GDzI0R@jfFwYiX-09&JdqYyk2J`0FbX)#9P|zs1g?MUzVSi5&aCWKz4~e@E zwGW~J_O&;=kIlp;oK4LuAu7}d`^{~?k=~6kCrD1?a2tkI%!vt-xjsp7lCoT?7zCcO zi<_H1SXESnFSua>&H=4ZQXRT0XFaG;ZSwB#ETf-xm!h;E7WS~t^6~<-hvzT8>~Q^( ze1nr^ry+bUxa`KQ13p~{&$ouLEZFHK5JMb52_Qjfc&vwB9xbZkIsu4mzL2#q84;)) zPlPz(*I~~wZCYDB`5dQra@*rL90v-$*2_;{l^T(KwxxzEF2iv?A2dbw+iwxzl4z-^ z|2n8Cl`rQTTJ%SYoU>NFWMq-M$U%j@4QKZ*Q))$Xg)04F*Q25hWl@M%Vb9pOnDl&QP|U2J3ONW~jr+ zJ-rE!Amx}4$hBp;aB??)`IBS6oe=z0K0r3fKHV%Y)xq3_f6K-=a#6usIxA#ra-yVg zt$}z;W6a2d?(GdbU|b=<2R-<0c-i4ddgRc8c*z!ZbbeL*oA7_9VY~s0Q>w?J0wgPv zWo8`U8`QS+dKICf9$3}iniNx1C&XqBpx2qt9!%2Ugi^4`6D+ODo!0nIGGgM&mL|C8 z#ivxseJMZ1a%*n6^ZN_TSPU@RvE?qf9Q$CDIr1p;8#9;PhH~=;O>bL~IOi8$cTRZm zhqX@LAn2Mp0b0;I8ZtGcY%y<(twzwE$&%o!=xhI3iMJSY8-J>z-EQTF^VPw;{3&nPp1W~VB!g&b3-B01Ju+=su|p*Ug&>${S27r zxbrS-pDcswrtbL8S=_9fl9vBKzy5sLSXi;K&nwGZrM7;1+aes4Ul)q&i>oq2B(x?j z;%>Qz1qY6`Q9bmbm)(9%u+28U6@O`a6&h2TP#QwP-U`%#r@$M*n39simRKS|-NWxF z96|FYW^>^xUe^N&r%YDEQ6@hwZqP&hRw4`98cBZ4f7k3mVZ^Xe6&rdYRn~@ 
zCRF*BMMJIWO_r(zavjHX{pu;Yi5p+uL~C>9EWdE)W+TWcJTKP%&2$-1gh^0Od*qEr zNFEQV<0F*$9$NfW4I2z##`Flaxs<$0B>0Z6f8E8ekkbAjM%pc*r8%(|8a5~UAXnx_ zYv$^pY62(LBu{-aDh7G=JaO9<3nn-FF($oIXP7~q%7+eDwZ+8%OSj|VW zFvz5J1CVvu5*lXZjPQBR*{;cIRm)>%TNuil=cfD_Y@hx_N2*yh- z+N1ftAN}?-WTq6Ev_?6_I0s_taVBDQc>DrK=+^NE@08$GG&4KV$nUu%ks7;+^Ai>o zm&`6%b&ZXr*HzVp^{PiV8BqrJ>ac8|ZOzQqsCad%^9$J@dagjqc>$vt5s@F5^vb`V z0Et_9zNPAGE?a;Vgi5R|G1fl`dX_(ejTdrGo68n&jeC_m%!(OSCbzYk;PLWnSr%u; zi>c$Beqol}pTwJL3A9+UB%MO+?2TxH6^B$<{%mMeb-$=3b4z2@bkrq&lgh5!eT^7j zx#?x`kSUW+Zt)yD?(r=W?mP|7nGuufBEd&?TQ+mrs_elj(rP-en56^u`ORc+Yw4P+ zB0J6;b7H#-S`Daihu9img)3dlLERk-%W7$H}9fq8q{q^c`ne1GxDqz7>k%Tb^z#6mPQ1yl0AgFV8UI zf#r@n!Bcaca_;)TbQ;`@O4oec)F+BZW&#m5vgWz=W4tX}U5$zB$li?cbHpy&)MHYw z97+>_-v?!cA{D0H5vu)W&?jgxhx%2H#Z*fc6DhW>Li_$BbSC3}lMEi4BxDMRGzbur zP?*OTjjNOT&>BM?5J?^vNv<&LDYR+-OZ|R;WUw5+_(x&UQL< zp`K?@tJ>z9xz0Fej$_jwr%JwgISxb_=Tdy!C4WsD6{4$iKeIZM-{sk81Aj{S%uVYT z%Cl_v6p}S`>*iuD3c&!@6nCSuFtQw-(F!Q@5_VK0+}!3we+kbM-Wwp?vlVaaB!hL3xU(S~?Jy|otOZX@c z2YN5{@(EY_JU)jo34;l1yfozB7VaJRd{rzy!+^b}*C?RE$S0uUI&+b@8lm05AiGVs zxqwxdA=r`SJ*K`!f${of$yhDjDgSH4sdJ5#4egw5Wsge6Y#rzZPa5@X$to%q)G@;h zW!V=_?Mq>%<1(`XA7e=cz6EUCESCr*Cl#`$=`%f_jeo6}$wmqetckBgDbYIE#3A3- z>w-a-`sn_`r^X507hdCiZUok7~>-uM%BwMy| zwU?*h^@Xt$yj}(We97Fw*NRg7379k-hmtn#tMMIs+d6&rN-*f8ody{z!JLV;@0Ji0 zL2xg|ciqYA8?+1SEW|5r(XmDy#WYqPr?(E!#rycIn)ieH)greWgC6E5P55l^Qp@s$ zuk5Lv`nfFst}G9NZr73<^!6oQ4=g$>8-yA#?M5=58)bYnl{6x!POT)gOc1QBwg`eL zKhbx(>91s&rt_0CF%sg!DNGgD+Q3gj;i(-0+^f65m5~DE4Hz_rcx+1(4&t6`4Q-RQ zGG^$D6}d>q291_FzjSS=E(&1E9;a%~my+brloR}1uHEk4e!s`kt9EK_+22cHxa6)3 zi@8M&S3PG*z9Q{i1)Aq+GEHAf79ZDhyTm1Wj*RatazG&f*bqS7@}tfIOy>b`9vBdP zYr!9&MFuTUjMUh8>Iddd=B1dv?l`K{fhdyW+18Rza%Ar>RGMcfTeQ2JX=g8AUCs?> zD_+gvBA%UydDlE7RE4{3Q`1@ZBw_Vd-S_46&V-2}wU_MLpwk+Rdon zkIYT?SLN8Q|1`JVGk@Zb$;VaLQ;g=p%wyz@E)((Yn7qPg{!u!=W7iF10vbwv_J7CS zm=GSw4IsIMN|&7{RW(jL*PYBz2$w}tJSE$DHty{fu18lfoqWG^dUaiXS3@1APxLQ_ zhfN1z){zKSxEeVLiFL;!$6z3m-h;FbZA#yxWtKQnNuiG#S_=7YWHa$U(f@HSXS4jK 
z7EfKlms!P~5qEjLOBYNeI%yZe^2SO_=_XFzVYx#iQtDWGKRoJe*zfAU2gRm!sSYpe z3_6_L<}W;wMb1ZJ7ivp5kL%ns57jw;YN*!++^qk1+>Ox^#^6WDcRO7bpZOvALfrmy z$DUt&=tG&lEqh2P_)Nw&bF}>fiq<2<8|uG56j(@T_U?lpc|=Qx9sbAH~)UQ4@|;vw1gMTS(1o1?2qOf|SuF7Lv#Lh3t@ z!;QPrHucy1id?}eopF@=EU3!`e;*B^F{+b>Kt)|||BW6Y(*vYqYDV-nUA{{2QQR75}5JDfV^={xUyV-IG{ zez?3;iXT#W;5*gk5ugtHrxN)8Qie2In)(%C=ak68o&Ahv8>51P; z!#JI&zkDlHp;H|~roTpdSx)MM(gI2ool3y2&pan1obkJHsCZBG%5Oy^xnXSy=q(6Ybi1E13 zlE1k`Tj9nz?yw-aSo4|VoAJt;pSami$xdinw#8^2wc(mIO3HzBvxGCE*BZaZe_>4CNyzxQdwRZzxMeqy<32k>WieG?fo^ znF#LZeOQwc|JaDg*|UXR=a`f}xnxZnoF%gWive>XWx^0?2~zgOoZjeMPvN#4n@;uRwrsmC zO|zsdJ>bTwZ>jb%$bNAgFiJ&LymZ15ZFCtytwe9k2Ea5d)>|>@f3|8(dR5e*oyGR^ z4<(ZD_-i78THJXb%JK^DcgVp-V1voHUAG(^kz*6%L>0dYgnwmVT z+j}zOvjeMk60}^`;vP@1pImJsuvZ>!T;vnrePqRB=W}>Y;A!w+^XXX9@hQ)&Q|lSp z&r_C`EeOOg#3enRP;0*Yv|lMCY`B-YZLpyoTiQY2)TRbJJ~khCcaUrO1x?9DqzOad zM&F!gR_03-D%u%Tb~cA;?`_Ao8`ECbYVN$x3ZIxuPZ+V;++OdH3mCT~zBDA0kn{IF zL};SxWkB)`ex&m&!ExW9G1hI^v6s3E{%~D=CmW3SWvnz6Q0mJsMCWG*ifKaE!yhrX z7JMmHLz}w5ZQ;JS@Yuv0tu;1=NB4+{Cr1}u{8HP0sZ3KjBg;hQpauc4u$t$bUF7^! 
zduE%y)>wY_=Fm-Ka)QbvUhIb53w>0_BtvoLOKE}KI(t)p1xMnucjU8!*s>D3?$GTM zppml<*R`EWGSuN~yroI0aO;>vP5t*YJ98K_dM-SEa4l5!fmQd~f*~noyir4inebqB zq27fHmxDdav6Vq|p>ml5HVk`hrY^1XSybrxDb`52cY$>;$uLu-$LWqLvavIf)Lg1M z`$`sFhI*Zx)W-V*@cxsNw(gb^CnKc1pyIEi!VZlyBn3i!la{txRq{yNOQ`pd6wR9J z;)^i*Kl_#r@dydWX||(-9>Fej%>h`Zi*q92`_a7&5gLivf&RgdRI#HuC|$l_#3`lQ zdHVFqlg@EVOA!z3wk!H}1!7P7xh?c7wQdL7uAJi2{PCCZ*qtzzytsMhN@?NF&ek~o zp#1Qwf32HT1yWO^W6Sz6oJL)8JKBedr_7N?4((v;+ zK+q7p&dYd+*oY@(GJfO4rkr>EfbRF}4=)7>=R`sP$C_^Lb1Iaqb(0r3w7-7ZCWHjz ziXxv!nT0>k5)rmzwH7{LEuv2>TEI`;YOAgETK4y7*KM9}tD&#zm-_eR_5Z@5tRB0w z(iKORV>(!K2d*>bTDHETH&y!e*VkZ%zN3Ay(vvB$(jF_Uz5VsUm7w#jclTAx)K9a| zD(LyQ+B3kt9|qKiDwpWDYI+oBq6O7Ueg8f>AX4_ySKW%Lv+Tl-QSNT>T#%^_2OHrk$2JOR6LjNo+yDM- zd24qHs3N77eTbPZ_3EJS)&1mPJkTOS(U^2to5em%yac)*lRJ!J*R<2xYqAZpjO$mQ z6s9^{JAln%KO|@gZ{;ZieO1i1Ku<(}m2^Yt71gFzhZ5xjMKA1!Fp1Yd(MbrpvQzhi zvu)Q+v(g^c+1ly~9tZ7^10kQoivgqc>5DD*{w<4sN6?mXjEEihuE?)uulu*niP(7* zi1A?Pw|~ma+;5aCe6*O&{3VAA-Ue0@_TcLFVO#+d_n?ZL(LaF9AqjQ048zh*7D zgF=AMXQD+vfcXjbhH~r*2bUAey-JWS-Z7wxrKM=z8+MQu^*E1fn#g*=f}CWYNAek_1_D- zIg%fm6O<*p{Ksv9@Dp7i$YK5{E7RW%JEyH4&OU4lS39h?qf0sWz%|E(y}&MgmYsdYgiS&;pI z4<4Ll?OlCi&-(nO0wg2KSfbCtz383B8Nxx}1kjE0-!2Atls;f20Th6AZvnO#2aOGV zB%%Sx2qe+~%9+X4HvhlCF==3p&<*=V((M!iq+oA}C~Iy;tJAKcaJsng$bT%>TQ=R( zFMxN71i=u-3kk~s71^o79C%QPh&4ZKf<%j<2k6CWv!sdek$ko*%}Bx&7>2}L%z)zr zw3R$SXP}eK&!|@X)VTPkD;mKH%?L?QXF!cD06g_AiazOHqlwjR(^2-FW34akbXAFN zY?%J{N!BBS<-jANOh!aZq!0?!XwAIv-BuQmvDS)FvZ8GC2#W`o!Z zGoC;Hx4ALAhp{I_D&wGvD$)W_o2R2cp|6g*@!ru01&>5Q$c*wMA2l1~^`f4+NDGVv zrzqIGjd;51LqtD0p7wrGz+V|nwXmSETJVf*WDV9++x(?Gya-^Ib zi;5?{m;(vVRNdfvZD>_}DUn|7%2q(6 z2~~+oLk}GKx;I5Y3S&d-^>UNgcJDGPA2(*W@oVsTQR-)>Q3g`M-r{q_{x-H-+g-)UA$eLiJXV&jB?W4MzU`!~Z%E zsqu?S{1yaeiOWzR$+%oWJ$x_uG6g%28GfgPo5Lh-=pl(8R zpyTs@3FQ1_GdvW|`Cp$%gt_vRY6k<8*7B7c~O4-=`EX8FEH^T9FoLlB!N-?5qK`fhM3%Zt^zF z7@!gJf8?E7FdihDMmDxy+v)UT%D+Op$ie5g5*z&+T(j)|^Xd8u!V{S=fLIo+vPD@WSy8lhxx@ubt*u+V_!ipVmQJ|-cLt6LbHn62{|_|H z)R 
z1StZndw^bbt-~wYfc}#=3-_Z4YE4N2b}b&Xl;_c2Dnj9b#5fQCxIe_G2qM{DXvWG; z_MwI#_?R*NG(bSvLO+RqCB!8BT!Lbu-I@}ZH}>tY!aX6?*4$VBa-5i$=R_Jq_<>v! z{p;UtE2$(|w)d<=SfHKxZ}*B75InB|vYblkiu!w^tlLZiWud zWWm8661R?01^kfV@*0bN7M;*L<|^8Q^l!Km^))~J>(L_^P^}VLrb_w4&d?v$MupqM zx;S*ZyS|=E0zpi@oYD6No}WtOtEBHVl)9zqr5qvQ$T;Az9^3+|!0PXK8{T>U*(*ky zYxR4R9os@c$)(RNJ0 z$COi%$Z;o0#pWoTTYqex@j^Y(b?hybpMCow>=h05tCpV+h@1X0Fp#zGmI#oD)}DH) z=YRI}PK}|?^gX@Y4{_Q-yEj@g$1MEJYmINi3+%CBy&oFn8n*4RC}n>`Fy-T)Aqhn~ zX>y|4AG$f_W38rJulF6UxS-+v$w{uKLM>Tm%oII15?@kkKewYSl>`7cEn}S`bv56U z%gy4C8ydHH@|d}iJ9{*DHpip8wyQ4tHmo!<DQ7Qe z7Zx}jKRV840b|?!xjqv)4<3N02G9_Jb5XXUkX*qW$ zHqS{~>D5q4GL8c|WU8aC3PW6=!7)D4usc3b zYfVC2aMqh;nZL2^d$HF#g(M@ZOgt~7E`vsr@uTifNHfh;YHLlWG?lDk)OiF{2p1bq zzjG*YCkWVeAV0OZCLLj1Gi(O$35SnU<@#tMX!M}YZMTGQHy4u@k1D_-czShdd#9xR zTm9C%rmW0tbNQ2oW;Y}|=VXe$eL1zV>L1X5-L3z+v70I(>bC_K&#o1he)Y2HoipP! zSjwE(�qc__QD5yO_!ZRz z5~q(PDdGyZ@07B8M)8XfAH7Iv0BjsfOXhMmVN3KK=*4^?(B z7)f7$KOeH`>#(QP-q}h3}V#*&prrb@=#f|X0=W}uvaT`XrnsJbL1ep-3mr(A-p zg2k=uUU@B3T92-haCPiN33>4}78|Ku9Huo%-OMkU%vq|CbL+Qy;2D=@J)U8u>^S*k zqYK2hxMroLG&CahiLH|v)5;!s;!+C8fJbB^g+`>1os+Q}Dr2}_iP7RhG3Sbb&OjBI zi?L5sabpf4Vf#842u`WDl*2EPrAt%?q9@UdSlemps1_7Hl25voAot#uAn`bh}bzyT0(0 zW4LvfL2Z5YpZ!d@c$y@M1=$nK8&19zRu78GbC%T=6y;doQx%!`*k&gyJgyEdnl+Q# z=a32;^RK{h7=5l;)6^Roak)v6y8x>EP-A@-sQr9`tatsZ(rx-M5RQ@F$&l?|ALiEL zlHC7oaqCZs!g$^IL(4Z5Da~S@4JBY{me5lxmhs|mhAw3>sM-2wHRJcQ1ZeVmheW=O zeoRwzy9>(lPyp5Y?MEEmmC{p0Ycl;9V~52N_1eVp*-$|$w*j)AoNs+mSt24eyD|1s zQW46jRM1R)MXVxZCKa%3QHgtILPdD*amz~b>tRzh!I02xQ$6jU!)AM235r~mu60w2 z5@OoUFIovIO~6qkM#p~7^kK590oQlz<%5uOj)y*N42T;3?on*1=j zs`1b2YO8UkvXGD`)86&-eG84+ctxe|C)Vf}^TMybmONg%`r6*3|2f;uYrzgPyl>mZ z9`zWoZ`yK=&WuUD5R6gD{4+VE&x9*S>V-YG3e!c}xUe%!H)3??`05fE0^CCz47Bo9 zLTz}=0**HBkBp?(RLxlQ_R{A;;1Zz9WWzJNw7%*u8Cf;wlImku%9uAuzhAi7nN7@j zV^*0Eb3lct)$J(LT9+H8d4xVj?WS z&xc2BP&dBo(k6F2sfz2U*(Gn@5l6f&PB6Do>CUZL<}4$A{0;-w zAsZ}&A@J-!5C&tvCSQ097s4WyHPrjB;o=NXa~v;P#W8==pN?U^e{3^eq-03wuJN1S 
z1i)Y3T;GeuRixnlksVa}Hrh+Ui1X=<@}j;8pg%p5{zJ@h+^tTxJ5G4Gt^P;F*iLs% zLSUQ@{RCye_Qp?N1`3H<&bUq`mGtR3Inm~6fRH79GOrh0wMe(!GA$}zZmc0v1|xyn z$apq;eAVwpG*>U52?-Umor_KtiG7rN#dWUP54B8bqM>t?7r#wiF3@Op;EraxWI;mq zm<0dJ>9=)G6k_dm^i~S_d;1>lQsAgUTtAYT72+2x=~@GGeG-g@7+ ztXfGP?v$LgVQx-Ur%&d(xtZu$U=zjd$eotDKP8wq6jq?K^K&_MK%GM_<^SsUCHfhM zBsrvAWb=orq4sy}lpf2z*_nZ>k4KJM#pf*8VoM7VoB-8Sjg6>-9*cGA6%%cV`S<-% zHKZbr^FMYIDOP^^Riivgm z&`u!sNd7%3Q_{}xQXoRA9vxJb8(tI9&ju%s+|F2#JVuS zM;zw4+cC(r)`h9ZkMd;sI#C?KC7=QW2>t4AjZRQW#cK56=c-n!@CYQcSG*uTk#ag) z^})wx`-PvZ4NAo7!<{^kWJD^^mysuc)WRMq66|ZY;@zG16 za_|r6FSXe!K_gG&oyfhrkgo}fXJOH(H|>UK$MAkH>3vH=#;ZL(+SJuRVCi+?Q;9xP zO-M+jkM2By)ePM{bs* zTkViwQ2eBlC8SPmEUeVfx!=e~H8;yu?nrK*pTO(aed!X8T+!O&ldYE9bUO}-miO8l zM<32XiDp%h%f4hR{raA9QFrRdLP}soQG!u~*WX733j%q}bkq9!V(V#+1Z9u(Vhs*s zJY#8BiF~duh=8~#M#mdkJZbYMjNXNRePf@rN>+aI;0EKayQ#F38q5B~um84&ZyzpX z`K63i)-ISRq-An{M|-ad8i=|*6sm2X9V2pFu$yD(8S)q=fSn92x(?08zry3Qm8S7t zhO4=EhZNXxw|SmelY5z-X5XI^7VCD!VuhFCiY@`fyMK|%C09WJ|7HEA?D3tIcY1S+ z3HY9$vgD__f@uB-LmQYppjR6g5pW<88aKEf%xC~AhW?Mm5k&bD!P-H0w^W)MGw6p zncYr%5pzW~_*a#6-Ve;rV#51AlXclb!jMOuuY5>xUk_Ns6xeP{nf6A`>IZYiV_M1Y3C+Kq-Nelh^7!=M1`wyeLr+}b&S9l5$|tF&DE=C zDJ-Pz)ERK3j>HkUTKbg{i9N=?=w!d3K$#a!{gBNr?KKj-Bnq#UlAw_ow1lSU#UF9s zA|3V4PKT4NbgXh59sCx#cZz!?z{{#g9E%HSLNT`E$CHrOs$h62V5M|75-8ehy?DH) z0`J3=HmEvK^>vG%x#D=3)W^SrxAY&#`>v>?4+83%GkoiRuYkKSTOZ*I^jA%2B8BG} z0kg-

yfC{H7niTkp47dS@NNJ>6eUsCK1(Yx`#y(|5-o(k8}0Cb{rgie1Del znycm8O=>=H@t0uwqv87fW7}b21;GM#d5u$w#cLt>&|<^Il^bZOJA_oCG6s-=0Ed$4 zL;?39B@T*wN{=2lwDf0n689XRnj-6&-v4|sViC{;2(+gRl?-~THC;V_8CXmOh{Tfs zz0wAl5&53wORkZBwI!de=!YcLl7S|vB}~$Lr+Ch#q92FFx+!TU4eyHM(oZ^vH6u|` z?U)W<=6!SJmXd0ntb~>YY&CEFD#o>{uHIB{w9gViD%xxPrn)RBoj(i-Yv_-r{Fz(d=nA?^~hbTILlPNPw?H&9j@K^@rWm5{H)^EIts=3kwg5`9#|&>5Di17T9q zPtxJkf_vuiWtZB2`h@32Erm5-h2@{AJTeEEd)hhPpwFAp`&fIZC-wn@T-`J^B_}03&pHUw{=pW;uPorWxEF) zs%>3|g`|nA&7Gza6GW43Z+CT(v50s-3x&(kI@7|H@?ru&-d4_%y)-??XWzbgc)U3k zuuzq_->;a!(NDSdny%m~`mXo_xX&}N7naqH+Oe&k)P@*tbPUt!B#h`k7=C+jOVb$^hJEcvngI!$idoy zqPWZGYVnbv!cwr1OZDBuBQI!9Lx8z~d_WH3_Fpe35HQRLHs(JQ0*4M^V2l#6Og6pi z9a67jaw$YxgsJxF0zUQvCOF8}jwF=o3x*0^S}(}XHpiFRQ$17M^~sVcyRy+nuoITt z5HSm18W<%)dg-=Z3M#q|dhGVjEOh`26pJMtk^odg!hqq|RVY6yr9X3uy!{MT>2_#S zDCQv!W27}!+NBw%l`$hee|ky2MZj=G~!0l z$#Q{Fh3azb;ClF-?bOorS9@}eB@yTeFzw0*sI8nr>=MU6f!FHXGtr(euu|6G)?n2O z=nDJ))f-6^z|oLRfSOKgk-+SoUIHO|Ht6j7Z~ToSz|euzYGQp;k(d7LS8?s$!QM!4 zH8N4$d%r5W=*!yxIuw5v#l(XGN@)ghSAYy2&0#zjO*qm1?*@9r_+^Z1Z|LJXWJ0Gk zm2$f|#tUgxds8F-Rp-XA5gobJM~9FS{3>aHy$Ev1v(iue_@77#DV8&s3Pad^T`s{Z zb?Sd;t2fE7A_@_<*!*`-2GjQVSh-2CZk_cE4DW2Ps#^pUb%{qY#ZJ~gjfuWj)PzPq z%f#JQmRjbti;SG}v`v;eH3vzw7%E z7l3ma1M!aPwee(`2f1~#!RK3_1Z1E1H*g`s3Vlcf@K3M`bXT}qXYD5U8#^0R8~@`| zYv@xZh4_5D`wGxbG+66z&iF`KP3Q4h9onuUB6sV z_!|X=X#@3W+IFb<2NiLv0TfvZ06w!si>B4uKGH0b>z+jkp{eJ*y*>SwlKZO0Wx>Z+h+EdykC)Xqc#CJAEDF@L;iDp=^a>3$*KTU;R$MF5 zF+=;-dYM;1kSvkO|GMU4+S9uZ%AGf#^t&l_j{W<>=Wb)(+v8ao&=Pt5X#ctR zC!U4DP!QffUyL5Bx=Q+*)QBxBy?m%CUr>$owdVW(eykJYGcKz8llDH;Q=x2bS~(X~ z8Y@Pohm4JE{&}+TXBgn;J)8)(h@+8R&I|4E?*A^@P|cpZx&3k~vcr-s;`!0NB^0Y1 z{Gq*e5%VCDfemL>KHS8!pezBF>dCYp!=^%V#P#;0qv{{?h<<$*BjtbV^>Mci_>ugo zQ2ha>7W+eIp_f7Ze}4>ov^gx}KYurReMIA*znPrF{|8NnaSqn_HFE-er0>|#@_#-$ zbp0acA6tdt_=5S@uFgM+_pcXME)HKgcy0`@P zW5s|a1Uue#R|jD*mlbfc$c6}kBN-JV7H~lcjX_F$n(=$-D#(z53akIsYu8|GRx*eu zSkCpxfw+(yba++UHUGImqjyI%00&-}1G;+}*1>T)djh$8fZhT1bXpL1yb0Q4(7Kj` zNtb8@)h;LjE9IL!d$oO(+;1Jn1r4tP`IqZ?Cpg?e{$C#kS$$gq0Wxs|EbLV0s{U67 
zO>tPx&$ltN_t2ObG@it*RxRJ;E^yV6fHZ1?BS`{pQ2a^(XJ<<~bsYXs>~0UM$J;*KAZqC-+{r$Da^P4?8PyKqkVL3-n3nhFA~ zg!40s0M9+ID6MZ5M5@eG-_#Y5vS?3&6f|Lg0O-SW~0I}Fcv z0NyO6oF*V>N0W9z0i*S)_1diBh$)Qpnwv-H1T@C`#RPkcIH;)#BTy3m*5oxb3K|3@ z7_)bP|BWWkLYE#p2Jy+I23EPJPoI`rB9RD~NH*uc4|*bUAckGk%cNeK3 zSn%l7R04>lAf;{~#RniYEl~a?PErLdhYUz;BH1%woK8a9z75n%BY}7inhq~OL%?bw z$uLJL$8_WYR*nM()B(}P+s8)^hMmf#si^{-K?-CXEI}O)<{-ycvyX8*?G+p2 zf>a(Tfhd4bOB={vAZ-S;(JHph{`z$L`TM;|m}?Ac%^-pR52gEicCdpaNtjgrbaQh< zlgHFbY^T0Ir1ArS_XbDM6hty*XxuCklTTGC_>6Wn%E{2@Hz_ zlFD;#yR~FsK+2+9O;u3@0T~5=nxPp=a>-9K%6E|-3eeZ(N>|qge7so!C&352*KbQ8 z-H*ngh)9ECcJ{LhJdqY4ajhV&dO&`l$3qJY`yvsrTw7HFdy#Zr%3aM>K zehS2AX>Q!0hr!SQH$gJ~?bx|=4}lU2BSe^<7o?ky+D^mnh?u>sf59_Zq=Q=Do9zfj zu)7MAeh!peWZ~V(z`DvM$up`JnB51h_3tnpGahF6yZ~XwAQ=4xGU3c3Fusy)dTVQ| zr#}uvUZRtn7*)t&h?>>@bXUK{bejkqb{TkKl%N}h=ELCPc>yK{t!X>L#KD$;CzGh; zPyO&nQqp>hUrYwnb^1p#ZI-pg!*~_pRaEEAff6aV`S^RJ@`uRCNPq3>)vM%=jnM4@ z8Ej#=Vvb!#1?;%x99;x51gbp9>wiqMhrueJCX@N2OaHbkTKa@3r11~Fh*7h zgggMXrfKdds=(aE39k$(goCat*Xr@m`ufMgRluy@>MYyST6gmSsX|=$M@-T}Yj&E0}5q}y2;lLu)!l<4+MtDQ8IPM=lV1bk%G$jZ^ z^0^2}nTB)>0sci{yITP8KZC7qwS5La2ZHK~V`)ho9xC0-$iy-(zdmB)c&uoTPEMaWFaih! 
z_Be1v;8e(FX^Vr>32AHe1e|R;G!FzocXv9#s3gEdfkc&pOwj5-4--*g%$`UKXe5zu z-BQic_Jp~Em$CCtK(vSf3^B+J4vbg52Bi^E(EsX$4L038L!X*AU@HcP%a0oNKk^L< zpmhZg5ew(*QQ@UVgswtw52VLi!1RK^p0fnQ&sn}pNgBf&#R6&FJK`>H(L8@*raTO^}!^2+_0Xc+H_{CJv zG@$?)DfuKh?-iI&7Y~x8wlGljl!GJKfIiOZp!?V0{L^H{-s^yPO^fVkIn(IyaO9b5 z4<VqL)v; z729hIyi|kONi@I{3HE?ewfZRn0?QX$Oggki;RUszDR#5QY8nj_ARUgS1%HOd!0R7d zf&+u5av*6%>(4ro|5>}_$_&V}}ynE)S)gIAgeBNXoQSAnuz8YuK2LkvbMYx4^= z$acXo`}Px#V;nm3QG9^@L_;@_>mMPnC0K`i0bjoD`fiims=>%z-k)II3tgQ7K z*Y)-)UwiOg3PKsf#_yncHRVF0)?Dd9LL&mcDFZ!A@EsaIYugmS_tL^hNATwDNV2Dj z!uq1Z=^HR!7w&!wjE-j}cy2U56P^K0e|-Tmm}qZil!5mBeKfEYb}5QFK#)xj1`puo z#z@DA0$48qp@Ts#_4eRQ9gJv&2S#JOLirt(K#0e7#FGQX4)!dhydddw>Gtn$(O4x2 zJ+M3FenGBNEFhZI)zz{3hoBE-Wd99xqV-`^sx8PXP>X>060%e9_b31fqm@HO6%BQp zT|fs9z=fVkTLRm&ENGmDRa0+Y;Kd~-5=%!OrHP*B1dbN_`ZKx)!FdU!gE-50A=lxVRlGp|@NU~SFpyY&j zIcTQ(I*vQ&m?B-A@9=Mr9n;IOdtgRO5R6)aX`(fzi7&IXN~K z18SG?pkw*15Ojv0f;d+P44jVy#rCh){}fXY@Ig~cMs|Y$XbEb4H8E)@92_H3Y_+To zQuxT0c2ytpynH;TBeTC3$b5Trr*snU*s*xB|pMwJ_3;zMZ z4wFa=NJ$_~Ag=6%A22)T@hgF=FYWARkbeY|!8|ohMC=J#TcC}2x=(#!pu}*VEeGV@ zaN^1f*Ytk8mQ9epk0h7Bc~LQoD3v*upmXmw?cWB|7o>-LK?E;NU0{Tn2zHfRq!4Zi z3}j$eD49TFnh9ucw<+&9H*vzIZ5i`|uLXLGFQgwsaNr zD?1_Cp%`eYV=hZZxy$C#82c*~G;f)_bfOlVYb*o|-O2Rx`ba_)>_nLHE`$##2z6V3 zo(s!H>hZeY;3YG<+G-q4Na`;Iv4+U?_k*QZ29M zt{kp*@Q3@*f4kIoGc@ETV9nPD?2IF|j0(ms1dpNlMPXsMlA}Z5r2zPQX^!7!X>^rC z7L9OwUsXVolrQ~cAqs|zRoB(km*cygo-}d)7m0CGAH#gU;-ZQcL^D-A#cY@)csd#m-Q?EAq@rWy?77urQ$RYsd6gEpa@kLCj1OS=Qgz59}F_8aB0w)PGfIDD3f~W3V+qKyQ2p@XelAc7l zLNprBG+jQxlP#{{JHotY^n#e z0ze)4ICeVdDqR|Q1{7k|t;O;c{eRVcc{tX2+rGxrn9<;wL1s!c6D4WSQXwsBZW5*J zQHks!OUTm9s3~c=i9(`e$(Gx`rl};dWVvNc*^;fuzP;ylYvx&w_n-Ip9q;dWUdQn) zaWCKR=ll6w%XyvGb&6r^uj;-9%Ndv&SJZ~MJ0@fMfos|~7$qWSD0w`7E3dBrZ6J|m z7%X-dL*gD*PRiE@1O~RVw8TV3je2KR_B%MJdc*J(DqWSm<(F_%TtD>)CoyN5a*b^P z-e6f7u^nT8sxTNA=g>|+QHO#rTy3(L*Vs`KO0#Rm!8O4QzG{fEMWc0v1Pc(h>B45n zqPyUHK~%6LT9lkpesMq!-No?SfHF$~v3Hpx5H6{r% zDQsTPb>Y=8={GgloI2=WtgWV|cD502=i+dldFf)-MM3+)FQxYUVsLa+NZMlC{OJ#8nT7~* 
zG?peG&dm^WYyXyJRAcqD?d>1n-j09L{hSU47yv`{S(J>xXfy)n zMI(gC(wBUKP2KfqYBz>t#bN;&2jCJP4-W=8vjx6=^yraXsqggIS>{dPJ|DB7IrDi^ zG_QMUzI(4KEciON9!#Dw7rFNUk2~`!c1%bSw&sDbpm{RvcHTY z;@oTDFwPt;kb9cFN3&&ebH}YaJ?=(Zg(t8-Zm~>^ghfP7G^wjIpdE4wAG)msuB@awDmRFlcLAVez`^|t|h>-jw)=EgU8{^Y}d_p$J|y+ zxz{=L;lnzyt?o}Pa2oxe!4o*g?!z#=k%8AIWiruulJe!k9~Bf{b%yI>Y`as5m%||h z`elx%PoKW#j7X}jvLN}EOJd&Plrw#vk43+3f~t<|p;Nd5E#Bgg%ge0qtozq1z{u23 zMT<`=D3^g3DurYSPbKQ5{1M5*Ih8BDTrgX5(Z`1?)bb6I7oFp` zVC}P63|x}^dRBB>afsA6pAVtdB0!kgkS6~9ShHyyW^KzGEe*DbNq2O*n20m?y7OWU zY111hL%j?sX+cWUgQGh`k9gqlP3LGmh3t&<{*!5x} z%_9dwq{>FCRV79g`gb7lv?vK&SyEnJt}BNwo3;6$m}SE8cwzcY{$6*8wrimqZetLQ zzQ?5L)XMlX$pCBSFIWg=mvYNK`zHWcXc+X=srJ*Maj^L$gp#6&2pe{~eswNo8@H94 z-+0!=?tk1VWfImwdB$rZT-v=>m^GvpfCvHN8aScA+_ebl6 zmlNRhjS_#U=i_QMpr{5x3^wcv#6}3yrGcy>ZDj@r2k-C4sD9ShS`=xJv=tOCerA#) z4%`m;-&_Rc33#ex>EOv&*$tD(h!4TF+?v@!qwZ;fHCBZgh1;{hZBz1K2N&)8+iz0n z_93T%Nq2-I0T808`TI`#<^G^}`T@yAF?6B&3mu+egqDy?+94(;Mlm1^`aN9q&rC$b zF_48WkMtbvt=ERa>vf54Y==4w=e+|j^J;JXa3QzAyR$rE_Ji?`muYM&btA1Y&XZ!` zSO$N>Ld9V;C1v+Ofe0GI_S^y8g!e$+F#6ugkuzq#jV+kEk1ftjGTsh}q5YwbG!?Xr^;vWAM7Y~-IOb~-B#c6#$=M2Lq&Ot3w$E$6E^sz- zuGPpSpnX8LVng-I~oGubuZoPNUIx@x<0+wSRsW2@j3dS#JYV|BS2DY&`VnUWKzG-7gWV| zxWw5G{wpKSE>=PNWjZ`1P2)#AtV_1|pwz^~6-sSxZ!^K&Srtr=HTQOObx2*%Ph=o#Po0)gl`EYjeWN+;}*E2dgjc8 zJ^AWZH8Gv|S}T`j_MXUSFU17_hcjAqQR@NC_0awG?pBsT;uZ*nQfDbb8C=tSgeeql zAv4#o)P}!7ydQ&Xf^e|G!NCKFfGNJGtSM0Lx+_pJ!oMLlc{d?JhWtO)vIa-wy;iH! 
z=uj(mIzog4&_#patBKgojYz*dQaoGj*Ie=0!GFQEuBIw{Y37@IyZU(9Vs8CZT)&X- zKjU73I_1)fU;o4v=!RWSPXao6^&0DIAbpN?l?@sfcsGHmzxQFL%;oRCARTvuG>fAy z8?h){ncLZU>#xc#*h6qhd67^h{c`T-e_C((^L>yY|Nm9u`H%Zb`z1iLqcISd!axje zH<#y(nrQu!LXlSYgX?nVaj9wIsp}*e#X(}>J!bYp8%PUav9wM?Y(&IT8v9K;9e6~M zKthX;rQB$4-vD#2$|E3FgZW^>P=ZE=^B&sP*rd!TSba+%E^9nyC~E?$HYan6aqs-K zTLJHfpBZFONTDZ>G8Eqr8=gq^;ZbwhBWf`g=M+}L{O28M{qibuC3pZMWZH2k2{2E3 z;G%%)IL%DCp|+w9jlUoXh#l{QOxAuC6`$s5RMCF8(gk@B9EIG@`GNB&#MR(hyOv*E zIe}Ezkr6N@fD=OCcROs9e6|@qFqCsThnG?6PU*mxO-6`g!@XzRaKN9N=O-s8BS(wE zN}z7SG5qCW`w_kD@5EG7Ac9gI3xhRq@;Bsu{WRl5g&6K=7qS$Nj1tYkrM}o~KH@8A z^e;#;!iATT_p1jZY}!yZP-X1P-dSMCRpTB=~c)i8Ii--zTA4k<1!b))(A`_N(G_ zSx&t>SO=1imp;4`jCyW|$;Tqk(37toYzG8WG>9Jh1>dySK|LE%0{KR@X^t^@qJGFf zdN77uZbAw9?`_YKn{Yj*j-~ii)bkQ7{qP+f9eP(0d2*EmFP2X%Occ$%n&8&B&$7Ee zRjq97wWsftXI@RF^Af(^S6Zl!FVCnH^`828{$mzW51QleJa7tUV$oI?n}T7)2Wf@{ zz$8Pqd3&9mECtq!D(fd?|5A`3#m*^t@q&VlT8OU0jA`V1jIJ4c_8aP2jcIQ%wTT%8 zRE+dpaFqejesvljG^5>#hg3<2U+8PzG2Ev`u%0uqvXqCy9bbVA)xx`9VUqzZD@5|Y^tDQ5l~GvmZ!ijRaHxj($RHN8 zALz6~6#VM($(fQ|oWdc@v`&$;c@=?+BdIIgfHuZmlUBif+{I<_*xA>lbL2JcPG?GF zt?@x59vJpbE&yPJKGk(;ute`Poe$5evV9a$JP(;q2gZlxGw%peyD9er0|}dD;$>$u zYA)G$PQTu0@L>8<(Xd6dw^Kpfr^OZ|!`qfp+cIxMJt7|(>i-@Q?05z&&OUTSR|7Zs z1T6~+K6Cmq8ZV~L;^C7Zby5BWM*?SKG;gyZ7%q!pjF_rm5FQ?0H*D}i&b8kN)vS8G zBqZk~caErc>%rGn>3|yaX{P(o5};8Y-2f5^l?9>_Ph-|mIk3<2M65)Z?I&!pg_HT% z5^D5Z)p}FbMB@7)#pg>gx5ll0!as-9gP@1 z&kq|`b9yziePAFN&#@FT1whlOAe^8R|FPhAfB)JAYx7zwd&rb|ptyP&Sh&;QkXpl{ zzIe>Ft|ZY^ggIOua);Z^7~3&n>*`4P7ea@DT&C$mkSJL8ROeoVLlI9V^V%+Nb%MS9 znZ?&c<2jH8Eopd$or%SgFdYXPT7$2O0D$%?kR(zt^F9`@lSW920G&l;Og#35`vad5T7IOf|3hUf}`2-x*%T+(kSN}OoH~J@QQON8${A}kgtu=)7`5X zDp&SZJN7fi6!?(7NvJ1Ha>shpSY+TUu~;rr=P|%s5!eIX=2`e$5+ERnrkdFO6R1KY zVbOnR96HyN2Xu0i*VJhUMbj{NQl`bkU;jQTr_mk3XR0v)B_{zwERNo5Oi2v3pT9~= z?EYm`)8N2<*1w~UiZjCz&t=lVj77jW(fE2s=iOhcPo?d@R$sfIEAkPaZA*r>OUzs8 zOBh%4+ND93Px@EXAQF&jFD^yVO&eMon`~aKn}*I135lzD<|HA!1yo$FCx%&X_Y`}??7u1an1 z-arln1-HLCE${Q#3LgcGbs6-CKA$%ekuPh~$sbK_appUb>`~CE3(%cqP 
zau;f=y#Z6U8;SIWAo!Pf%o|TAZgWhe(#cn$aA#St`2{;l|+limmd59tstrk$-dV&H>Yqxc(T0Oqz zXD5A9HLVv!rfR>o$BPeJ3lx1-kSM)plmgiI;-kHbGvD>co_E0pzm=gW^4)^+;y`IrkzW zB6gHgNf-WJKDrAd@egNz-BFj*5oCCANwMH}-;w-^sT<_Z_-lJGbMf1f(Sg^=X=!Oc zaoa%w>0)h$QTDbXp=iJv0GA!!{?@#~?X@cb#Kekd_nMxbu0`R4>ekF%Qi%Gc%s5O0 z6sdB)oHLpwE24dhoc%OQrR2{x?OlA?=-`=<&DF|KaXLUcrU>YrVS`E<8*>X;C;$@X z$WiZU%qWZN%ffy~#Aq%fvZA&9)((P7RD;$^r!}r}Z7rO4zR@(p2l$w{20<>60~WCs z$8v5+Do($7+IIT5nOQf&U*8~sZ@x*(J|BD+^{Bpmb-??yfA45H)olPD zThxYc+R}9HZNUH@YAZrZ4yB@e7ki46F-TR6EjVJ7Dea~Fo}ISkQQlX0jJm?x>NH#r z34uN^{O;vW8Wx2u4Usx!g6z)=h_f-{MR>^)TWw`j9Cx1k<|U#f%9vx)-XI>88_js` z;T?UaoeyxyN7bN;#nHUg(3)ub2*sp=J0T^~YYa0k!yv@c<-G4Z4?wL6P0D_1lLF(?q`G4Kf zkTlEuB_}^|`i20iK(lYS9)ruKsHWtk+!TGeU}oUvvARi*&>SoMwNJYH7>sQ9T1y7^ z@AVkBkC9Wm<1}w8FM{ttwez*s`O=R}{vh1t!8(6_6WFVDvL$?*?3(llIR)dyG{NuEB_}nh ziu_>BBrMmKO??L#sGv&&?v8YH0}7H8rIZ9UGtNcD#e{G8WgJgM3UM2vp|nDGHxp}A zU5CM`$;wMK&&tq5v}~B*@e6XE)}kG3Cq6ik>SoG;RP@dJ?>QDaKdAYtCQGdB;-M2p zSIk*8Qr?mAX;k7d-#n0SnY0Hx+%~A~?b|Sxi3eib9J2UCc~w^fh>uP5V(wSt!Pe?zt*kD$X*0G#BC0Vl-^q@;d6H zz@~und){;?D$@V0#;*=;;ak;}bju(iVr7!zpy=s6(K5UX76c$?Ovyv|@m% zVw(^^`#&oS5gynR&gz7K^h;z2wi72|vXT{SKV`H(vyNd<|woPgMv2dX+LR8ft2K22Ellpfw+U1s+^`YWu znDIj7#cPiNB~MT~#!FfpOIH`wm;uAPH+?h%qT7qF;FPqbc~Ev*=!wMtN`k)EHOihS zZ34qPYreX_N`CDMqpd~Gb*!VV7LT?ySl2(;nr3R`ieFEiyLHhTL6cndzRe)T7O?co z7e@_&8=S{hp>XwMcx6ML+o?!1Wl&3?diQ-PPPsF~P>rhIgv%C# zUZf`vP6R69jnQEMdA~$KSEd4E1p!`2Cz=P{-y`Ps|zku;63F4A!NQaN%_T~cEQUf^19mb?mrGD>pkjSe)B!>&T53bF^J`%eab41OC@qn zviG#QdXUU|HYBiifHUb*3Z2BJ(^J57kxFD2%fTcYL%Uc$$OJm`o{+RbhO#ov7^Ga} zyjw{nSyg#Y5^C|JC+U(&D{r22AIOR(5)-G>ug#i)V%`T*)YR#<3s+tX7i(L>C#I+e zQ+rJUNR=h1%aF7^ac%O-g$pkcL7t?fF`LizT4ABe95EDF&M;;hu7XZzr8_VQO5&l> z^6o=^0hXcFt2iWsR^ZPdLZ|uFh51?Yj+>iX0YISK61eVNYz_^Q_eC}r2hf=)*2ocg z_(dxT$5|k<=6h%GWk{dCzR7!KGG=I5A?apDQOTY34*>&^{B#W<=nEuE*HD9)kw`Q# zly@QdAo4IF%FW)h(*&$S*%%~=>qruatA=A#?L9kb0-Wmmc`PYXzyzFy>PO=GJpuoC=CUhN~7~J8z3oF94;OJ{0GY{Eey*a@^T|gUzTmxOP 
zL`4}eVaI#55%6MG0$Ks3?WzP(Sc%s~RgYKz6M!5-;Ms_$2PUL8DDNcw_MM=hzxqtk4J{m=U6MKQn9KEGZEreVFz64G3gNIYpES#Y%O5tWyCTwwhZ2k(pIx8=lFCZ$e->98;h4W8i%BELd$ z%&5Tf9`AQZ%TbQnM^bWBz;?ZjM^!u6QMzFI&7&o0sBS|BnsxI|56zLIkiiUmQ~(%O zy{}SG;aQF^A6aYdxK*)dx zgQ~gA+uNJCpMWY4LGpL6r6K-Af``a=NmyD~nx^4hC5Qu1I8e8Hl@pDbC+nbr?%#pL ze$kbHfB=f>pFVja1(XZ`krqzVA#h=X^TP-G5eZXq1iKh9O*^1cl+9?g;AcnH?B4N4 zvA74z)sTAP3Z#p`Szd~QBz6{E6;(>{)hAp8{qE&0@pj$ynb}7?;myVRqQA24D27$j zD%>R-6Zp-)a)iU43RM>JNg(ke4i>_)GCMV8fV2AnL>*=gz=Q?&H68}Ap@RLkF87Bu zx}aQw;rO!G9Lvkqu}EL=GKZKJ$RF+VKj~MYchCL&d;LWCg&hLYome6cAM60o_veZ9 zFuAh0IS78qvY9XbF@gC1v2W({Mc{w_{W|{FT6O-TzLHiu3JY!Z<_}+@{1Pwjp_eBh zB6TvtaOL%7|LBL*+P99&h6Q!+Jycu=Ojr}SlLuH)H~d~Q!2DWyDq$L6Y`^qujwtmShmdf}Ro$OHQjU=k-2=zVnU*7@Q4Gf&r&R9xCj5f^NRETt@EAlbIFVL%Xr@l&Ky1IISOu%ID!)&t!Z(&>TjMSLY2VMq;iWzm|#*HsZ!%$Rt*3IBx1pxYA(|&>L0@3JXJ+7ydCW{U80FD&tSN9xy15LD_;)g&bhe z_6ihq!OJ0PFp|D1J2AR4&X12H+^?Y0WN#`n3ZAJOP(aZHNK6{w1Tu-8Ewyf4W#9rG(al;;1UU_Zu@H;#&UMovC3mQ&go0rJway<}w@-sAh){tIlmlG4${jsmm>&WT zT#JkcQ<#$g$@PGGQj69CVBaQ3@=+J;3D8u237|_@Ts#>o@DAkpIuH`D`}4o(L40pP z1&#{u8Dg1}HZLe)q&Z)U*!w$i?$qm10?ME&mxXOkt(;EMM`2;Hd`7On^4M+k-Jplc z3buhdv(WaIh;k!QT0l}k;1D}AlKD_r@AgpZMBt~fQ~U~b7ll_*-`$eUj=KQIgBb@N z{dKtYWkjCkv^t#GhS%+3O!y2H4uc+l9~q?jfymjlr^F&QffNloqSIB`vOVlgBKp-v z$IDn}6D*NN*Y9HIrL5;Nx=i#Yq~+n*@<;ZHPrsg>(MA2#3d6tw9Xk970&@O+A``D30t|){XzPm%55MqP@{i;P#j}9B+n+I{v zHn%EDc5|0=*EQsOWMZY(qF@wXYtTAd8pb1~aa;q;>L7kr^muVd=TFulehK<=oX05R zQ53n6b$W#t0{)^2P*Hl+*XAEj*OOrb_jnZ&5MZjkcEyTk4Pa5z%aAoRgj~OFW;2Ns z;IEv2`s3T-hYvQG!(OqGh#2Z7=p9u8`04!V-Af(6r^lvI*yK$(g;Qs(r*|VA)`*Rj z|DmV4swx2+h-DzNum@_rx08)QCP+uNNN@5)8PFyJEukLtuRbuJWtEhaOq#pGJ<6?8 z^f+&f24S}c`SWpGu$VD)Ihu{EEx7&fHCiDfzS^ zT4OQ2$*v`@-n}T|)ug?tmmu0j)yJCnpaGn@6 zrO#AQD{#X!>wxj*Djvz{pLUB2newuCus!C+8sXgz=0_m;eM52T=H-;$f9t3*S*xgP6VINH+dzGu{$vn_Y?uXzEs1 zul^aYiS89K8JUVDlQZeAVLgBZV8YcWOuXAm4tqh4p}pG@GZDMH<4`jBSvt!bpnu{P z=lAU%wO@&AQu9`BgyaoM{vNs9jX* z@W@1~dnJ!=BWnM&zR)Asb2!IoJ{WPtk5V3 z*b+u%vFG(wuGtUVt7=1IV(#60UQscSW327w0>gJNCPrO)>NFfq(3paW;Q#;}U$)=x 
zzh9LA|GgVoRnB;4=n}V;c5e_ezzBw{Ih+%PtV#A{ixCpgL&WMC{IvZe`V!TcHvQIr zxM{ifE}6|!;4xZ-3qkKtphVkw+Y6^6HVN18xH*U^Xo9_9>^Xdh2yR69i!|FTpcPUa zf?fw1!!e`PhGSlD+~>A{Dti;Trfag-M7Rq(w7{T~h(icEKPXN=HCjVo$QI0saW-r> zXLvf)?JX0w0$2E$1%i{y==DbVImM=sfk^IyySiNO$xOOQEL)&<-0_dw<6;?xZ#g z@%E=~t%?LP;MS<8c0Lb}h(XX0*S!k~-HYIdeJg5Nz5^mjSO^i$S(x+Df^P82{P+Hf zaRat?Xxr%vCBWu^G8?wVr8gE|-HjBOk`W3s$l+i&)dG2`E%5YUWIRiYP%fTUQD-nx z^luwxM#Lu$B|XgO_b<9Onw{Tf6gKlbF*Mic;B31cUXp!uA_k-b!e6M|v}iW+wt~YB zLM!C#nqsL>3Dt4p()w?~RzSJP61EV$v-o+$POAWMT+33NJ1a1qW`eil&A4B$HJUptbH*a% ze8L552Z19^A4m%Xkpn$pP*kq@ zsdWf$3EhLeN4c^@OY+}tarJWXa7>OWUhVf5#QaK-X8IjNRN{GP9@6ER%sm+7xEp7 zI89y8w?JzVT)tQ9y7VE@`;R#`TxJQ5?Z<3`k&n}FCt$%j$G7^-@v(psVHuJgOZ6^McZdRPIDH?@qxOevcD zfB?#&eN4m`(HFYNyomW^vsOdz@b>)*0vAufN1)#KOTl>OV!rR!6?VQx z9wE)3ran^aC zcs$A;s)C|%>+fJ>Jctc$9YKq{|LoOd&xch!zM&8kB03eYC2E}Gp!N=8S?BIPdnkpV zC3w>7il1M=lg&rYSx-hth4DNMzJ8r}?d{-wn|L#rM4rMwEzr=JTYb6 zQsn}ZK+(B002DriEj`y{9aewoQyNLtV3VaVz}HWDAWDWig2TBBhY;v8)Kr4MN*v~U z>lS1EJpV#Ye%o!(mk{HsF-b6pd@x9F54M(V@1G&4aYb6enNp4qM#()M*74^91)&XH zYmhw|fF&Z=2)lX7D?h{Av4?mC{k7`Zeea-jgwnI$bVySf=xCTivIeoW@A>nMHg^Sh z!F;94iQ(C^XQ>R1KD?kWN*%=%OdKwI?^oanTcxaQj6-xyqD@`7OQoAt$D{g2Q7W7| zsW3*M{~?B{V7kX;(Kw)uR$xy4H_P(Pzb(sA0l1|aD8{d1PyvizOwl4i^XH!{VdEqj znu;HPgZ7h#rHPvZa$p*rfn12Ce;aN7L`NruVhPSh<354zufgHrT7)7f!Te#qWsSh2D;L0{W*;!{dJ z8+G(%=eom`6hkn@rl0W@ zIeD4OL}3AF1r%lL0@{6i_Ucff+eHsp&#)e>kO4Sn&28gI)GZ*?0$524buj8ov1t>v z4`+Uw*R{RAa4t6FM=n~_bw3-C0fHt94(B3+ske&uCqJ|DwKmW&1#pVif?rEr-CkZ^ zv^KCGH95}KR1Aly+K_wGtbtf$AUVO=5?lAaLHd2*Uz8oT@di@G2Sgn*1RBkR=urxb zA_BbmzxNWNonle7YX@}%4-dRHL3V&?1L5cj5L&V%5f(RohcEBM=)>7t10~I^{8d{ny+uv~;(y)tj{7a;1 z0*`LMa*f^~YI!E0G1QrITM15X`?(dr1jIn3w%Yg)Q4+CDy3uKjqTzlNvxZUXY|oYd zvCHRFV3dG@fLaB{(WD`U@)zA)HI`T6#|eMjrspiuUuhHhNWvs`v-2=Y%cwjNkO@|$ z26u5CrVY;&Ih^q#H1`1hdSPdLa}*AY<_b_*0O1wTG_!PrLx~2mI8yJ>bGXLIDiw%? 
zCz?i3XgcWV$f98p52yQ1F52Q0i-puVl8@E*1ZG2}Mq=;6*b?kO?5M^T^;M&OD*L@= z#!u8H8rGRNQ=?OfQbR?dV5)!fSVL9-|v=M8Q zO&%7yV;svwZ)ycV+HKuH(&VT(M>YASCy zbISI)qROex#3N02AGSLGzCHh!t(nT455sDso^y`#ZBF!*QB#n37Tu}FV65+dLlsQ) zFa>9DX02(>5ZlE58}3378@f?NxPywZIyHO8!<7gdc%e?f3F)YOyvB}(?{Y;bQUR*< zdm4%6qu$O}2ZOgN&8Gi)BUR>z^8ZSjUZ>)=8;uGP7M#Z&Odc5oD@g`bh-K=PH=UwAF{^?yG(A@^_x0b`=cIj`j%xof7USSJV77ti+$`BD#KjLz< zHgPVeRAGZGft(UFJ0YhhDOWA>5Pho8AoZ4gUb{7?fIHqt#g7?9*bLoY*$7eyI(N({ z>Yc^}2bsR*oRUPz4^#v51t$jeu!H>v4QUn<#vGJok!4qw;m3 zGF`MvT=C)pw2l76pt=anUNBmee6a1 z!sh_-cabpGCzz_Op^GXUUW&!JYJz!r_?Mk1b$p_s@u&ekn5j|3CRZ*W>@))i8)1qoAcrJRm5zbK%dAbL#4>&CSjG zA{7j*twWw2w~SVdIdA9S(Co}`T^mcmE-YeWJsKM^MLS({@a>94OYv7YwG3e{3}kBy7#Yj`Q)Hr>DF;iE@nq@0`_ zJ6Bh^Z}FE6jE&<4n3II|?gaTa#nl|Nc(T$Y`o4elt_iUSLwwK5ywY`LkF(U7w8f#(Au*Bg07UY4-)0*(5`$ z2MM$*evxv@%8Y-$=ii>1vb%BPMp;V>g!0$#-<%>Mj0Oe< z0YO319v}>yc?+zCg z7aJHFMo0cN`&Pr@bN}N#8gf$nrKh)c9WxoulU3HFPgLe@bv_WFy9IAQ{ImFG{wt%d zEc3;wk2GYY$>N2*f_f#|@dKwl7q5+YEE(B$=TKXB=LmeukNEWYbLFekH|?f+&z^Fd zdQ)j#TwHv_?CZWA?Ba~f%*<>rO*hupo`~7%26nTuvX<4=QHhF*PR-8Bd@%X)HBC<{ zq;&D_-MfY>^AjgL7loCsU%&Xh4NFYDTU4gdW#VhLrJ9KCPi;#*NnNhFv350eQwxir z(v9`qSQ<8tN$l2>l9iz>2}tnBSbq&eM---jCI9bd_48HQEK3Oz9Tm`_qBo$KR>bpmx-Wl z42RyRB&)K$J|H2nn`ZB+vWf0o`=JK*?s>Dm?gtw z&O*D?o4u~2;_0(z{a?g-DrZN&_4W0=owxt{_pklZjPVO`XDSv!?Xqw0t_J^5lj5-Q&U9&VtC-Op7+^_ zuI%hCRqFvuo$MgD)S|hkD7IF3zxmxkb3^Ld?hYEp zn@lTy$QKn0trD((da=v$7cVl{aEO@EmMo9*rM^gh^M*u5M%Gc{CGqD+q1w1hb#=8_ zTgtAx)VuGludO_G`;55iZ;rc|=RC&DdEkIb&Rd)qx1pD=3Tws3kO`#LW*O!ud%|L4 zV-0hYaYTN4E$G~5J+cXJ6es@psE}TXOsV(UgUCoO&9smb|D4*=QiR!UtZH(xkFM@s z$+g+*dro^Os~--RPq*unF8bRbp=q6*6Zh`j0G_vECDH7`?Ueo!sfhD-cEX~zKkr|E zcV)kXL}AB-MQf5jwzhvDOzQ5_ry3dEKZ@KL?$K~mBW$LJn-1XkM7ephu(0gddx|wi zE=;hsmnu;?h@5h#K7udLXTv+ATSHy_jnBsV6Jdi}ZIo*FIi&UuHNIv^eYDn~{_wGo zUbO4v&)P3v{1X#-LfIwa#U{@+@3_ZEAFeCzGR_($pmjGd??iSE&CZ?GGlTUR=HCwS ztEZnCW6|7BMfKp}!&?y%jCcI~`^GyzOf4+PA?Zvn4ZV)h%(meABNVAX{C)Ug! 
z>qU_Yad{qdyzh03gRtC&2vR#cyK9zuUM%$Vw=#@sg7$8Ab#+~xsTaZljNhrHdbFf% zZYjTDf{Z-pPzdF6!_0M<3d}}-(K9}m)v15^By_2OIx`V!S8fx#g%Om;JrtqEyRY6Q_ zb4?AeP6y{Z4n?sp3e1dm9^==_*+IE;k8f#dY1=K;U=|_Ot?Q~>Cr_S4>R`U|T2dh8 zn6PkJrg6OpTe@}=vykrnWVN*ZPmc~)?L68R|NXmmigrG|R*se8n5U3#QQ7=N_Yw2v zSbFj@xv;&h+{dmvIf=BTYTfJcUZK|WUWz;a`ZcP+lar2z-fQPaJ9OW_zhnGo@$P#& z*(>_1f{*Bzk+K`FKST=V+w5~3nSZC0loYmy5}U4`VYo&0y>?<`j$)M|=}Ew>!r8NA zC95+R;|>fH6?0@{8;YC!L3$ZExq+q_x!T%V+MK|!uo`To-P*GA=hsqtsli>w=41Fn z`?*n-)#b4?F7FYY0;j}gA+{@5u1w9$+)7OqAnoJf*>?4{q>81};m9xLw<(YRS*)q5 z>cg4}zGIHGLe3^CE~%`wRmIpe;{;LS_4VKAHhgc%K?U8vZ{L}$hK>$aB29mJc3cr9 z9_0}Q;LKLD3vo(TKMI(zzUo@B}-(Ix7 zJeL09!#Qc`O*TJ^cs%!-7#kZjMav2wIilG4^wQOf4bj8nz?3f;%&Y_;Y;)ncH+4;egZ`>0nz3R?3h6hic zaBEKf`n8#chUSF0I44dYvM^2z&voB#4GlY;M_c^xVAvbvH2;8rt*1~SXNwWF!K}wE z7+uCY-a0&>BCXD~>k+k-i6RlT;;r}Evapz#oRShfaK(X$$7ED1SBZyfX=$o;-Q7@5m9lu)U|H0YnlTjjgQiBh!?B`9h7#i5hpt(2(=i?c1`G zLr7YeUWzlVudffcr9Q74h>wpq9Ph}$m!wP0&B~Gi)avW)^&KC#I^{7dU(33Mk>b`+ z!^`bPMgvD_OfFuzLS(yoryV3#F`GT}e|`kty0vMKl=q%Thp(zW)44sno4ibhsj{_| z3Ax6?TH@xs)%ODDi|Xs^E0awr8pk46B9-4=`C&BN6)hV~9OA&X)>JJPRL7dw7XB(?UP(zH1cOVqNu32`1`r7jQJCnKi>m^{@$Fsw=+(>G&p$s z(W6JRniVw*T~ZTb%*@OJkiZQq?re*0eDV47C995fvfW}fj7q-+wex6Sy?SNV92=-P zl$n`Xp=Dd_k6Ija;n5+gv=6_2-LU#uv`;A8j4S%J# z{*t~_`q$*IH2k3mDs3GdfBgc{JNNDx^;ZT)>nnEWSRb)$))iB*{DG<_YY z`z_nDy&A}(DgN^Arkwoz{Ek9bp^~NFyZcT7kuHuVr%nBE?MW<>s=A<{K))p!Cr*ak zleZ9yb?NEREx=Q>ZM$~wHX^W3Q_MY-9nn93{saUD@*Fw+A^vy6OFpV~@e?QX+fp=u zjcE7<1ekJsfR;9qlQWe_aU1FD`~Lp&JO-J<>_@?IHZhx&*}qwULp~d8VkPU#mI(c= zXZkFCeipf(@LoMJJ~46eR3qxyKB{##d~EgoU2cd6MicIyd`HUHKI{3{{zT)i2Qv#u z1I8-9dUvqL^DlM(zyMjq#2VtA-D&bC`@1VIZz=Ou0n%lde0ipn8|-wOOh1~+@Wzc; zm-n7a4KXn>Mo<5aTMn=ePAv5Js4c!*`%^e+*zn?Hm9FsdW5>>==@x%Y)e`gk`$a68 zZlH8+ez#_p*;{*FCgj$*u*kwf34D|L`_J9g$00^>@we;7sU-FT*;6y?s<(HEnFQ&{ zDfi6guCtkY*xCDrhA2^vh>gY;Yo?~oB+US;BUb?K?T=KTqowsV{}!KEyn&kGUs~ui z5+V{e{dP5a5WhW8FI1{rkeZy#SF*8w zla-CFA9$i-Ib^q35)O?mc4BIeDTFFzOc$B$Gkrex*hjPM81&KINj%X`aj 
zGvyo?6C=tU5@i5T`zG%31kaU;+>RV;t-zs->}(=y3p@O(r5_vW?CK)W;Q6)Xx#>@b zUvDkXJ!aa_i)2N`vW@pPYP=zWS^nx(mbT+o9ke(@*Pn{(F!|$%c%{Z-fm9Txk4BDGLh#xu%zCEYh zXpfnEJ`=)rJm|;5RNqH*a?DXLU%t$69J&&BTR$WuWV%6O@{P+x7ps(a(bU=^7Wv8P zn=XMvJfFmRJa6B*vzc@T{o{kEDAnI3{w4TsHAsFkiN2RFU+zQl6#MCR&XxMxgGY}Z zeWm1IPE<{0D{vl*zebstwUa~gzEMr+wtAHGN?=)q&m}~dUV3qY4teC7`|4H6C7INP zM`(=1UH{x?7SQZ1Uiir{x^Re#%K-g1>bgb7N_$NRL;91 zT6K#&=8}f%vmQTYG5P%DZBxEQTS_3Xi^@~B=LHAD<)_-TY`4`TudCkdK~Ju@{H?#F zERwR*_dLMPMW%?+uIyy9*oCfa%lholcY<7p4+onxqOA5lx+k`rH>Vat}t zrRf?D`;lfYBGd^;>FMd&J3g+xTwUK;;#FuS;({$vrg-r4=H%%t>uy!`5~mS*LER$t z@%iHxEoDgj2!l#`a>LA@9%BOj$|@-Bionwj)BUnV8|y34L-`sXOcn

+#A?Jw|kT zr*?OA1T^y}QC(r6D-J@w-n-)0_0hXGxrt`8FQTIl4}HSn*VZT@y-?Sm?v$6GiSVfW z`1P>1y?qZ-ePFqs`{?HAmd^94&qb{nC0{GYeT=%AEn(h#Fml%eoaF#?f7|Po5|#T- zj23t<9Ysq=^TzJM5Y}74d5>4bvu8o}1J!}IN`L?U-H~o^E{flg;9wx=*DGxb0dE`@S5ico}bOX>l(axp~FL>;cF_$QT z7%LoN&W9+kH>RV~0TauPDXe(cefm^h;lJI#d>4|~xw+BSi_R&WFU6hdDO?#D8Pl(7 zZ977`B;#R#)E^x|5C{CNETvnvGru`|o1(%2Jc(+mW?aZ2-X|jF56zZFzP*dSB1uAz zu9G!BHnyS=uho1wQ?CTr>yW~LDt)J74^eMB(%(jD#ecs|Nn^M0N5>#PYtG=*sZ%ls zieI0;c^5rnM84ND_tB$D@|{$S6tgWOZ%&*z5zst6`1^NK%Ev>m4b{4TS-!1n6yynA zb(!J0WsI-8ZEi^9h*gJhQ>Il-&QSoiJ%=toJGk8QElGuMb@gu}SA28JVb9dD4a;J* z@)bS6%z0ibIn_glIg{!-XkvpZy!GEEeW%$>RXUj{@imSA#{aU|={VHufNaoW*Q^GPY@GYW9N; z;&&cl*J=9wrl)`Wdp-ktmx>uV4zJj`eU9b?kEVQ!lb=7Oa$>OO>L|rBm;-*cH#%L) zbBG;+sPe7wGaa2zyl99Ls$1-VBej#AeoOv^jp^>gbR*hmj;_2q#iN;P*2IaVZ{f** z?%X+k%`Ew>KfgXbqGETjo|>Avw#cbG8qs=oR(~*Uv?#v& zA~!eJ?Btf@7b${QuN3U;ltcIBhWuOn$+h0YlS{B zG0|e;uqDcy+wv?A=?zwoRl+5uy5_6g^z<~*Kj0i5PWh0LQI3S%@@Jq(Ayz5zK_)8+ zPkpGagb0;FhrMpcVhM1mak;_qa1}ylEe_+FwyxzF5*^#Y%sGKFim2ZT9!|fd?COs(TW0m3mIKx$}cDg65M?F z(4iok$CpjFF@|nxIo_!m0y2kaH>wk3C6|6NDBhbzsX||=)FB9p*Rn089Lw}}O1<`> z{6T`~LpnX;z3Qf}rDfT&_cHaK6W11xdao|pd(L;7a>d^f_gOz>Xk!ytLnK$^_vwL}X{7Do zu5K`l1nrWl{Al?OlB-YZ^CwT<&a~(f$~FWtZbWR zkp1dBFD)5q=h?meT*FVjmkte_w|rvefsbnoe;rgd|3^Apaa5$WF(6c5C`T-Pad zDQ|3~0Vq}Kybz@jhX$~Gd2UQ4Ub*QmI@WD3gVAk7I@{m49+YF*{=VY;l$86-W`a+2 zKq{#7{LFJbK0wd`z027cp}0uD!otFYn3djg^6Z7kA8qL3(JVKad=q*2;sw8Eo0Cb> z>UF7K;AM=VIj@+ELzHB~9{Zwvvc(|EK{`*f2>XdIBsz=7sN(D7l7jBNOtt`ma zKRZRCl*ot}pqJ#Jpk_O`T%xTZd;YxPuiEhF-Zb>^Ikr6wy|&wa)sKl-wq24FP;~0A z+}5;_G2D8#}p{TNyV*~XLoi?BR64H7==hY z^5shbh2TwR_~wL>GlVot0JrmFPdvQJb4V{nI;WbGq+RGjjTL12;*?}|`sO@i$lkWB z^+pg*vgVf~bP|?UCQCDLcAHo>AyoYPk{wW$^$yX23Z@ar=7bpKce}A49 z7M0)N0i&M@cp>$uP)S>dzYcjbV)|s&>RM-Lf@p1t5a|tSyyoeWoQWTWET}C^UV?KI z-7Lwfsl4{T?@10=7C$H{Z5x!+tM|$IFBp>I_MfA%pNmUY z-EkJR;VS?k;K7#3qQ5)a-s>c1?zPoO)hT2KrxTVwJ~-%);uV?K2&^x==BwwkrZVV` zzE_6lW@<`G%X?>(+vK2I-jIP70k>Lzs_9{O=qF271Yb${2&OpN+n0l&i`*IUcB8gU 
zoRZ2imE7N}0?5{2=GP}E8xqe%%x~pbcSnOqQO~p6kKSMH{X_u>B%mch{)-3cBQ+R^ zQh&A|l)%3I`_B&M|LSpl7F%{jUcM~1$D5DeXLY(}P)SN7=j!u_h}bFi(Ame{pWhvh zR%@c z=CZ7PTmT=eufLDr*=>FlB!2~d^aG{tU-bzJP0dgz;cQJRNA2}t}fs&2oF`M8Mxc19Zi z)-|0yfquUVjI<1k&f4l~x>e^<#WnTI)7kIYRo1fqfY$^t1ezmVTeUjO>A{r2QR}_0 z%v;|c;PzG0WcA=>@86b9D{esCi=Z@_EPT()$sG_EXTuLZB1s)Ftla83^XahWF5hQQ zp9Z25e9yBdFLa%(Mt^yy8*Bl=oh9>2D1jfm^y<_u9KS8s77J{_2V~5-fl!`C=Go87 ztDr7sRJ}u8T|J@o_u`_PV)|%Im}K(_rs5KJ92B(q2RtqjjRZmTUfO79qLvlC2) z_)y;5oT=UTHfWV!LrqQX&Ye5G0A|%#Q`y|XU%v>x6=WQNXucM?JM!xkC{Jt{aeb<< zKSxkjlAZxrZGD{a=t-VOhb}9zevkVM(0c^rJ4DBQ2M&-EgczE&pvz;P(Ao%&v%&$; zA#&YEpaFsqPppYeQ}mXf-YeGG3#~Hfd1dR-cPNcb@B3jydNRy(#G-{eJJ1B}rO0So&6@9``3{U4Ye|gx8Yp(`*hk1$|B}%%nk3KZ zDwf6mV4;7)Ci+(VY8BFKD3Dm>%B2(l?!aaZ_7iqzQWYPAY>l4fw=J5!D{XpGQ1FV# zt;vbpo@%ID7f-#vcaIWhOxU2}4ps7jE>IX9U0s|fPin~f#aoHI9{1m=r>B=KA=5r<#$1l@JU;7cYpU<4z zmVqHA^UscJ9oqy7rZf%;2wb+b?oKzjolp|-=n;d*!k^Ua0n2YtqwK-BHx(ew8cp|C z8NdHn*IwYFJl9_E^yx0d1J~yGlbX*0cfSftr%@lIkj~H%7?hh5v%Hb|Ped^R6M4mMD zp^FzUIt_m%L$6l3+qQ`8z<~)L&5p!s~=~QPS*X{PN~{)cnO9 zRk~~Hde?OTtC^yLS}Q9z7kkVRN0!9+Q*0@8qyDjwJt$!VjT{?Q_eTnG(Je~G6Q^pQ z3ma@g(c;>ykYm+(3(5lT=gQ>I+S=L#(Kg3o)hB6=>VhE6ux}&!Bc;^D|wG#=(0BRQ0H{vRs`e(BHO}{+j z+U%oubK-c>%AZ0)Oa}Ez$He3hDo!7rL4`NEpeLf%Y&+S-V_U^1AU+U!G8jDhkYhAN z?0^j++8?{&ulr?WWer-BRH|%jQ3VNBbAG&&s>dMc5wsTq9oK_APoYIyt>gA?Qs|j(tgStxuKr|rXt4fq9;z#$SO;%amdFh;eVi-62`Sby>K}G{@fJot4_2U0a0k&?^jk<(k}#p9E_vojc(Yg zsbXkoD2$WWv?ByG;0I};phy+t(%Nf#O4>GJ!qS{OgXuu#h$KG=7*?tDe!~guymQ4* zrIbTm_t&v6jB#tnMMbM|B&VS?mDSWx;vsM6=87#ZOol<3A3z19pKzTWR#Xfzw`WIZua$!U$u3Mt5Rat_hToO zxn`ce@ZI=%ot$FFnKNfd=R@}df&%HyZr24n$eylWzKJ+3aMx5wUbt{!v1eobU26`Z zfPqPbDWR~MD~PmDN{Zu5pXs>^mo8sE;y!bOKp+;BK4bx-L4^echLFP8aupfSu+47B zdw*pa5o#bE@{sl1SX)5wJ=N?UbzpuhD)hOYIjFcp}_1; zJJgxD+2Bh^ud>-Y39|{Y9~}i(3ub{@Vvr=<)>lvZR)nsj%l7X!{uWa_*QRBh#FWId zR3_(szjp62lN~6}f-T!Y9ZCRhT>s5>s^Ch!$}fOgP`x?avMLvFRPF#*#DFDQ1aD+u zYN}R3t^5&-L{ycQ6Cn_Ui3$kl!|QeV;K76XfWA}*zWHVPl$hMO(R-Ulk~U5;MKOi~ 
zkXJW;AWH1;;lqlWi9mQ2AVy|&g#=34vfm{E-Anx4wryJiI>W>;Ihk0&dGb!0h#K#G z2%cgn_CecI5VP;5rl)yaN&^`V_Y>sWj!jY!zXDfkwLX|oBQZe~cn|x=0hpr*r5=?o z{J{fXfQiq~#c1F*Q7`t$*6d_LA43>E;+{H35J(H!++7Gs`XkM87I8myKA41jRkL?- zsRkNp6@OfUbIAs;kpJ13w{PFF5{m|6^><@QTFEk%p4VKY=(EJ!pqo<1VZK7r77~5( zXJR4*R1e_-s@PbBXXljHG9y6^jeWN*o?`~3&Cid%{QxvmLi#1-^ObpP7_|(L&bDsd zT84_&RN{3rJG2;15kh!O{j5wV_Qf8@y;rB@F2TFT;W_13cXl%khh!z#PJVtL5FAWO z_2?6l8*dlM-$!?@ONUZu=jeEStmf0F#ELGi4P)ib;k4$5rQ9hK+rQHX{!6&P0N0R) zvgbi^pGCuvZt#&pyTo%BfoI{!AZ$~4g@t@S-a%o)w`D^A|Guh?B~;b6DcV1M46*CCurb1WS6!nZv_=YMgn=Z+Qvy=It^uF~Xi0rt(O$W2X zMHyQw_?7mu{6hZ$c(&Q#B%mjx$d+Yyr#=({!j{Lq50<^OzF?t}GPDQws4(AQwIN(7 zP;l=oq(jm|5fpYBzUJTMRG@ESau-5PP}>h6ax_TQI5l>Ee_uXzbACsAnqCd^4)I)X zO`doP6lBJ#ey*?o2)C1r^N;cI@hM>F(LveM8qh_q<)vrZ51fbgb6!rMS+Jyyug3BR z^uMeZk&!B+Ms#%Mx7NFSuXo4bSCV~%qvwI8@E+^xut z0#PxP4Sfv8xF*0)!V?1`x*WEeJJuUZLmUM0PBr2@NMs*$S}a?9zI^#|#ARHaph$^0 z{!!>^zrN~j^*xWSq@<+61**qKRBsiEP!35?ZUuiSpLNev6R)paSFc_r1gdCX9eMe! zfJpD0-tY!8scGrFY~F0$l~oRKvY-EAoYEc%mrE0lS{fRGCFJgIZiMZO2y7Im?LIy} z`xBl$c@lu-9zY`C4J-6oal!ey%D6i5^)318+M4~J?^WLwp$0#;}*hs~am>P+BYaO`C0ZfD^p(Da@qxH_O4L;H}2_ z`}o7<^^J}F;U{>5?w@ckK&9A5tbT-x33ysuK=6B^Yem7bQ*yfWW!vcD2gIIA{)dYyFh_FN|?E{a&1Od zw}12{Bfq1u3Cxo8%iZAP0du4Q#CWrGX2LGI$I-E!bch``?19ES%gL(DyldAkA~Oze zUm&z`l-V5|l6wea25|W2FJA~p2w@{T;WSLUJU_u_wXwS7`1#rKWaY#USx9*{1eGK6 zK?B-shQpx3j@_a=z{vpa9)R6I=zjkKm`(X(@fXDRYk)w)QdGfObjKOq#*QNQW9VWO zoIeNMfPxPXQZ(Ai=s75h)i^57ExZ%Gmc=tgZG%!DRdnuyKO7~@CID!I9W*Vrb(>Ui z98@i%^3Yb`6dcxc1-YG?RgWeH@)=W{SSh&gjN`SzDVo_-I4Uerc;`?;H^WoN)<+2# z7#WcRg`YTivM)sJXBl*4qJnSr8(&@ZgjFxMqqAqGJP9PsTby{~qTJ;;Sbi4K7|5pi ztt?C$!EeB|^Zow8$;0rFW}9}+kF{UT;$P7eOa|sPL=mgl2m~2Pyj7+5Ulck<6Ke=H zLmipM+dbyTx4}%ux_kFWcsijll_Li10n*&u-M4j9ZrNf06jAlc2mzVr{`Zh?h41*# zYaiXaHxqX)ZOCN~K9u(%WE*iVsk&1jtSxJzq@oJ49YM|AfhHH^4Q~p8wTVUt^(5xO zpYz=W@C0MAUs(8>$U|!GqOI>gnTa~j{`sLLa%&9LPa#*rYdLE;`jUN2NpK51B*>TN zp&2pRTvSk~fUjp776Jpvu*1!9+fn-7e`ttD3efdgb2c~$M^m5lJ;BlEb&bsR4Gn^i z|DqxM=XUrD`Qg2GgNh>IY=q0nptHMsH>%3Lz`zR`D^K9^gr9-lYr4c`;@Np~QvEmd 
zP3ab`yo&5i{e(BAJJ(iCeGg zk3`(@8+_Z5+8Ql&){kkc6kOwsP6}N+a|DnLx`Ij?hdNyZwTKn$AYCgkxTt5XV%d-8 zUT8yKE!%S}7iG|@j2FJ=()H!!bev%XahgM~5u5@tJTDv_9MTP|sN%ZoD3XDnGN8#g zPb$P6wd*}YwD!?{DWD80Dgzl>706eUj$&W7ZW6O-k)OztCiy|QDdBFKPCmb%MAr9|FGEkLt zI^Rp}`fizJxoBu;X?+A%MSDOtrB{#)!8kDI(d89m_zZJ@dR!dLKr5K;I;p#gCm;$q zdK?+&6Ty{LRYXbeD84Di%&)#rK}%VYV&|R!99%N)l(klf4t;Y*fb+sh>72FglMk+L ziN6f6RfCFtzqVGMR0bniD4|&|d$BSz$A9_F1Ih(j7=o``L*1S0B zc*hyIw)r_-9nnX^%@l}tBHWynprri^_yh$7i6lzIJdz>FSNh)D*iN>prM`O{GGsZ? zomr!ywXKKI5~1@x{nKZ|OFQ2|R(%6XFmM_p-m87_iD&BLu%t!J#~VA+0@yCg$SmLg zmtF{&LNyiC_%%_I%%k#qGq?3%pReya7!GRUEmH1@oAdmneQ1(kkOz&R+A)vkVx$%} zYk&W;i<|=%ju1$xTg$&7*X)NK@9m8`@NE92l+?soV|DLhV*}=oh>595!~|NA^73YX#QLxjVRjT=831+}XGTJI{~fvk zr?zZ8>PnK+fk&S|e-^$`ON%6{puT|GkowxRx6DtvhJ$)%?`efqhb4+L5ScK_+Cdhk7mzKBk&; z;$r}bL?9IKgbe4S$;)6%B*p-y{Z&fZ>aVKYOs&^jjM}&l#03s@L@SiYd)ny7we`OB)>=s-@(#z^$Ro#hQN0~>-~`M$*!p@41!gN z7Vq^vp^pWyf$>#q!jaK^;fJeiJal=o5-Yc@Ugvnsjk3Ld?sS{y4YaW!aerB)alPM0bsRBhI*ldc z1#5jSw3}F`aDoz|jgysdo7xAJHQkdSs;C%L%u5xNfW>#PiEPDl)u8sLTX%~pb_tLO zBATygZ$x(XaquRG`0FJb3glO|aEDg#O}3bUor|X1AdbHUZ%NQWJyW?y+r?&Bvg?dC zo6Ck5zp*edc-u7o7!m-F_M%3*zF)&nnywq>=7C`16=&T*01*0Ubnu%-j9|# z^U}Hg{{B}4xf$q2cF&Us*Ldq!!K7y~N6vQ(Bud34_oadXu5cfr`u20}ccYh!WP`Q` zeahAPR`7e~gV3NUTS-!N7)KbWn}D`8WbY?OjP5>|yR|+rGQy+T7HgBaYS{rfarb5V za2omK74PP((sz@S?+7(=NDXJjse`R~GrXL!j>tSLUT2F5hx(5KmC*q>?XQb`sIRR} zP)z0`!!lp83I{j0Jnqv;;{oR+BqWNk&6+3EWH`^lV3uKA&#Wj= zN6__9NBD-m-pSlqlA4-2eCv$tBQI@Tuq;*+d##0!9wmqu|3TmyXtf338`NF_SjyBt zi@p4)CnqyLn=k+q@xi|Ud&q={2GNXwK9^-70o5mFx?DCk#9R_WbQg$9mNq6vMt!*G zw(i)`Vo)#gvE5_PJ+(eZasIRw}=%(2)EJ9p06Ri`#7kwWB(1YtVOabK5g6< z2opMXrqApT7~Spl;4Ebzy3+31!>^Fhl549MqSTh~@$=^$K$OgJ98}x4UoV`$pW`&z zvJG@fKv)<{8{pW*u>~~-uma4@9YkNV^mix$_&5J$8aONh%)`k!t1K|&S`aVdJ)t@R+prg zzNzUwd%+)-yX_vZIGSZ;#q%Tu1|;$RcQC_LH#$qs z7ZesoLnL>8_`nq3s=hl4*Em7Kn7v%Rw3w;nvU6!yz3eY3H4(3M!oXZ#XR%+e*h3}8 zXj9qE$>MuBrQ7RaH5N}>(`Lhvk@S@-I|;_v;0L;=Bc=^|2)85^xz{hRxmLBfD>b@L zi4iTZ!k9MPOzK7F9}ohnfO(1l4E0^!Qe;7KT;LKu_lnlC27HN~>tu20a>P2)jU2=An_z 
zgSi4>J7wGp(Ms*ydy1Ri6?}Bx5yjtVN{RMP);y6AAg@ZTWu1DK$O4NiArau*$*Dhv zOaX3Z0KHp*<&$TNnrVB|oR=DHmKt3+qt$k;_`!ts4{;C&#{5S?Huu^Hq+71Kd8HTN zT3{#LUXS4ef(}&e&cSLaj?kemCOkC?=P@uEC;Z&;%ZAjV*SR@M>k0AD8ytmAn1TQYYrf3e(360jQFmO|6iyEr^3H$y# z347z=yoTHyJ#zxAZ40*@F$oDJIQ3FJ8=~82Wok|*_pg)IBd>dc?`1pMSEC}95XRbfml1LRZ zG)JvBw!EVGm`mbz`PZ-M&22FUIeBTuDR~X0()+=w9AEZhs;aHsiiV0I3VU$G_U9=e zbEd+pPEJ`By-^tbVV2lFiv^LDmcDgSl9I1=J7kL$%E6-BSdOU@1fm-~VO3u;@#h`>J}>Lvi>K<6-QEl-XbQvvS!JELsDt&p6SmS(nu(Sj-@2bEVB zy(O++|7JKD{JwUb!vZncuyOU_GJseWm059BKL^J@re5d>4?HmD03y1o?bZW~Ou_R> z7(fZ#HoBV_ivxrASW9AMZtU=K#X8C{CTXvztE%q4@RS($!{`}y3*P+QXto|1%88c1 z<$$ICNa)$b*P;)3b~Ezn^XCM8QsE%$qiG|k(lp@KtXr@WMkJ)Hjy-#9xnwHX^56PAlS z5aa(f1O~cp4HcMf-5clw6gZ!{-u;b~Nn$7A9u$F{&kdVM_@WV2#_%6g7#+$2zHw>_^1};@Q2(t8)m3J^$ z3IdtnEX^LoV2UYEP;eO+^LGmHSDI`tFMG+?}JO*nPHqbD+)hW z=?!INq`$wv+?L@~^jIa#@WhfPD8>-&sDI-o4eSUp-axQX1~Hi+WLBi^|y6xj{MEw6P2VIMl)fqCwf=_olV@AoRuo^{t*vl z*$G<%*l~h9gU9bHK*LV6ynHCy$DX3bt#gM>Y|S9QsLg+WQe2j~q&c zXR3GGzJYcZz5Qt97vzo*gO7I!+5Sx5=L)bQg!kVJ903MUi0K39hTwF9l*%{xkK8O- zo7ajQ$_Kdw`$3%QbK+3Njj6QihyQ0nkt$GzkeXvafu-lU7? 
zdT^wV@|QxsVS61MAg_nPfW3q~!cRR_bZ&IgxE`ndGH7z9onNx zD5j2cVh8*mKUNDIa&HA#(8mNPPz_ycaZiCWANtDrHOCTuLKsWhuk{cPR6d=w@r{xq z0HXoWwbI*&IaW`|KEz}bVdhfT)a2ylr9z2Q-g>I>$&quZ{7C6K^b{?51qIPqS3I;I zhKQQKeWTmEHx!0Q?may?A;dt;GYp3JPg`4C|D)#WB6y_B#6Vu52RahPuW<|NARLZ6 zVGJiGn^9q7zvfzFJ}}W@HtHR6S-_@g)NsLnHT(!>IFW9f;HJjF=OqzxZtNQ#ir?M` z>?Sc8X}i8lB~6zddFCI>O(7i{!?tVth%x;4Cs4fz8XfOgrmaLB!Z7Zf7Ghb#0#G=oF5&Xod#ev zqPvy7SP0k8yVu^D2rM-Ko50oJyUfqUG_z`)QDO_6@%6Fs!* zfYCqTGl=14g&E7ulBT80^{5%eSX|B{(}RxviK#k%Q7Aqt?`B>H>>Nww&J29Vc_@| zl3tp&;%lTSf(OCQtLHm#5l$Z*_~h)@FJH0~X^t50M!+!5HQcC!AUllk zQbM_SgSWNN;m03TEKUy)Wj(v}TCoB#URECB>+4H((iR2vpJabh&3$@mia3*?O7-5r zg~yw_FgsfbTALu{by6qz6p*f(-d*L0G&hK^fSpsi8V6UI!U%&=L~ zbzC@tga;>eJ6r5748wm!3r`G!7%YJ`^wfM0<|;ML!u% zzvAN4&?)N~13*<)!UIIh%zU?_L;Z${Nd@vKVRVNaKH8oZ1|G^{Vl4vf8ctImIyU(W z7tTTigy#k>0ZLn2TQ?7n(ZQPw=q8>VGYw5ILWj3gF!2}4qCMtZFjjs5Mi-QSpcM*G z>RaH5P-6eU{jCLOm2TIrz^p70V(k!NFt#S3O=c6dN;oQ8cnFpLF0oI@-qH`xaV zh+szTHjHq@l_+QW66VnmjOMAx49d+r780o<*XCuY7I8mNB=w4)%;?hihsJc<9!ZkF zvoiYN)B$EX6&v`RAjwh^lZgZ#GYa%okl(mFMoUK*ehZeXZO4Azx^;_|jV%ekZH^gH z3F)a^U-YE-Iyyz;O7DH)9vENaTs;kml|AQ9f&~IP zfp*my<^*VlPTx6ssVRH|0|I#cup0!fq#(iAs*lVdZbKJtazo-50VcjtC`J zq)%hO#i@fnI;js~;Wj%=_s{yxdde*wD;!6-r?ASks4Rg9OLFc#owzQAg*_Le^fz>h z-00NObhnTQmpVaOW9I?_5;BdX9$*FPkSl@Nw!BGSKXpt~MfcyM5+D8FFBLi>wg2yz zvMgbt|KG3wXFp*bDIimJ!Sdkpoy{cRrL&yjxY*{u-YD`p2E$SONM%q~DJar@E}G9D z#}46@l^nhNYzIk=4715wckXO+KKAd&0Z9jORsd{93wAMudR;~YAxdYizw-K`X1n#+ z|2+S{e=9Oo?J8684R7+fiWWl*YL;`yGc`5Loso(j9wVRK@|yg|fa4o2&sFEJM!!GP zC%>MSq%5czIwJu>!oM>u)9bz3wv}hgev{_J@7?s==@cAR`oDf}VR4U{ob23jy)U{} zOb4{?eZsv9$mil#Uy}I_AEqQNE{@Uu-+ji4ZfjTWxaBBi$-DpgbOFhA;?wc2!+Zbh zuXzo2w%-6J`M+P$ytjyx`M+NO-}nhH6U-2sDJheSz2K;X@u5EBzjv9~>&kx@gqeRv zr;BSos?oByP;H{5q%=bJjtQJ2w~0{x@7G|!zc1MQBrQ$o#fulTyLTra-Kx#FiYZ%v z2=cg%hc7+ue|}j;CXM;usg^hT`ST~y^6lCjMYB$^d2?BJw-%+qq5mwD$8jQpV=3Ci zxs+mgw|?#w`cJedR>6oJAio8F#H(w6R-eM>zk-kmIj4{E?bQ1?n_XchQOBhKwBLDo z9#m<*n@u9Flfuv~fVuxZyC%z0G=~)~z_7$D18$+Y3$2wBrpVhU62w^8DlSQR|M8~u 
zf1X@9``-iI^0Z6N+b3*#O3C;!GAk(e%Ia#)Lx$|8rdrwvX6?11`{-ZQShxaG;&r}{P6ML(I~>`BkFqe z1ZKkC(UPL%4-(gN$;jO48VZ5ga$S&$yY3hY8$cZHEqDMI8>(T`1cqQ0uAjk_T$Gy) zN;d{9VH+bO0jRxo+6@y7>Lt|}R6dzwfx#xsENf_~#7><)eeqPTq+ycBe*%a1-!oDu zBsq@)xqxh#h4mA6WubAso1N_(=tG#7ZES3!9p&Sic~72G+?_PTFNY5VoFVEqnr2*0 zA}#(q+8MU7j(o?%Nor|=T?-ih!F5AwE84PQ_fEmFp-Ae0P5=WG_@BGDhyt;H2Wo&R zafg`i%&w_2AMyj}Z~TY&#Kc$v0|Vvd<%zpO(7i?GRN?v%#D7Ij=2>lHL|c5iAnfr5 zr7vFW!6O9UYR}Y#)7PIgg+}oRYSeyx{?}2OqNlUbfWpo76{v#Bi{qHe>HnMwCIjL~ zoU3XTUqh*!LQ4rKLXM5FU>`#VuDF%Nal(!S6l)MzZnWf#4wtGtK8aQS{rwMnzBM=F z)NaQWZ2lTa#LYbLXWltu>*A7AHvqFc&t_lD&bQixWn|cAWIH|<(4t6GrouOcU#1*p z@+O)UI=R&B;pQ93r?;zKHqD1YHpJ+HAZ-|V*o=mRKH zZ)aFBVya-OPnK8#kp8L=fiIP}r0 zSE1MnIKC54H2}mhZ7Sp8;elKwh=&cL-c!GB4c`phwZtuY(F2$O{)m<0JNB^NGRg=U z@PUcO{9!~Au|i5fyj?@DU6KMadC1Y5-JrHRm7j`j9()!m#6TPM*r*Li?=23 zR|oX=ws^hBX4))B4F!Dv8BJ#R!S39?eJ+sfS7oIPK8l=Ml9XbDm>|ikszy~<{ikz&q)5&?#p})(CP0&QZqZLzWL+c3qQnZ2$U| zmVk_LadFbAI=C%)u)&!RaI_pbRJ9eZflhr~SiUIS^WpiEm<4N>jfYb`P zBQ1#xgc%E|ZvO-A<`o5;kGy@EKv}rb=;6bM3Rb2~j&o3_459JDz@fHmWNxm}Z~s3* z5%6E$uh@71d4t8NzUZ{5GOi@M_?{$g{guwD&FHUOrsG7$N;NFV8nK zlawBR_QspVGY<_@(7&KqHsXum`yGJ$JKv-Weh@GJ9=*>Fex-C< z6(te!+`Q#F5@Db6cP4Hs*kQP|s~Xo3D0N6;S{;B4BpT24o0C1IVSH-ud%=h**kY2L zWy>A-VD5vWju?6IpX{J~gop*DbHFe}zxk|F%;Yoc>}c!%LDzS{bNzmAzq1mllu;@<}M6-q|-4rPRoN|8;rNU|c8mHAw^>h~Yd^LV|!E51JO zao^`Y=Q`JQ9aeYSniekZ{ewW;9^N*L*vmEr z;S|Wo{EnB1SnL1zv5~b(AG$GwkbA1)u@*B2x9etzD$`#7;#k>@M@qRP&k$jwj=&`f zv_}@{tB9)^cYlWd=Y7f)lk=6hd=YaU9FE?OfE|y<&?l4%*k4RdPxpHAHgxz7|NM^=y zHgTBuJPHmbWZ-?EWOMkbquBw9*Ru~ctE=}Z7aE(5JT%)G42)MI8=+eelqrrS1ilB( zF!ZzQVAK;+=#y2v@)b?;27!UL133&IL1Rgcnfe4~Gh;_vmQr|t5&jO+(9@`(LO1ABq0PxrSn0V`i9Uua9K(LSCC+ffwN_d{1a&VS=%=hv_6BOf6p77wCQQAZH^ z*2nJKQ{4~Ias1f4=ih!$UNv_ysbGboeZyReR#js4aFpg5hfSsaF;YU0WaDg`ObjFA zbhxG%-Jzl}fk2Fyv;h#?J#EtX1VB3O^Jb0aj4d>$5iGkW9 zp@)s_ixv;s7ZKP`Z!%x5?Yle^GkJ0EzYB>Jp1Y7A`LolD0RjR=SqZ$G(BBNV^aeyP zPUpKe9PrG;!1bV(-SOiTNhN^cLnpzK1&TV9#DZYgb!6hW0$@yNfdjvUc`MI>d`<)q 
z6zOH5TX)rbF~t9@(pOtsT8?8W4>!bl7A%~Bfy3OF`bFYx7_*!Z8?zmRQE62@w>TPF zdv|SWiDaq&)e_%1%qy>Yj`54VA^;2?pawH5E8N;bXbV51FKNPWCVj1lM)M|o0H5jW zM7IT9y=uoSo;pKIOUr#Gk9m^g2mjr9*L70vaiVt^-oO6N%@PIjLASx(BDfjB2}?>; zI!3XNKvp0D1*t|7Z6IylaYvktqAU539t#m;SDecLi+2OOMGjU4_|k+I`wP0I+fV+J z{6#tvq~kqmiICodN}9?57fAmG$F_>yEE>QhH3nX}+hlYI2NOHi7tHKTT+Gt1dd$+j zhvFt&?`IlH65@Uf*TEB548$0uXB3s%4tOpkB_-h+JfA)_u^5#4x8UjJ{4J{f|1Tg58}+~3_`Sb*8M z&j|^mb?AL!pbBXLOz-FeXKbK1$cTWmNE$$pCw4@(M zDU1q)Ti*h!z?s77`I-=miT3B-y`$gG;M$T>#L*fU&p|vwT)cYo=EhW4;;4%T=xIY( z_yTNI+GT2GWyN&;UzHpuiN)I!upvNY6TIg-XwG;E_%VkT5kbK?*dyMogq#OmskB*!SyF_X*VCqw=`8SDzKxp&tO`EfCn z8{S>uuY_Z%%30$lw4CswE$1)oqQ8%S=~{Y0MpDuppb%Q{J`9gXWAL zm_z?ZRc5XYYuCo?hz5UzmW>DuK~(zdX<&gJNVgD0!QJJ8EJnRyJwh3hPD`_XHBJWe zp`gFJDo${2X*+b4a1yZ)EiF1ay86Zk1g}RzalHNg{qfoYsX~fq*T(#Ml#a-)QlDj6 z$AZ?>vOuh0wD#9LIDK*gw#kZiP#h;gCCL3TNh!s~8$afW!CI-GeQ|bi(TOXF_@i+5 z!5b3JGt7MMnSkX(Z0T5Hqi@6;CnT&ogQf!t0g8g)x69119^$6pe&_*=x~Q;l9@2ZBHqn2z*EC6>^3Ia)gL=OPnKFc}^%qNk z?1t+E;J5*F4uGi&MMpsM_HEndOziI0r0qnOL?9to5D*UTfObKE4ke-ge&XRk92}8< zoo-B9hpeL1VfJ5ybnw7YjkY3O#9 zcA20(L@A;YU|P6b?b|CfhtXUu0MCawUKXPL4W{&Zc*<1*kVCd>H9dppGWQiQf9|oN z%ptNT!!2I8g6HUfF%vc;Vun)x4bOZ~YDjkz1tBpmCyjd`iki*|DBe)DQ%I~7kyQT9 z^Z0xDB5d@ATioaVBPRFVd^dVJx=I|+M7|9I+S0xmy<^9og37(S?D#RlvepfP;Ra#J zb!NZ*&tXfPh+;}u7cJV0g@C92wEr0{TAfjYYieY4l*@K(babzxqI*gi-o_cQs@87U z&S@)q!V%`28viK#;k;+sTGc59e{y<-|F4s6$MI~QGX?R`_NJ+ zYLhiPcN*6d#6FxQOk`X?QivJFu{San;`@zIMd(>LvwTr|K80Zg%Otu63*3lpClPz2 zfvJ{kBu^~TP`mB{2lHjyzJII83@2>3hz;7UL^civ0jcBRb*%OG^Oj(HN{#C)DKPV) zP2Zx83ISw%A6NYU_g9Hx=R^DR7OXzf6C>1IWb!nW@kd9u|HH5&nVz}hBl?N=Hrhwt zU4&T==1+~uTwf*p&=ZA~6ntUyPatnqU$PY$Y(v&sa;4}Ofy3a1z1E2y{C*V`jfO$g zzVJ0yc)a7HCpM?WrJes)s0=w5v<~1rB)CP2H7NX8Z#1AAzKP_?U5`90pc9DdRj=@~ zAbLp4VX;8E>8Pzpjuo@=5(J!8nI(P9fIW#_Hvyw@0?jAm5SsqwT#*YA?KQhVWry63 z6{kEc8~PB^IG3~LBvIB3A?7s32sCBd+#qyXEYm-?uDGf`u)GoUkq?KgJq9eP7&HU%gHS!gJKi(rceLh9q6*6aW7S1op&IqCuaK|#JK##RG5)kWQF ziA~*~qD4Nm#1+B~^r7AoQ^@m!2G1?)RC>cQ;2;hj0a3-ojPLz>ma0dQkfyZ)snlC| 
z=l;+0i(UOaLQ)lozhu9I=1#iC#7U1xBM}+HZ361paX1vA0Zn}8L6pchezEW# zo~?*1j&0(NN*qjx9ianxcU5QMIEXerVJ1I#_z+FuMOYagV2qQ}?b(CBRepG!59A&( zCFoQXV+Yxfemh2{NYt0D#>x>nPb54zI!(}|x7P zOiso)4lv2k8t%*-=#)_n@Y~7zeqeJ<}k6bWAAl%YBFx7-89(HZ>i02@LLw8VzQn)+yhAqfE zLBjXNZ4zSwFxD_fBQE?qk;sxSiv?t(2@_QV(6kt^kDzW`gxuM13;Fo{5{N=bXDLvA zA=s5Bvk1Te>p&!eQ{p|^C@5vn-mN3pajjs=2g$JejVI63BiuYafHDeSp*#3BT4I{bM46w6UQILFMx zw#EF5nlt?*DvI!Ht;*wLsm-e47}ge=&|I|a8#PXG|3--XC4!MV#hcT!q}KupWK*?LJJs8v_+I=~+w z7pE139spp-Ylw5We=_-ih`IH$Y!g##u(5H^Jp(^VLcFSj=N`#Lp9D^`nzoG_bh9;H~@+WP(X6GqTk7Hg8cJ%SU?trJ={h%`i9 z4~J22uq8n(K~=ag>c90#pZg-Qs&+w!1n61`5|R{KoW-ErRL_Out1$$wX6NAWCZ`|> zXz(0b+WPp(liT=}%%y3&j#K{uAuQOjWg^EwMb_clc2-GuurYlD*jVvJgyzBSzY&hk zIQxm2FtVlBAYa|W?_47!#C+uO4)umM@LzCP5l80~ze3}`_dIYMkz%+BAwUw`c$3EW z?-#&S|IYrke~vqL&SLNnaL`~+&!a%$(0s6R3Y1qe3JA@cVl>B;CO7>Icxp^FwXIGS3*5C|Xy&V8P%kG6^q%IL0GJZy_fJ^ zcKW*;SyqI>4f@y7xpYM^TmyY5FT71OAY$yBv3ejUGWc=p-~WGNZm*O0{b2j~q@>`v z30-FRnVZP=iE`12{@ zafRr-S8~G$fs8dFnIUNn<2WFVcQmw!Um!qL9ARy04R%aLPH$qWi*!rxtxd79@P zUYxe5?mte>_2tw|5VU*9wTYh`$xj@bad|@!Wtid*FM!&=32JhmY1d1ahM;4PbA1Ji zjt?Q*mH}*nB4w$0#bdFSQM^Y&?IuG~VD>ssS~^|hlSFt4nMw2~`U5}ggkYoYPkba1 zntX|D_(;v@;OK_YP4W^zrLaUgj!qHf4s!y_u1g@G?ZX@}LZPkQAtr^DKp4=-JoUEo z`@brI1sSyMfS~BZ>?Mg0$#GNDh|a*dh5GkzS_%uP_*gtu${mnQBHwpZC6qggR$77L zlnf?Vg`uLQ+_4rdx-KbA^!-;r$1ReY@_${h#4~s88bkz&LdRXjE+2KnE6Ds&?-1Jr z;I3RWIP3c~xVl5wO`?&$Uq$|8*xzHrQa|15S1 znIS?;109*zAOX}^nR)%sBcJ( zhKN2>2C~Q6VzT6MED?wWJE6)3$0g;C;w3yROwmCfffTJsCJKareX$isB99RDJ(}Po zRDj|4I4Sr)Ef-w0=DJPPMluluoJnashgy~yG(rTBtsx6!$qGe@f-OQ(xe##xB3arx z2oeAq!K4wob85(|CzIglh-!ZiN!`rt%8~P7J4JlQ$lM()88V=4`MBc0g+cylb_$~r zdv2gwA!A%9HI%G>(;Cf&BtT{$-Y%j9wryJh_JlA=A+eKaSY{wH5lS(VFeZvr#rRf& zJs%YO#Qp_as~A6L|MkNPRIHzI|8Vg0z<#|v-Z3AQP(ZWm`ZytE^_*?Xpq;uvPw%&r z6g&UEsy8#a&j!GGdCC=e{RyjIFg`uXOu;hkGh%Tu=DH1G(iYsx`Ed|pBvTPl`k+#O zN&q~jY`skDYk-*GgtCa7bHKqs>9_)R{RzZc(0~JxYpOG{!e#@Maqlj?B=U_vx%j-QWjp_|+6Spw^_ zt7G3DQtCQ^;)K*MK3&8_aHzfTzb@2Y5D{V{ImZa$219?92rP`)v4B}|HWBmWc|GqXG3m_>TzAl8%u?6*H2W$Nm_ 
zwPpX~+}Y0`b*j7L?G2?6Hx6H3iKYIJL!n#=bkm`g5{%O$-E(47 z3z!0(_igZljvzmZ?040IYlq)t-l(u|civik?x)X4;QA}ehn-b#S{lzc68|V?y_%gp zFAgxSvPT=q1+k;3<%H-Gu0r_S)gtJKkr7}>yNOEuI4BB#fAiz{az|-i!=q)5OU_L^n;5`7LP%5OibFlUP z@8n(TVNl=@gef5El7|+#@A8x=$HChQYA-`ISxmMwd0e|1BLk|>5J|KZ6pXr#7#W!? z@I=5T_IcDyFsYXq-QjFb*cW1Y&lO?O0aRo@v7Q7_&v}`jl+Y?9U7h<1h`-aLxpDh5 zxp7ge#?<*epSxZ=R04(kJvmuaU#~N;A_~inATCH8N?)2~qg&;V(jH#InEi($PVQoC}Z%b$7y4fe^`plg@%ssr-{ulgcS%0Z6p&#gTfbb3?yrQ*jC}Fxd@zc z{N(F{N#ut0r#A0aGF!^Sa~ z7W6Lw$PRSeGD`xFbv&mhReK*0Z88=Rkkn@cb{d3&wF(>5%?dn3Kp4 zy=c4;WdPhk$vEU<6Z;Iu}+VytCnk3Bv6xqKLI4yhSSJR!WuV`VGm{kKV?ss$x^u4o-gm z-tgRd$E}0=oDW})^$UG`@Ake;7N-Yoqg(gF&ch>IE?v3s?q|<}61uy6<*$$G;4nMw zl$6Hbczh7-d}8NTwF!5h3USS?^58pn!j@1@_Vz+fF38nzKZNZ^I@92*qUCVk)&*{z zNHQ$%`@M`T)*F49?J_0~dwm2N`wqB$?{u3hSFqGM{2l@g0>O>=Y68{wL2?gq4=G_B zzBRBD90-GDYi-x4E5}*VYhAr+w2r`bu?E==3y7R85MnGcXQEYhE>V2GK40zg67 zbT4O_&UMaAi{*$5ugNlKzWX@w^7LN8@RGe+OJ2wn8l$^NihB=zpnR8x3g@#-LNmm< zcvXyXygd5CCWZ$*nm>bYAd=AE@>2TM_-9r$!dcSKfqZC!mL7->zz;I~5_>o5Yu(P6 zIwy;wgMoqJIBdVk7&_P_pFPJpyTJ&T`TO@QoA+H1F)_3hd=eixfb2ng5z}9Hvvt5r zv!JcbC?a!N0Jkeo3~1AjNYyD5h^K-wD*}B->NwnHiN+FHXR#Q3ECof5IXkV|9exzi zb@9QYdoH}x&6n!+iP8|(kovVGIMAn|mhX&!j?-6Lz0IF%($vFD%G!SZjN7qOX}C{B zynt>h$f*IF2Mq`<<`~E^_;=?SG(I5|p6;^%5qYSOGVKrWic`w*Q`pryqOECOUi7Fe z1D+iu;^`l7p1JW6)V+D`O@+p)vAnq=!pqnD@8{@SLGN>W049d^GgB9%ViR#mgKc>? 
zpqV{^YKvm=dW%Z1DL+CfN^KQ+nK4tT8&s8e;P1SOxaZF~R-SHp9=m8xS)-YTP-mSp zzHok$?~Q79{YT4BRQAhmigR6wJMK^Mi3;++w}Iu$mEvdtak2UDw^h5*XK(Y7K7Tab z|DLhh;J_wXDfjeS!XXKVI!>FhY-cs)d%^D4a;a=h3C@!thfEP)fULdTzo~-=?n+t3 z#l@nbVv|Tiz@|7d@C#MR9+3bM291ESHZaB6qx8fl1~7;a5-cHWJHQ%vfU&Jyu8>5= zI1Q=HyaZ)&fDcJK_@ng23`dGK}1d%$m`4!!SUs9{+rv|-v{b#$xV z!-Drs9NN$JvUo5qTSNbCyxZq(_lQd?X`S^1bf7Vhs2dJ-po$&PKONm@+a@zekyl?0 z_V=F$0#wxiK(N=KJS(N23n(*BjQc}dY>SmEj1NDqOK6M9BIw$KyrT@M-VeAB>;kxl zI*3?q`#j;6FbczA0ztf5!z^KcRXe!Ax=%O)oHS+I}0*x`-2F3h;*;_*fzjnCmAAinQH!^EuJ0 zUMI^IrNP`BptEEv;>7r{_3POE`Fz2TZhiUiGclf(TDqY9xW!N7<71ybg)4r~@3$$t zWpZ3?;(2gHQ+;7_kdY7@EvWX@SW~_D$hK=&MS?+_ne#bp4q$YLV?8}E+|~B_EjJB$ zd4reMY(Uv;z(FgLKq2O>a7%gqH$AQ zq4BX&KY2xNi47Swy9m$f)Yt3gT_rPq_a43pzGJk>(b49^O*|PoZhh%>=}xR}+x3HK zZPg5WuW+7AEh~M)^JK&#Ok48RyHl<_noZAJ5X6Xy5E46Lq3F|`keHZ?PGyv|7q6_P zH)+}`EsLdl0Sj3}evpsqja=A}W~#O4IH`^IGt)~h~RPrsmxylm6#aEtZzsBGk;1_%~ zCKnPDG~^H0b|*o!NB}RCP((8UcvBhbQw&UrD+;`uMCe1fY7`s{z%>OTZ(>qWYV`of z!~xHIF2k4@oQ`b|W;roJ&kM`jgv+j~0!u?V{R2Wp+!AUOZlTVHkj{Y(1Ca}nu7@|{ z@jSajr+4gD|L3L&bZ1XgG^3-Aw*6BquJDINX>Roi$Cg1yVi|fHO~pcF5V6FV0lODi zH(*3!HRjk#`rhFuNP=JBI3h0hzImqNob_z?%Rv3ecMWQSj{+u*1$1^;)xU4jjF5X_ z(b(0{BlW2Ax2(gDv$WA06{)k%4`ME-MqOJyWpuvCWx(p(R|)n!4d4$xXfCmm<|ewR zlF)3DDUW|KUQz}zV-RQ*uJi(!Da3S+o&sq}Xqx{$*D{2&TZ(L8A5m z(FVB+71mG@1GeRH&^C!m=fZ`ppwfEhFI&2lqdeaF_-;MDM|W!_;-adsc6={wQtNrhj+=}D5fS^d? 
zx7^nje$9H;xP6I7#Av#swX81ThIfwoG8F7f02}p95!YUJ5mfcq}A;cm* zKg1xFd@`(Q9z_aqqGpCI2c6pT3-{QV>tm002Tb{eCnK#v5=bI4iCcsc$;S4ESa+Z+ z+O!$I67b^C=zoF6+MDu6@)&V(nvm9Tn(alTUNR*%!6z~(9f#gC@j|ceJazIUb76aU zJ28?<>eHdfdG7s)UbSJy@#l8YYz@yM>F*_wWPedmbt8T zz@c@apkOFG&A94vWH1O6m&qaiO`7>o-&yEy;Q<@a#IsLd9nIZ(!VD`kLfqNUlE_?7^bh7jF=Wy*5 z>8)FJ$EW-k&(K9p-b_nLA@g+S^jqhFs1hfrs&5_P`N#`lju4-6JMH0XOsoZYcS#?+ zG5k7ve?z*5WRndH()}`9qlsjsyd+_Ka*~V5fi7p{M%1vvy{og;FWjuF})s36Unbl1r{g+6t z&9BJ|5jKYNw;l_o6_zkO+j}MMJgKV)U<0NkVM8Jc6FVAZkX`QEit-;{uyEl?>VXWt z>q(qykRK3FH^e?NKxJ24#gL;%>>$@p(xmB*`6Z7xh--|S@q}|T{`l=YrPyENX1~R?Qb~`{u2Uw0;$bSYe z$7cYsBzAZ!YFRX`ix9A3V^egZqPwT(Dwr}SfZh-WZck=zF0C-}(Z@B~TizwvRzAKO zr(jeiAjn91rsf|C5Cw_TN(b}|A06*q-M`r9e`g@&3St`;zUxnp3-sGAy2beZeuehbK`k~+ObdgvHpyaUVwqTEX~SduA}tT=Th|}q>%(&B#MzaI1~fG7MDNSc^hLy zMMDoM z>|=Mia6!mA2*n?$)*0o5Pml5FzYh#gUvXCwp%&vKZk|Grgs;91WL$YztgyZJ=<=*=LhL@)Hq#IieZW zY;?|V=B837Bn3>Ps zCt6Dv3>i%}Mdi*167+_ZjZGQKluA&WA?H;+<+%3Rbr7g;ApQ|ztBPF^YGBv{(I!BK z!)e5fCrL*^8Nw>fDEf(k=_yn7aq`MVq6o5byPn_OzT5+ zTL_aUqzmEg#?VAknx{cV)8dagVPPR0St<_A=mMhjfYL!Iai4)qx=tWsB5DK7R_FFR zgHs}^8w0nNdD76x^LlDxa^MkURv?}$|7pd!{2lG+YY`L zNk(idFXkIL`}IpKH92{j+Oy#JX#o$IWK3tD&q9a`YVHr8dUTjedTYYaC%( z`p{-p{_)d!ELj2ib^_-PNA;VP2M0V{u!p{&B&|K)skD{bXGE$vaKFH>EOX)7<#+0L zP6Ybpgpv9rWY+xKGY%$4vr|9zZa8&~uC7V5jJ8~S^K4nQ3RYK`xEGpJ1M9 zQ7H=F3^M743{D_ZO~3=Qe6pwB@h}-phVx#%T2p&K!`wR+qRBqY`|GpaP?|m}j_tV0;t^ zCBRuR<62au-qzIA3+5D$&8DvsmjA>Nz>ui6y@Ax2v?6?U^z_6OX|ww$?T_*)3=dkc z;=;oR$Hup>rk7YIJ+yUwV(qSQI$?VWLFLlKMCa?Cg+3mGvz{$iW|?lTliFpw-~HX% zx{>hvjjmmc8&wyY*w`*Sx`zG%4=?T8r%%uJGD%L3W%by-w7JP9dYL2pReX;_%gf)4 zR}YAVgz^T2+&ZeH2X>{BE?<1-dmZfm@5{_D{S z-Z%x_Z9>ilH(1|t`|mGkYQUQi2N`+JC+N+M*%HD#xh?G;TMbpDJ2+L~)*?YrK?Wd2 z&OO3A1&`!i8Rx0%y$z`mv@<~aeI@dzmV&S|2M?n&<}pGSedy%kYP|U9{^EhLY~D62<6vpP zD*O8pUCAsv^|B+V(c$5%w;|Tx+Vl75f#de}Kpa9*NP%C#Gm;zuM39D85dQF*GMQTY zkF2S-)if1`$-A3+KuwmPhVf(-PBg3k5~%( zaDAk}S*>dT-iME7x02@XA;UvTlshqAfYOsA5N)d4u&^Vtk4Uy{7a5t4lyG=vODvO` 
zoi-dlYUH>>Yej7?6JNq1e~Ex_#1%SWh?ZC*-y})gnaLUal>bXBA*f_<_PeuBr;1r- zz_#qBe@weVkv9`mU8h>qUtSgfkC&}hqJ7s*-Z}$+p3k~g&#V;2Uu{BY4h8bwzdl# z_pTtM>OSHrtZ8HkeD)2@C;w4mYDnzWC;R|-7s0A?A)f5Y%$qTnFcXnfZ9PpHqLcto zb?8r6n%~#6A3kuP5J)Mp&Z2xgx^QfmfU!@Yq7u1ioV1fWUB*_f(h8V8GbJtE%ei~0 zfZ~4l<3iz{SKW^n8@^^dBWATL^*Hy&c%kgibSI48crMiAW%ca3>`-q>Vq|GlZiVwcaCWm{EuBH~j zD-~sZ22v@NosUtI2JpUjDG`p20b`WY4@afe>C>lWcj_xD<6zR#+tVnyQ2xj3Wjxa6 zH-F1smI7~$Gjkfy>;zo1M6Jir<;7^r70f}?llPAvj+I;|OMQB6Cr1fMOCLSoa>n=f z>j-EhiH{FdKl%m_?Mj7(KEBO6Rw8UKB2)QvRJG_gqf~Brm(+8Kc*W*LUoC3}y_zm= zHFil4UAN}K;;)wH&0p#7J`}8K$;BS2eQQsxl!&fsk(bQHUM@}!o#_L;8c*jfx|KIo zEB9migeKQU-YW}({FqA2X39&-{Yxb|mUC#>=JY@4G&HX?I96cu?wEN_yunVnrH6tR z=V$SMsJY3hZ#8u$^EykQ>Sj)k?OJNzG%l;Z|Kzxuk~qsFa+XUpzF^Wdh^f$HX5XF@ zJ9AWd$IR<`oJBYD*IqB1->RX1uhN~x#P&UN+i}?&3pH!53i!;I{b0PqmuaGRSo>Cj zI6b9S@zOxN)pe|O0ti8w4KlTAU;fK6IJ8(r>2ssghIB+8a>;Gqu8&hs`|{;CQPyec z=_80@J?J$1YyNbJEBUx>0CdzVi&n;g^Q-f~c|sS;=>!KC0>Um?q9*=D6|8nt#mtT! z>7c2^a2s9Ng--rguO-!0zH4~!voJ(ZgY!dEn%ND^lM-PIfJKBz-sLlAQZZd7p)6Rm zoGTLngyj>NrFbvoaP-n;%Q!hXwd__2GWE%Qnk^HhZu4$Cd-7$*)m3g&XF{F(V!5=x zG;ba(s~+yMc$DsR%13yu#P_D%dz{xx`mAkvE4k3@3+KZ%=d%JHCS5EpG#J<#5>4ar zP6@E!_?(RQ$ic6*^k=3eU%0mVYRkH=YPo?8m-OR#`uQwSw>=9CFT2yuPuw_a$@mgTu-Ut;DIvX3- z5IeQTEr{*f!LgMtAAb1B&y4uJa;mR?9?i*-qq55Loy4f#lSMhIszvmoD_-&@nC~+x z-pimpewmS?+Dw;US>-yc4(rh7mO!)`00{_7NmbKIs89CwhUa^eat1Y^Z$f9@>hOv@ z$X2_$y0o^=fX`hAD6_;S2QKa-P%`SgtbjBYc#F;p9qd9L6VAT-%Tzs#oce}!j4pOX zTA30I)@`!+({ot8+q7n9RL_?xu>{#DYVlrz$05?+*Oa>cyFtAu>gb{3H*YpMFCPle z{?V*MtpdKy=Q>=q&pZcZLN@d#VhQiE^3qSHzb*Tu)*JI8WOsC%eyZyCvG;GW8(Wk!{T=tCG8B90zP2F;EI*yw$aS&s2T(~>Kdhr9TH=cKeasu*$SB&-~5kr7n8R`qn>-fv9t$CSbz zDKiDvNbmDyocNMmOi}GIz&VY;8OUt(yLGD$XkZa&P%Ja^TIe0j82K3*b z3+gKrw2@QSA#*sePFe?_vk#i#uQ45I$&2AQ(koM~@!S73>h=lZFIg&)uentmGjJfX z%LW}r7SxqTZi1BNf_aja?oj>rUEAGCznu_5xS|E}wHOD(Tto9$`qksM3vYJ6q!V(-Xz zwW#kmG$JO>ANS7nt&LV}v&|BIH>h?ee~*7$o7aMtPMo0&&!22y-c?pD?o-ffu4ZqN{cBlSx$pUQoK?g9mr0R zk$LSxpMDQ)rINvkQBzU5Zks>tgelkB5_!XS@m4Jyk11Gmwr#u>Y&>%3;9V@}uvY`7 
z%sD&W={)P)yl~HCI%}C))|U$ZJ7a1|PAkiUYE&awOlkDD{o2IXF5bS`$2NAZq{8-B z-*8dX;%$6(Coc&HtKPghCP{a3Z0KA=S9gl&+A#C(ZVnEsLOU1B9V^dkB%F_4JfETl z9jlbNd!NisMJxRNvJBd%UTCZ|0h=a@v&P(;!MtdoS}Gt?COd@IpzGP|!)>(HB89Pb zKDbg_Eynfx8kSsuhfBtc0*Dc6C_H@z7WwYM+JcHFc-x5=4z>2ZwbhJ_UGo(*H{ZPg ztNSLXNSxrwhqG9#fq%t{SRBW(C|gtz&Ow7n%KfQ=Q?tIWGcFc`bz}^uiJ! zlMV9S4uci;99V}18JK`#eCdY|tlE>oJ<>Dut9bNkN1|RtU*ui?0_i}zx(B;jzPWg2 zMR%5$NQi7Xd%pPTj&t&DD}8_7*U_9#iI(c{xNNzCGkiSFYG~EP*0eKMJ?Z`OwN-lG z?X%0tIKn$(!ymDtG)r68ce6~v(?Uc2zAKLt7x3h{E4Dd5e&EMTQMD^U>2}5h0Zynq z&Q-|Qhct01j%YfD*2MYQGW2Cvx2MECHML;=cB>NAg&NpNxXZYdimtO3;9Ye+yQeVl zyhemC3(sQND7Mp6)K-b;C271L9?&s%b@b^3K{gQrrE?4zte42i@ukf%as8nOVQ5$M z!zDbqy5dRN&Wg3{bVXLySJYeg)4zI_lA=bYJ+1AnJ$gh-QIX9zAxztHd)Hdailg6m z9p05%&NHTZYwx+=jW*fg`m8>T6QTtbNtCbQyvt6Et9~z2n<(y(%;Im=b`I(6$z0Ps zy>jytgXp*VpXO_Y`sLJi20h-Hv+5L2f&CdI$4%g*&?r+Z-IG(hqo2xF<`hbRe zQBvS7uezV_J_ zMt{-x!eVl=u9J{R8g*zfzSW~I(Z-EGnYigPw(gy*=$;ZwIYUe?p`;%qF|I)RK^*= zz-U(VB_ercm*;2GJzdqN^wTT9UY52VO}^lFQNDOn{ZP7S<(cOazO61M4yw_e3ekqo z1@}y4Ot)A0=OE6>Km1*{s>^T%lDbvw4r|-x<(#*?I&fh7<{fHI5|>})ipf;o9y#c$ zIQguDM>-~b?~MG*UGslxo5qo$dU2c`2hr_Fw4ah6Yl)bJXKPm`5*-Wl^6)@j zesgm1ITttaEBcjBevHdFn?Pm34is7}ix=0!DoJSP&dUs%%7;B%o{c(gD7wRP-4O3z zxU}ztrd0K|g5;AY&sr21Eal)|(P(gBx#6+XD|HVncm0-nG{`2l%AsX?!$$w$zz(g> zC2nu_`Dij<)K~TqN>aZ=H|@7mRk~f3;pfM>d+f`{&;tu-pDbath3Qt)D)oCA)a133 z&#@GohvvH%nRb);#F?BF!f2j@u!7T zjv=|Ig+zPjTcWzba6%43Ku|O`R~$HWD8W9oTKzS`a?^`55lC`%EoMPV1BW#6KBioX zGXyG)vpy-XOYvYjfzv&Etj(ot6kFg~xZqfVh29g_*=UulPg<|e-rt3dnY&~p>7(Zv(D_yZ8oi<|{?VohhWO-AnsV^Ds;#>3-CLWjzi*mr|>n5Bz z)fn|z4g&ayEh%P zz*5cJ9bIa}-yAcua(L`$aO^RqJ5ziayp3#BU7BX7`i4=s|P2{Fng- z93EKYJ2G|-8Qg2Q93B~Oe}}on@t{`cwJ?9K&QDJh5r50g&z}sHS-!!z=mDe_wSeeH zFj=KPA-!A!Tdxj-dP2{pfQYI z?l;HRNLt5(!W|XZf#YIu$>yeBD zhm3(?TGul_KsR-OQbqZmz{I}SF)b}^7%6NYUe*1zZ(hHSA;m&}Qcf-IxPs$0(=}>& zO!4*W*B#KkhNWSB%V=MGvIy>;7I*1Jjag4k#`a;F=g#L_(!*L_;Z)q?vY{vGSF^BG zgYn&yS&cHbGgp=+`CFw2c;{!0T6Dgz=8gPYC7}37Z0go_BmtYEwtmD~NW|Q`0_&{P 
zRlA{z(7v}`bz=exMd`-MOsO6@qI2lbO*r?jec$pC2x6;!5t|BbCC#k#L*ueil_N*$ zL7I$1EA|ylp-6K1J0G6TEgg#Tqb=tuED+DlUC;>6?g_uK5mnH}>2vvHBYZl4O1t5F zqs1BL@w-QBMdGF$3UQGU$R%Y=e`~vDJMDYvAA={%HrS-GW!Spxo$+FxYUy1lnVe`# zW^^6@Ui-)3t!KjVgimHhgon$xj9O<{H&)Z)D9SxRgw^d}Jtsh(O=GSYwXJgCfI0}9 z8Bk_pXq7IUM?M0V)Y8;kdjRw+O*FqmhJT>zFS2f(GAix5m)(!!QP3b%eA*si$?Wdo zZ29Io=L8g*A3A{pB83{q~ubps48XXiJm6S1gm+8MX<<5(8nH8eKZ<&uuoc<(ZfbyZT;UaI`rA0Mo=I%_wGeN zro}Q%M%19`5qx=aV2f(VR&_M(`!GFs1@n`OUmxO*hrjyz$$3Z8#rQ(C;8?u!{JXBR z2kyd`mYg+O6Sw#8N!yvDDB+f_&Y#pI9vwe5R5c#6gxt$a+sk7C(RIvh!!^B$JQu?> zLJsu~H<3t7Q3dxP@{Es9%2pwlj{*jR1qvrFQIyQ_4R{|bB^bozH)A>{5Rj+hxkEIS z$@gVtb$Ah_HW(`T5zR#?vqbWSXsj%Y7HQ+9I1Ik*jst_MuR7`m-y+}1ou_LLFcc`t zpIm;Wd~4@IFQrNlRf6?l>PLS z730K@+opyz5-2HZM_EP15x|7GpoDxdPNAdRuUH{%-@t~eWqqk?+30 zzj{XqZZQio-{Jw@_ zQcI3p`L53U$kB}Uu{_Sv(2sHctMZ?FWp7YArnZ8-s)8Vx4;>9Mbb5$}@iUkSJTShF z2lM2M{Z@y!D^B&?lrFEtfu#lL9OcBp%a=LV^X#JQ1y zW}Q<{TsJQ+sFNm3=?9L4cu)wS8%xRNILP-wxR%v;I`*NJQ*49T{pR{`^^{dJ&BECU z2SdI&1uJvnqaj+WL(b&*hEwM>q{JjQ5YHRdz?M99-On#GTLuAW83`-+rl_^V#*Oz& zyW)oVkm6!?F2wAIG44GMd#w)6D6MBp#j-JcNZwE?*Bv2$J{iDa=r%n9qsoT}ZIS5} z3eA0YbDse>vnZ_|?xXDt2rxFVHNFt`Wbjp3sBK?Bn1Gw_zw0^%7ulp&6P7kDFREGs zA3TUb>)@CU2Sq80F)}dpfI$g|?`+H{3CANM3tQ#j!FYRXb#;I6PoIH2rP-ikV6ZW# zb4ulm7r+2rmcZ$>@M)pPH$9zy@q$QJa{&2PO-ME>O59eQ3QbT2zag(ac; zS_OLsS=ciis0R1wIVAVeOAa47vgI~D&(7ypKDgBKnAAt2UL7VkfDU~GaCchc1Dl$l zA3Ki~>{ERpxeL^okAxL;|Ni~XD+Nd7M>JnX-!~KwLUn35;M3E8S5|+q<+oh635(_q zmdSFvPfE9qKOVC&rR5qgCl?u3ZZ8*_jj`J!kQW6vAI2|mB{c6vi!zfW@%_c{ZK`M6 z(h|mN`(oD>lhfe+>E&KXC4FN@(77=CMZUBZ_{HCx0$3 zn{+(y;~J{3)Z_7MnWOgZH9kJ@`Y zV>FCzFApsW`YZ#JX82!>AcXbfHiO8~66|6By)Eze?ES^Zt8EMR;ZHw!#wd!H$qCXgdG?}dQ4Ixt5;wA6Isv9cY>XjwL;+D*8zP+&epibzsia~{ar`E}& zJyRr)R54TvbGK9p(3i3_m{6A&yhwwzRnemPwB@ZBT*2*dNO zj?Q@Uc?Ro5ikOcV_y$}lfL?UtmZjsQzQTqFeyH%P0jyS6jP@Q8nG)OR)SJhtFKg1V zer36sNR|YkknI&#@%rT!5$)8!9n$v**%$$b=1{>2%uug|K@V_DiW-vN9IT2Z`M7-h z_Z{*>1@tBF-c{m^NFZAU!&tfkqc>_b%F#eDssmyl1`<~vqFhi>;rC(rTh#KQGXXvY 
z=+o4}0NP^JGH9U4K5z5Rzhcr8|7o=6p@c?&lR#FBLXK+o*MtkLZt5;?yyySj9bw`G zG$r76Iz|&GU?x|Ocg-qXRTwc0AmPQ(xi{)=Bme^|mx4 zCbxpKcEwjOLR!`)yvaGiZGMIwkL@lq@&?_rlMo=Yy8`uxAz1=N_Xu>vS~#8)#948H zhe>Ikl-(y{=vb$P#2lC!c1g0mL8GHLXaKTs$11}ZBaQbOd>DnjseXxVW13mO$;hP;y3V1Mrjm%j`NgN`>fDB%Ce&d0- zvDNln@v`(o2hdkcOi4`g&42h?+4<`YO~awj;Vd0?))&Ga=Y-$*+FI`mb1!_gb75uS zmBtXvF#aG`{^=#eaonaY_ZS9z)Bws%K%CdVd^zB@I;PU5lT^bb$7FfQ`}dog5d)4Q z1%K%PPlUcRlM=i$Z&5A)Zp@esyTpfgx-atCvp!g3+W8D@m@NDne?3IJO!x7#Uyo;v z?8GMsJaEo9C4Wzm#G&f*i$kO1v$O6Nl~2viz1qI!SFSn80Ta;+qrwUa?2AVtd-ltI z^3G5)e*{FFfES}5{n**^r73b!{f;T3x`*yn9ku+(2k8n#zCC0s9p<6q=bh0udv5Pq zz)`8V(*E1x7^Fx_5Y&;HvW(x_vCB8mWch?sk;1Ln6J9ea6~b8+=J%4mebiVCH!0OM zo*O|%#+MpD45M}6_^N>>w>f6vK3HxZNcJcv5~J+#&o|1}ZCk{*H4Fwnr*Vq)!R5lI z6ViZ$@NmII-PGMA9|p`B4X(2z$Y^Ln1emX5lbs4+cAxj;d@jEl?j{aN$p=@*)eJ`Y z(tA3;+Eo55n=kThJK(TY{hxLLVc8gi*okX-0@hwniD+^y&d9X~f{dJF6(+w)%UXT> z@VSJ1pPbVeiMESJ=CoSWB)6EJriuRH!+w}VWRZ#aS&nTNyDeaF(^nPYJDB^coN2|3 z3QpD|+Z$_yF+Sc=kfANqKe{`AVh_3l!bL*)62hAg-5&WK_UrI9|FXB55aDC_HoY$` zts$j54!+=KXhH~HF&DQKZFZmqEY}@!?l}}q)2IlF4ZlVJT_yQ#qwUp>WF?$5327a6 zc$U~x`n`*Vj2C&NvtF0As4{kTd2imu{jy}>#v<0rY3{80x>K2VpLlh>!z!hy3>_T! z`nE5XA?e$a6^i2Fk%DVBXwDlG$g|1tzkfdxMK25i4x!yZQUVf{NWg@C8IdA+c!%-( zgSoeA8Hdz~t{=x7@_4Mb5iA$%H`Rbu`aE<&;!dwHD^)^A-v~{U#oD$t7c1P7b+~7a ziBGW&r}q#3YVOPNEw6Eu*IV}IxND&SnZ`4nK+aia6CHB)x&p@e`dqo9r%RH?miGu! 
zU_jL1cuU@P?(1pnGIBE7=Kk0lfAH6en-tTfTIA-@(8Bj2UkF_53T%9$`P0lZ+>Xj^ z2{sSzjk;mz-Bkof1^$}mpBx^ZVrzP_C5QH^@YM6lJwYMiFZLE9Qzj27CK;&MvLid5 zVvE0PK^Y^82BkiHV=_~u@a+qFuoD9R8G&$$W%826M@>6H^7*JQOH_~5Aqd}7$h#X7 zV%@knl`Thl1fQgrX5r@JD!(*hcsy#Qqh;xSRuNJnQnly7=ShS2#OX|!P|Kp?#Zy0~ z%ZYF0KK*w0QIyq;%Fp*v(s@2Oj@02=ukJKjrT)BP$FG}s>^_`Yw#MeKj0{rL(?41V zl?vr2t2)H28u+w}+wm$=cHU;2CYf6+q~BWm`uJFYq6z6q=-h!_z)R|CuQsGt zOuTfkt5C@V$RPNFURz>QfA+o$O*uimbn;W?x3V%WUK zXeQwY&J1kiJJ>%xFh`PU1|@p}6PO)f`K3=%RVw#0>Rkti-Vaxn)7|?(DZItIhEtXK zcCeI?(%tvr4Cr=P@#uG_Q|K&hY~s@v-cq0Zd5dyvcV^wu^+P&SwXHkE^o$ny-t&7t`B5r++CgQPCb2yu?8NsI43h1p0Ciobak86F zddy#?1{a2n2|Y1UZiDqG*Pol5H}`Q33gwFSmDr}77{2>&kf>bJIN*e5wint`6agQj zf{ZL350H<++yJf+nJF>HZ~NQvueFov1thP=oJ654TXi&&0N9DMii)IK*ag0I^N;h1 zokSE$cQhpjTlrNx>%WKam2&fR3nWU~L1br2kPg$@12`MvH6CCKp=2$wPQiJdNW&+8YeGQl1r&*saH(MLq7c!FlVtzpExANh<$s#V* zsTm0~ZjPX4?iJsXT_jFNua;lN(O%4xPkuM`y&>N%A090mK_GL58wbf5>ei7vQ2^34 z#E!D)YU||M&6Rp4la(JjkesM~j9sQ&G{RC6spBdsy*U5tT}V_Fmc=givYE zyNS|X8kFjHoY&p+eAf5>_xij%pH%mCUFUV4?{U13_wmNq%2Bv6dDJHzuo-d=CfBs@ zdy;G#5jZovvhUM}J?W(_I+ZAZla$_Y-I~_wEb6nFAcp2mWjz=e=sAKxZ<+uEGfBPD zoCGj#1DkR0M{x={odyqSLSllbY`Bt5m5{d4v~Ty)>M^r)NF%Q=!L>zQrv=bR z*O07eXYq}l3DwlmmE?}7Bmyl4h$K7GD&<5^fh-q@L0$|?I;e?cbe{Fz2!*w|O`+o| z#>z=WJJS=6g`5{39>1hn6P8&U@tm?YlAf?OD0f88OyJS5_%p9qrN~zUg{b+^kx5_xnr#%l|i(sp)OMiQhSLFtR-bz-o901JnJN$ zb(uIyEfWZ*tV+el^|29`GL1H}Wh1!#p>5Mszeh3ktAbXhGpHN26xA2;^TB zq*hUZ+3eMjyOu-Owv;Adh{QaC8<3pHgYO2L{y|%~YZx(*KKhtFLs1Yv4Sx+@%rmKp zb19?2@11fFNtj7!=(N1!@HhO{^0-=Pd+}ChYE>0Dh>RK`y`*qpAR5pCHMmq36|#A& zIB?Du+_`&~L^CNIoSgCK0aNX2e0J>uie_QY=0+Tz$6*L0?)g(nFUmp~&w0h8KmjVC83jX(tv`BBVI;d8V|b+SHWl64;D z=+=Y!aEqCp-SLg(zZf_X7E1#<#z2YTW6*FSohz?MpE+*&NJ}$Ay3#u$>Vv_PY`GsF z{t*A2t|&10k|mjLV){Fhx^!d!x*y2VcfbTkfKmb=YHQ;BsrXQj$&JVK;lLylG6KC` zPDZA}B)!Szjb-!M;c4x@4$Neu3un5kB)9G?Wl< zsc8bCrYV@MqRrC+!HCoT+v*Kc9p)>*HHT2PyxnIopS7{Cl(^FRS}mXs6#3 z@;fxQLJo|5ocaA-0swGmeaht+l(kiS=Hpvc%jV}xA$90N)LR8W75!j5p#Laieh7L2 z9}zr2)3QWRWZ>102aeOgzy)+ats}q}w4oIt^3Ip}Z4C*A&^S;iPNq`&0ih<)3YwXC 
z&>)VHjTOM7;s6QQw{E?uOvnsz8krTwEr?cSPMp|}0cs205ysa22_7Dsu6sX^3yM%V z1Wg{|&DD9+qkG7BanI1asXjBp@{_jq+t?tR{8{Wr zvBe}|Ox)shI03hllx5Qw4dGkgNtc-^TXCE{i~SqP>XI^-^ce^bke#&~}&bYscd zPn!ID_Q-ENeoq14BRm$8(s$${dssgu3>MuO-kCvPEzuJPIB>^J_x^PJo;W_#J z`!qZ)mKw55bBgC>sC=i)O{*!Nk1Voh4>EVjQXQQL;CXH0=Hhw_3fkXKL8R)|Za*2h zYFg$D5+rMI;$=Na+-=$d5pF=&bOp%aB_{B+1WDi`ZoP5ZV@^5hDjO(V@Dtq*A4PYD#MRWU%1^W zjgRa>l_(wMy{cSiM-psw+o7Ii<8?petW3M?!@hf)1f4*pxOCaF z7^_{4F;)V0Q4p$9M9Whds9;g(Ld}2<$|hdEs&H5BDwR^_soIY#4}NJ`pQ@}88P`~= zam2jWkv0IH2}_V`&k!e}moSJYS3Hmj$H)$HHyWLbhbJdvBOEA_s zbLOb{+z>erywIWDOZ6qDs5CkB#DR1gVdxaMS^Qk|sgaMHXo=DeKfk20RfOo?pQHke z`~a{%VlYwYX&>PdVuK`EFCe>xiH573M{^p%5Y~Zc#jO~=6G% z6_5G^+P<(Xf=z$b6~?=kjZNl+NfPis0@t)3chEL}$6W2Osi+}YLdt)9)0D%YZJp9L zfnpmtIF#J0@ET*BD+m47Yk5?SSTc>b{qLehavab0F3brZ?H(QLYA#Uh$3pZIk zu5PbV({x$cz3hKi>UzTvR%*_mIcyd`IiP^x2vQaeodV_ug7_7Rm*Nn zI`dIj3Q@KdrV|p*3u2Z+@dOwH<-q9;xjN98n&=GiC>~>0)p!noBt{&hL=g^QgI(G*9bU_iVxYR0<0(JX#Vv*&{0k?{N!njNG3z(2`$)*)92|^ z{rI+Q;Lb80+)d3;Y__1w&TH=uT-4HGCBg`m)~tB}-d62T2jE(MqhD=OoX`5~-zKnx zRQ#fBv-Ja-qgS`>As5QO92yZ}1jLAul?dq1&o8uVySbh`K987~2Kp*CzWDMLrh`M9W^C!J8vrhotPe4Dl3x)@Twqi{_AeTz8l zf*U&r47JThZQ9p~l#G=%|FvU38@UBq4+aW%N5{kyW9#MZt!I4z|0sF=7)|3n7(@~| zQ)6@#cL)d25T$5Tza|jOrvfUmgqD3E3MAVQnalEJ9tE#ww{0^VAA+|Sbo=KFf8uQ) zBY(qMDPlZ1#EAb$cQPJ)1@@ab>JWOOn>QZc?xsk5cz0 zwuQ*LYrEzU&KEv%3ZA7FEh1no;`{m!pNfv%CPG}$$`9w_h9c1>r@^aGxVpZyP!KE{ z53%9f_2>Y8eEh-2D2QSj;Q_j_fgoS^0zUCHY@H0qH&#ng^gW-0G7<&S!g?FFy}60% zTm^ayUYdVBe)GnSPA^rh(@22|59(#ql6Bz_vnw<9U>A;88E8poA2XbMEtXQ!N!dBK zJv=DKB8m)$lH6+y_F5kSwK_)LQ+8!FZ4(%PI=NbCHZ`y9AcO5QX=bNwlL;CXUX&@l4^I!waNc1J*Yt3+yCg>(SopC=HVq@$s$X?)I` z5Hz|6W6rT}-?BZg6bK31(7NQKZ2Hp#b%CWSc@It*S&LJP+gP%9c@qd&(3lF$+|A7` zP4XaYc19k=w0_RuYJ^=ic%ep4E*J-n{;34RpjyJWs$3Y-|L5`}9^zw9pMCaBb(s47 zdjz|pNH3-Orysk`Wy5zJAt8U=8Zq82v4}s7^8HRlC^CnECevaJzPG_xZ@nP~f>E#t zjfwrQH43tL8-&+Zp9?#SkV+}WNRFF-Lb$HwxltBB+eUKFt?Gmn3A*UhKEQV*4+3HC zCFMV8NC0<(DQ4JJ5$1pl0=zII@sYA4saefed*_x++56OUHm2SLRnJI$JN5jgKOl}` 
zfZ@}6qA054;j2J9yP$cRe3J_xx+ye;Lmm4wrNCB4et0x4e)RPx6P{f*zJ1Sg)*7DW zpgyIaABF1stbe{SUNnEcK3dHQQNa2bG0c0(hkP5*&VG~y5V=GxsoHoPR8yOzw$$Ae zzCueUQq*zj%{)Fa-(KsYc1@})z&KOVNVSZ%9TEPm;u-7oFk8Nsz}_E>{a93Oe;d~-;iE1V*h3$-Y&C;+Jt8aK@}tuN1OJLJOF zm>Q-YPd&ZlDR7HU0Nj3=MFGXtlDKpMs(*qgc%?MK{S&{Kn%f^=U_WtTiZjLe)VRu0 zrb})I$>ZivVJneNORnsFI24EY?8s&u;eWG}^DrQK%Y0 z3q%+Cg2pd|ksmg^gHrPg!kR*;v6l|Dj57Q00jfPc9n2!5ZP6xr+1iDAhRn=odU~i= zb|n`6f*TWVvBe9St_H|OkYF1Cs%?i5C~@F0A!{Q)8M%4x`QW7hD7JctZ7JBfA&ebc z5JDPR?F|68qY5&QVdV$rr)yBl?skz;XKc^d33;Tv-fgh=P1>rrPQX_m~`HVGJ~{RV{%sXR4PG;yo0kz3_qYANUBv4!WD`+PmSiTolDHE z_1ZamTXp4po*NWZ7Tb*9crTKw8_Eb5X>8BP?;DP>BBPS;`j!ymA3$xDId*vR= zLo$HF6tOC#l%G8{@z}@oPwjO5b+IlS{#iLI$s^KuQZP1yATfpD4XtL}_j(lM-a{>0 z04mNDlVQYlX57^@XWG5V_gvDi;CjxAl+@oFo!F^^_3G;*DNeAnaGJ*%v~%y;saw@q=PW-Vf=Oa9 z*+yxEvW~I9m2u8ai5C7#?`kqmyw)bNVs4)xO3?G3-AoBYmCQaQ;M=Rn(dg!$dtR~v zl-m%Pucv)r6rMv_z%<)MCrOrH#aNK;OfQVoG|r&MQ_m`?3ll>)0ElfY>ENYU0kU9V zl{EP+wBy&ukw*3aXc457Gy`Sk?==TIX0p^Llb03?g}72%uz3(%Hv=SedXpf!RzMP4 zdl@fzE3F3e^YzPc1*8Bx5tZE1!{g(%K)f283!j9+-1t9L2=HI`wT7`e`)lyFN8YZL zf7%CX@TowtOqm|4NEHc$xICC}B`8vVE?6bi@tUIPV@Wo2oj5wsvs+^_51ULaji{xe zJt2$+!Usa|<;{h1LJocd@MTH{sUpz=kJIigj0p}7ZhTw@z^odtt#i!f(}QWImBW>- z@w=Ovi6x=Y!f@kQk(W?-v80o#6wLoQ3dW_GZN}0tLwAm}AfkTzh>JK3q|v5*7i-(Z zoReHU>ZfbZ4lQLG`RAs9H(K??rTdL%00F4QHo8nzPiU`DVq!Xn5bz}=x^mz>vk)gY zxlC=~P5acfk4xIdUm*0dQTjNw)cHvm5)}-7V|Rt@o@)4|&HnepZsTU%+^;>qtv?7F zyfsEvW=&<;-;{ICF!ZnQuTAnjC$z%cQG$9XrEmo2B(KfXFlT zOKD~Yca*;|q<)b6RkmssE#we44^PF5V|T@xJ#<+8vW)N6_pe+3yno4k@n$M>32sCD zhuL;jC}$X16H`ueqRrXM_JqI-0Q71>tqVazr(_66hKPbbrkPutGKu(^tF7rcB>^yBBf7RUNCq<`>_uv#}qpUOK_2# zQU4(4vOO7|4NHEL9SOc3iz+!3`6v_X?=~ zquE%yHY9cK)JD?*>a~|85(JnessIzVNrP`dz!!Y99#sON#ge-sy44l%o|n@W;dNkl zm2P!DrS7elJ7!yF)l@&k*~R#*D`4{x6ceG1K>GcS2hOb5xN?;d=?*TKpC37{!RLeU z%0O;>Bg4HrgeJuPzECn@o?V{{qTGdo>tYp@jSX%iT#ON%MBhFbVm0uLgKsjEJu%2k zJ_reR7^;T;eQ?$CLqmZZyhJZIasHj*{c}IgAZ|N==2K#Wc4jiLr@rsEA9b&D_I77N zf)e|5bDs=Oa@c2NOa*;)Q3|6Py^1oTMKN 
zRH{{WvD@i{98e1yvTYOUInhxK3{I?~_Qk11FTY;Nt=Ai&_Dy~O03vGyoL#U#9cd)T zjtH>vV+72D2k=no)4zYngLGSNJl<6)HS@DjLae6cp0a3h?QXUOG1`UUoIQ-Y83Y17 zQ%>1n>srkSP|992BH^D~=AN%o2R=a1QUcfLgBFa$m9xvJp#NlCvmZFN26&UJ$%#QW0zI5db%kpS)N?Pybyn0IN_gcb@!JD zm75eg#!`5_m3bcY=v`TOMI!}GJ}>k08ZfsnG{B?g1D@9T9h zx{_M&KDzLz%!s}#xnt^2{?N}$v{!)Mr@*5H!(0{M0h#(h*rTum=uCN{sMcG7?(htk zA*q|rnBx@-O7dZ15K+SAj9PEq0*)t|kJ?kG|MnbXqnA?$&$W^|7ZOz=Uf_XXoztrTDI-W?K^NCtSA@;tL86MZ)^g+`~FkBWq@QvczB0 zzhryC-gPW2Dy9Mv=y5^&d7ypA26?ahBA>df3{i4NRil#7Xzuc5osf&D3~`QcvQg`? zx%9NJCoBD7lim;gAZjsI_WPgqh0tP?RpluWyTRbhK7tu;xcp3q0%mF5YdjX@liu}T zS*{q8BsPW;btM4rZS+Tjv6vV{(5{ylDTba8CDhw1kg{*{YAkkv?BdcrM&uS=!?vy=L>byt{=9~~u#f$0~gcaT&Mc%C2~ z8?YK5(qDeimg94!PtW+%KGy)|gom`!g`cU#z&)79Vt8{m6@TFO%6<-gEot3Ez8U@I z;+#QnFSnt?QhMp)#ryls5dafLECN9MkK8Rv%&HE(M^=hoY94vwe1?sQj<`tC)G&57 zD&p+3e19j-B1@*`0+j-)JHC`4pFO&Aykut&0E$|$Ud+-kRYwp5djb700wcGq54bjh z0kk{eOkR2O)dk}VVCgp;mwsyV!XSa1Fo?P!cm$8SD4_ktg9&_4kvK;ab}y+SarW^i zltrynH4$yNjP$VnDZ2%^Zgqh(zl^S)o*}rs*f9L*IcUc z-ECFT^#txAi=OCE;8-1GoCd8}k%HVFzUWT#3SUXTT@GT>b1 z2ElAyR(Dc^24_IV;}^Ofo&*M(roloKKh_#YXOU+pM!$@&yCc#CsIMFVYsnY}PRNUx z_=%p>a86OGF(oFti|f`oa?@~BJa25@Ix&u?*7AoB$7X~YO(y7%M2UVdio z9_qQn14AVWWITLPYWXDsQ9+zn@h)rNZnof+qComd{N{_#2&N#Uu#XOYYAr6`HRO`s zTU!36%2{tRi!+Y7^ZTG(9(k7T9XO$%P=H7(k`L^F}IUNnNw3 zF9=l}G7Je*Ntp{5VNfS+tMs0!a`GB2B(1sOJL7PD!GXI2=1!5*= z{eCroFJgVU(D9W#x%TM#@e=Z1n*3nhi+zZH%du)+z#~yAnVM{wsUfBgnGnfn(P#5e zht*Ri&?=`djV|XTnLb}RuVZL$jkFYeXtSZ`3{K`$B8jd;f`B=+I! 
z6mgf4c^PJ=Ps!XW;!Oz;550GBd?aF#6#te4w3JX5h)@d>W-zZRaV+sR`3m+H?|6#M zNV;D^f)5-#2MCoA*XsL09^4@-#nE;jy}o+!&GD$K*5_v0zEGVYp`t>bc}rs=c%X*B zGQb)wCkQOCeV#ubP0Hf?wEp(;kzD}fL6odT)@3SJu0Ao6tiy^?XXW7CSQFm!$T>2s z2h0&JY4C~(OZ)rMTk31JLEYY4XIa#~X!fT|vRe1ET`?IJ578 zXg^4p*JF7p=aCFx?K))}V zI)3?U2ceqJ`WN^7uUp*+y46LvvM8imL9l#-yUz3hD7xa%Mxe@lZcpyd7ZI|DIM9@P z*7Q^;co1C}EcN5u%%<4J@HZEH>T{!1U(`+zT%qjBl@<+&`v5@6RGWHEnEC#kaV>^3 zFH^7+pt&I07c{8gvYcrC3Ijw)8O<(}Ci33bMLIz7FT<%S9s?VMdE>@juiRyKHT{w< z#MJlG4<7HkQ1Gfk7#DGKzmCTuq%1%1(d$j&wg@vD28WRr;5x=jJ7JkRacYFjOnYsz zNqYckhXrhrsh)HY&qFxm}XFg#C-%gAUDh`$gzsxTvY3lpV0pUo(*7> zI%I;72vE&Ia?c_G!YnNe1N)ooec+FzJqLUiQYH%IS!RuPWI_f|LPw7@*b)?}ZY=cW zY$$2Ec5QU7^q*Q?WS>eFymq#nVn)*f#7K_Aiu-U`_tuqvp^$k2E>8cs&>N}l=pn6p z75-|{0L^M(1<$9c#+_@A@>Gx_dgA9Yt~{^LMOs`9xzU8ty#K&CJ_~5#I<&^SSnq*5 zyzylyZt7SQe3!HF42mZDH0Itv*KWA%cVWwCm(Bfw{ULjy?dUkpA1MeOZE*J%_jf&09Z4+9$;ICjC;rFtc?g`BqRb<~&}^CdBC>WY{1a z-&u-vM6=_geJ(0`%!Tk?x$7syqNLB@{E}N-4N?bTtf16XgBtrlb?)WZI-ZD_Z&4=M zr`Q1fsZJ6vz}VZpEqPRJnP}w%;2{Z4#2`iLx8i-zOGV$_j_f8!?HB(AiwvEcArI)5 z^$y5j8r_s%YC0TsSa7Zx$U*RP5J4Y2{ZT20S~1eApvU?;3?`oTIQaVd*zSP%^IS+s zvWtpmXp7z2=0RDCtOz39zHf^hZ~rH5vO9)KxLfJD6WzQdPu7F_W)0yB<%m3HCby&X zzO<5K7Z^CCdqvt~5+|m(R{21)UP>4E8u7{|gG|Kzkp3*&W_u}!03>kQwa$T|vrl#Z zcaR0AFQJBfVzADa+{}Jy)R;JI8+Fn!bb|np7!&>>i;9d|7kezqCrt`4FELie2CYK2 z{f#^vrT2}SY&**~6WigbgvLOi!-Q)*;=u7A5X>l5Ko&TOm%JNhI15>{7b=^kj({4f z!(`VpOn_U67#oem44S_=4wt9t>M zq^`&M0{q|ZV2rZ@SrQmztdl#$Lc@_7o6u-8!`cI>W*T!iEo zKE2%{lS>+84Z@_X?fFo=ry_P!x_@7h++9WE4BOgGl#ShM-lCCCbXTcTvne=k#9sve z(Z88o^CwGo^@3m>!lwaSOD}BzRAq$j?|ZC^HF7i$+>}dkwj4N`P)qKBHs}l$CI_T! 
zKp1y2N`XYRf{_l>nC+~X&Ij*920T&fI(;z>Z!I2G{rn#5{hIFKyXE=mz-nSF@197g zC0C|JLMv{=3(O4G2OGkBmkzaR$|&w7ID$|G)+--{h~Wg<=R7bI>a8BTZm;KGP6e56 zj}IV`vnGUcP4bkqjXq&>5-Yo^P*eeJ@j3WE_&q3xoq;|Aa>>MG0lO2>Wha(Dr71*e z<MkHA^FiArGCEXM(8+`5+Cv5%e@6aeb>5i_)9z;{M zq^;-=VgI5j#0w5J4D-1RLFgh?PcS&Q zdemfKx*>Vf*Z2G!8io$_ps&7tq zKn*xpl8_BaSSq_GM~P5XO%YZdpGV;sYhSIQB|t1}rTEB0gnlJmk002sRgWIYKxAG* zsu~~*z}p0awB{K5d8Nc}1sIZq@=UmgZuB07O};58;BCqU215WY!o@@)0ALOD2O)y+ z!z(9%srkeqJzfF&S}2=~le)0CvoRyo2{BrwT^>hj>${^!a7hczvTSKo5QEs(IVhcT zMMXxQAeB_;{uwjiZv{clm2Mb+5JET2K84NHo@U$EpiGcJNVcw9z4|$*OAw_bLW^*0 z6}T554HCYE15MegB!Jp39~cWaXCGRFFs=uT4HYyCSA&^H6g!ZfDvON2xXV{TS}xmj zhlhs+F1%d~F}p5wb=S^YDR#c!5$J>f{2?m$hcWm98|zy1RMO8ZEK^x>u7q0VDU|F} zoD~y}78#NQ<(6gD(zwbAd_})~x_2m2iI!d40!B?fk{>2+07N8^S$qH#0wTDSPA8Cj zceual89*{}4%1K_6Tm5Wt&a1rru>rN!&u#p6RS+wpsiD zkuD$&zj~r1Z9y;u@*E@0mmIa@YkM+6Pkfvk8I(rTR#ornfR@uw6!h9QF{2 zFeRlYKmbU`5q{nw**M`R7MtqgGPoIBZ<-zhK*}=|`?8@YHIoi4t1t|Fh$(u=RWeIk z=<|2np{7a-{B8?9CiPl~`W1$r?cB4cv{49ETw)I?4l6Bq0#17dXUk8*l@#=V9~0Dy z@U8^22n|XA#v77hMQE1>lN+ZyuTjIi?>FA51K~`ehSZ6axO!%)WrkZ+bTXq5h8|R3 zr1Gj+c~*{H=GN7uSdid4B}<6JC~cWn2*!#b?);k|q1+P6TGUP6cUhv~%Sk=$4vS1H zM9D@)-XY`nAiplW=`*p2Zdq>u29eT-yEFbZVRUT@!Lnu$cAg;6AsEU`CcY6S`DkmtH} z>y&g%n*0b;VuW;wAZ#EQj(~C9-gJ$0A<_{Ce4#9#((x6E09Jz(4`0$6V$@D(L zSk?)A{P^+3+!ZTU#E)d@)~^5b`E&fR!x1LcL##N~xOfb`LQ!R9WosYY z8w*WmG1;drHUC$ary^T+s&{hbjn`}pd9VMIcIh#G+Na;&xl60E30nHm=*oTC2TRnA zcxcQw?B377v{|0zyYDTtGGUB=Xe?kK_w!sNMIA-#YXC6s&5wNUCTbX%q7M8A9j)!& zMm06abFE;&VqjBA8iE#KV(kcbRJifVbuI(c3r5F||FR?d9s%FeKDcj^BqYWLHf{Jd zp?4&!p3}*J|K|MtyBJ{|+b8FgJw=e{0xudlD3>%8LkPS2!UZ)x3h!!aPgp>dBN|x-6z45<_i1PcA5J6;H&vr2hP^ z{tvAc|8S_!?(61eShHTwhy_318Fg zw8wp#J8_S2ZPK1pKD%<+GDQ?!xVkdrFM+11?uqKqS04G9US3iw0JF)r4+xPe*fLl~ z$aNl6%PcY0u82aRGG0AMk?dx-%Y^-nx$8Lu;0aPQBv=H^u;}8PD_#1f$*&_spJ*lE zlT35sckB<4v%FuQ9OAMq3J{`aC=f1H2h*-GG<(=59vq?QJxB+N_@RpAuj{)eis4p8jJzUbDV)ECr@gROAzFnH2B)tI_n!X{jsRGDw zIn6A)PYBE%b(Q}rtCMXr1tO*|P~UCBj!=z2bxIfcDNpeVSG5#S0>=;Qc%C#1(P 
zxT+GzKujesyZ!rjGI~Q#6p9HXj=jk&EjuF?eBUvpWUM4-t^Es<;kwD|FCH(+-Hu)b zlEf0&gKEX5(J&#a>gFC4e{z65dWI4)M>Nlq?$rAU;~;MupcCAa3lS~xt-GzgM~SjZ zR7BuDIz7!RmVFX1UJv*N!s&UaYTdbiUm2(Xr^`#9K7fAcP8tLq8=}eSl+8>vBvlLA!#w07Fbfeh+ZA`YXnN!BlTDY?5_(oA zNJ^lWmD=QtrzJ8_q$N8g2jghI&Xz4(=HX1k^=uAYlmn%_*KnR<(^j>w={~*GBa9#;#$uT&yxt9l-M)-NCE*=NtXeJww~4U4H<`j zP{IC~9`bPg(6;mJ_0*BBZxyw=L27*2EvOn0tAKAqZ@BBe#c3wIy|khYYUc93S z%6O2zHJam8U^#)3o`%xJ2y)iP2EI;Jh;gi8%_1?GgbgCRYu2oZ=?!h z7-9;x-Xt*d@xKOb3FV;9smyp~SN%Dc6$!;wD~JU#Ep)al5hE%opFa!HNy!}oGx`iL zqE*~_-v&{ermY#ZXvBKr=jYHk1*9GYW$YYJE)3*Cc7VhtKmnZES!A2R-^piVn#%ok zU%0sp#pBJ=S*X2`rLSb*kok_?muSmNeSy>-_uz~}HqR+Mf0`s4PQNTt`p_5%+pQ04 z-OQ76=08E~FJ_EeqwzvwOa=oIYEc^_(LziSc_*|4u*MF|Ds3PG2RX<;z}z2QP* zdY?^uM+XnsulW8Co6PpVd2N&7S~c?0BxR>1l8H#@c{m7i9HTRlX2(2d?nRIlJK~{$lT*2XF-X`k_iX4SYA53??*YM8$tjbFAiTRG$ov{=nndY=` z&R(AVT8m#njGQ3M?GHw8q73v8EupM=$*ipzWJ50l5nv+)2ZRocniFl86z!gjGI*>= z7Llw%xRPCX?61Mv|4bYh$U4og2(BU(`b*?eM4Jj{-jq=mJP(@Q#@eLKL~5+q8Rmsh z@7p0x7xy2bJ$)8aY-J*{$5h*Ij^%f`n%^Y1E^A#)`+5U?_ZOW3&>UPnEmQFgWX7%6 zehXw6$u5hbUWZ))0#+*&rTqGWga1Gle+p^Wthub&N;`fZx!#BFHC)o!E53IXje>Y9 zc=;XFdP^BR5L%!&hME!nGc;E_(vT>fCW0S~^wG`i58C@BK+O$7SY=LI2_h*3Xjqh# znn5UOx7y&z#XxG`Q7(o?(krSu2o_<93Gs{5qY17I(FNMi6&_4pjj2XHQt|3aO5Yy! 
zym(PDC=Bu!UU7#~Ss0z*#_()K;nzO%T6Kf(QarXTz%NSo>nd4BTHXzWQxDleI8co; z;~+RZHXw`)_vU6sf+ui|Kj@{{cMlIHetOeV5iE>QX~*H#**H+!czItyCj-e04hQ_5 z3t&JkLdsdk|H7rF^V82F5!XA0j@(Xl2}8)*NUrXjD>(X=W*HVcX7e!QqC>58l- zFiSv8e^!kTvQJ@GhtMeUN{D&nTbPv8%<)?b4D$_2aw;g!QpU3Jl4_uJzokH(sn`+Z z^a4Se0`XLFd`rkVa`FT)3X->7cf}wP^gyd&f_5Iy&CIaPM4F-4QrzUfLJL)@`b4vB z%guB+49Y7&79tu)8X%kchm_;Urv}RntI_Ud#bg~a#VM{mj!ldX`;Hz6an=TuI8f#0 zvQ?y?6_78ZH;Q)fexdH^h)Yz#SgO{E?aDdJckYYMG8|I=T2w+2ag&)8_|v@+3^4Bh z^`f*uB(ol!a3EcN$j2mtCyb6gp->J6V*4U6ym}un~ZO^rb=|(|ZyzNg6)uGV%pdd3-N0b=6 zo_?5@PZmZ&*?UpGr1I}v-hVH(?$IXSi(EFVIBtZ4jWz}7zL7Qs44n3&=YlD`M~P&z zw9!RN*t%~d|HGeV1kSYR5EMi}CqM1rFadjwIS|KytLbK%;3#+3CIqE_BJ1boaFxb- zZ~#o{Gc|qt=&Dvm&p)hkowb}wQrHYo6?y;(NeFO)=f!I)hjJC^c>=S>%!lIrS+mXA z>(so6H$5t`PB+c0`&|RIT@-3J?wQ|=`7@@-t}->G1}1@ngc6a5t(4t#V;fA#qO~6V zZ+F4iKf&R!sBnlpuswuw2y&xhUrSz#*mj7C!rkqd{fsR%cX|Jr9@D-ep1pKv#xjsC zT5t_I6N=pPh!zW=n8g%By6OZ(OCe!!M((QO^Q$W$2o_(K@4o()&?^sCYxn=GBF>)S zE4M0EWdJB{Pb-Cd6u3``kvwG0t*g-x4$=ElmNorK2n-uJA@kg>sjh#bbUqOp)4AD< zeu1FWe^m_So-~=x0)lQk$aES_64twkbOdbk7ldr(hU4#?QGm!vzu@1joq39=sm3`s z*yTwsvlF8Qf-(W4*JngPLebRL+eD>`oApn1)vRcApJX?5n>;@MfhYCQ?

%xwIOEYotJ05+Rr3Ggl(^jH^pGX!4sy3?(O@HmI$p#Qz<50`nQooGu$5Qs z2sE!+(#)a%QX*mhQ@tkR7kgE@^U0i`6%8RX7ysm8H)2wHO*tnor>Q!Viid%5yW^?7 z+?epZNm!L-+IwLRKJZ4q+hPvqIYx`;(iDVGY6q%xy)4~vfwkgZyE>7gEnU~XRctJ; zaJEeJeRyfW#QAoi6lduOu(|fXW%$ngDYgdZAAh~kCN9?7^GB}o@wF@K?R&Ro&la*QNsrIHS6iqS8pg45yDSho*+GQK zux8bKTK+hz`*Y{B3mTpUhP<42K!zR^LW*cQ@_L!KI-<8mltrDIOtyB2Io`j!<`!6c zE(oK!q)9A}j7z+lse8kJdMfqVsQ&noEHfyizLx%{jp7g5@tnUOfYyVn&O30k!Gf#t z6kU7Gm}z}bOxz-iC+Y>k<3}x)-f40Nm$|5o2!pT?uA%9PHH2(#^H@nXR z%OCdp44NU9hoxFxvt2{^xw)-zc=Lewyv%Jn1JfII?GHSR zE3K4}t9oey36zU^DioPE#JL^tiSZhMgd z>(Wo@UdFZ@;<>%%pDQdqGUYy+pGhjx&E<2{(9%f)H|TW81p6DdYxzldq*UxGi>1$& zGFe4JaU^#U?P>Dx-6v@hs_HAg>B~BFooM#VTlt+O;P>1et6CFzbER*u2{}@oY@SFC z)y;SDhT1!zqZnG3c&_?O&(EmkK}ou<4_Lzc`RTQzw^f6K3T6W?n%LHVHV40JWBgdj zn{Q_6KFcI{-6K6KciL-e2<;U=RCkU~&01Q{M^^60jsyLj2l>may*Xn%bD^8D+)FgR zPPt@+z6k7Z>vsPluT>l`o+e7ZyW#U|ElcRrK}o?S%O2(xm|90k#u1T_uA)kT%YC0M zdQ9CH?k6$

8--zr=0f7RXN-Oj?#)N$-7Gf{7$X)+UgLk9#3=xcSER=Q8(JoPP7D z7HaYBiJJlh(>`=nKp90j+XG`WY9<${c8^Ql6452SQq~v{(ewF1dOW2J3hhtmf5m9H zo#9yUj!*p0aX=B1|PJZ$=Gp+6`-u&X}S z{D-iTh|wXr=ZXVyb>ZWxrj<}|SBq%8%X*(l{?i@{L;7nC{>G-4JB}DEvPhk}k~^ly zQq|pH<6rf{JaLWRjp$QW`qKIA9MNyLZ`hCE)C6O0Wq@j?lS@5Z6h0B`! z9gOe-d^ER8KVIoyk|teQR~Dx+K@L9alT~K{w9U z+qkCAeus8Kn2U7i445?kkJdpjp$(xq&_X+v`meHo`y8F9GjU9M!;Ij)C zChT~K%PEWX+tn&EO^RE@mu#2FZu-r-(Z+x2@Q?OH`Z^Xi3xQYS9U{!vHI?`G+vynm zGkWCGBxIU@-(x|KN$5!L+_1}Dzqun{JKQ+)v9`3i_!`gs*rmhiKJ4NefUSnx%pbZo zY3SX5Smb~yWCo4DqZjdd5`AS1XDM*}y7}Jwu~WIfgJmFa7jWyxr<0Y>jNEfXSG0dT zTeNM%g@Sn{<=KDz8CEHDE86<;s)XN@iWAl@uP|ju$e*oh{HB@H=<`FCvjJVH%p!q&gma10Ox@$C-!J@bx+;@F3el(6|0>?&Exubc`%;y$n%YyT_)Tn` z5k=W)k>zaHYOOqKpB@S*G67mxzu=u7tA-G3bC9MBeY*Dz+sqg7yboa*LG$vfLbJ37 znv7yrt=lA;ydQHwPCuWc9B5Nnxj56Q05Ut(oOARkC&TMx7Cgp!zt7RB{SY%WultLg zOr%T8z2`|?Yo6McfQfF(k5X(ln>{Y0r5bJwtvUT##*iiIZRcJHZtZ+79V!3Y^&7@v z!S~(M+tm^+U)vu!sI{D-0pAqNc`|8Q^U(Ei3TqkcbX5WXo;Z6r-DEa?;M*x2J)W+) zfe1iUm-N3Vm|KlJhU0x$j5UpBsV;r=kkZ#JFI_IXdmA@s_npK1vaYpx#&PnEneM>= z!0jI9y}ACodV9y)=cz@yoeHKpzLG!n)trp&oOY&1W^MB3=FL+(`|BOHpk;+el(fEN z=A(Cy4hbE*wJ=iG>L|W|(vY~&0>RBbj^I#ZF2}E97+GA5nr^g7xl>H6B*9R z_?A)pYK!wLhslqkk#8Z&#?6)g9P0cFe%7}SB=Y9#@M}+s_AcGO@9`n&$(JA z&9dS$i^P}W(~>GCB!c_%x`+`pv{DVU)8HOJb&7y-U-)ZgyNByrF!EDT>gXKnX)qQd zqD+0YH^)h7k)lQN_19I+qBPkVW(%F}qGiBkuY6wLr?P=;{!~-?p{XQcb;p!wiJO)e ztRh8(_AGG}eSa)cQR4<0f{LebEV?{>)@N_CI@+g}>K@!=s`xmOSal_fl(n1Gzgcxj z+O$17zduhy;Z;kcR>fz&$a=r`?wk8c`KqRS4~O-C6lqJEUZEbQsGyprXKw8>)V%6& z`~|G@JKedpTYQ$@3DA@D*VYhvXwAKKhw6f3e3`v2+a`-{q%ZKgk+GmqLo~1MY>%~A zOw0!Z{faLorg`xt;ahc*r%s$V7Bzor%&97*anM2Tm!-+@IIpXm!K%Jp#Xdm=@yFZV zb)eIVXQ0${{G?=wzH!T!s71=)*j#;fn7?s1lO34O)iv`De2RN*u|OAdiZEDjtw-!f ze^$kP&X!-x+|FszRd4(~ah^E3)rIE3b-l^=T5-;L zeiSWkaACjyoE~CL4YX?>tY5f z0bC==JXS~Bot&8tx!%$&*>b&V*S-@KfBDE4?cP;09s6eMA# z;nzm>t!b@Kdp2grWq9|P2wieNJ)DxAShp>}Dau&(XT6E;H61DGRt=%xF$@~g5tb6% z#C}+nMT^~Cz;g5fTlJ5RejHiTy>>Sj{VFOwo&4G1W6QRlkV(4u`g66a{dNlNS4?#> 
zuheP;*}BT5&TO)XJ@0z!obVFvC3YTX{n1eEHhE*=o3C>c+xn#LDn!ikO{y=yS68Eu zIJ-^bC2qEt+ShlDZ{2KW(3)+at&&>%_U__g(S3fx70aFI?{9p}b6@##(xJ1itjb4l z{*k%A*R{NrA3Akd&h8mBcZ(&49Nszd55JU%P*`NSG)BDLKC1C>L>lJ@yyZ;6Y)NLP zmk$2DLLUuS+YdKXw5Bz_EYuAb;frm55;b}%I7su*G{(BnbsxXnkF`F+`W7jI2{XlbX16+RpMUE$o3#9Vs7!6f^4(a2e~hkIOaozi)JQ_5q{T0I{} zlNzg+uD95a^!pu*v?#nMcxUmLYQxoy9P#g7PJ6Cvy{y_38KNk-Ltu&WVXNB>M{h+L z9Q|YS9S`C`;cIP&_6OY3*)Hi12IES=@f@}YDFFJllbPjkk>eRAYqHZs}2 zd@7Bgc0N5Y5$dniFkH~Fre@-Nb_pgq=!btNZrMVJ)bee z^z7GM?s5L8=)?0iPF%1H`sz{&OSvWDG#@LU73Y#$9`oPdl=Hb$+{eev!rqZqF{Kx* zvHcPs`+lE(-d*b@vz6?M4|I&}v)<2McEY4UL3-!FlW$XI+6jsERwsWtab1dEb;)OC zoCk;C?jsYX7ai^!*1ei}+mpDdsMJ*P_oIjDmp*lbTs8ag;e5J#L!>L%O+?=ghS70gUOlfx_Ip&YH z8I9qHe=Vc&|3-maY@?-*`#O;t=Ge_@fd!`g5rzkSuU+_FK6=N4KY71+w&di#Wf&Zn zJ-RFBovC8xs#Gq!-Rz+aMK_c6_Bx*YaE>oOZdd4TOBaKTuW>;~3eoLbiB!BZhnXJj z+>cIT`JX8eLgB{5tgMUL07Ui5oc4!YdcSxkzg6bVw~#|tzgA@$yJ>E}&1V~E{>;cL zPfO*{#}m#~&vZIJe6LD(>8LX~Q}kZyTGMu^H_e?2sei3dzI*)jF5kb{zPfrqV~ezO zZ*<-=&^Y#Iq4Ot7@0s>T-iN3v@vxpz@A@A^So@b-O^c4vL&fM7UB1Bx#bau$(uMr3j+E8}@_v%VsoOZ_ znZV>|?#9KvtM_eXw4;p4@r#Ql!FhL_7 z<3GI5rU?Hj@`QWsoK0)s#1TCph>E`WV{y)OQ;=YwgzRQfzOrPAl~q44W{tn$;vBiAI+(Ee)#v}Sh7hdtKa+%W*1W(G zL_qoA#;Xn3=x{kP&T%!T4nMfpEzb7yz561e533y|HQPUO-#TrHaX?$8*@RsSmPpN| z8*ghf>l(U};J%+N8-q22gD$P}a&oMg-iX=Jhd~%V{bsg3UH>UmIe%q@WLhi@z#v_roIDG4%sA9- zU$3l|VEZ-Dm{~-#I!g}2Jw)Cfd%D$8^xd&&O=DK0_Bo@hn|%g|TIt1w)-@mUFhl;| zCluZ|r}YMbN&a=7$YsdV6B?}d%WKECTue(iz^k=+j?)GLp3z!QAsy8SyYddnnri)? 
zM@$6@4l1u7jXHST$jRPEh7&Xh3f^5TvhPK@z~FtQn8@?tR~EAUahR`#g{`xYEu7_o zK-#FC!UF57Qjdj0L_8#d4?RUk#j|+Ngk`9iyje>@@@#wi{dI1Fd?8@!Qthh4?R8`C z_m_9Q^Ma;6L^gZokB=in57JbNGpaW^d;KPBSk&TrXpt?8Gu^&kWLzfq z^F@gzq%&6fj2i2*b0J5yo-1yO6%*MrJzGMVg081Ivdo3iv<0WEsZefwIaLo1Hw4t2+?6B?K#aR3cIq2yw z`(r40sT0$jy|2F9k{Bni8jzam7U7%ZEz*<@9ZHhDwX$7>t+&6FFK>6=hxYPw$2A)c zsRq=`VB!&HA1Ss-maL`~@P_X?#jlMizNBOl$4k|7TG#$JE;P81UpWvn#m>vMnM-wt zUvBsBEB<{Urby6Q{`M9MICjz~$cVWXmiDinz9Gob%ofx0Wk|+5F)UDWHan%!>MaI$ zk!i&4BPSIwB$7o%jUTo2nBH~hZM*Xe$hll{XuSH-n+tbt-=*+p+csCWn?HM|nCVKL zvC}8oAd%~Np+HPw8+lS&z}1D6LcxkFb=FWQm@RWAR*A|A88!Z9+Q+j*!t$S^ zVD-tDLoQdnKJ2jwJRfT6Yw$E>sggo(J@?Zv&p*-x%I+^&xoiEP_si?+)^bK}*O@bH zTgUrM8&-h0zor4WKlb9qCb)Jm-&fbZ!N}s@TPR%v&i^de@DN}p>4(s>k^Kb^?Vm*) zJ`WFm_1R;Os#B#r3f}9(&qGsT-CCB&&Wm%KZqv9$1SI1wY+=SVI}`KI-AcqH?oO!j z@-Ve|6zV-T?X1sIp*jZ#SIj+cS5;h1FFN(z8iljjdZ?N%HFei@crcy7Ns(rgPkH1Q znXtO=`*ByEtuy>GbLaU7w&CYZ*RWl=_9#nATWaAXe7N#{39vLh*RgcuM)}Ygo*;VN zdmpyyydwe6ya_Ez|28^5{+t!s89Te=ba&&HiY&mF_~Y$M#?)?=BZrENsg0Bs-6e5|Rtzhb4r3CXFS)}H|EeAvB#q7V*uaDg^)jP;T&XH8I zX@jxP=8n7PwMm7ae{k@1Lsk<0Plwh^L!w&>@S0Kg%V)v@uhx~CSvVyS0F4f3Qt0i3;5&?$KCHZU4xYK)zqb#Dw#xh1nw$=?Tx4q^%@D`0hFn=X z)Zh#zK*63wFlM-(U2m2pp}jZs@-^eM?f_&XfEW9GsSObvi1VmUh|v`(ONo>Hhjy?5 z9@9u^W<|BgW&8~{)p&%eo=Z->-6(T~9caC77wXDF+m)OjU-+9HAZyxJd7Q$?HoPD5 zNU4`Wb^9w-{#V9+Nk5z2msr#HVK3AnPz=&4`ISSSkv7`!pW6W;QbZ;h_A@*8c*8HM z$|i`R7wNwL>w$BlfrH=QP<3|eOkgak3|@)~u_SnU=guwgP+OT?LDJ-k>VovT|8k-B z)SW%pgO!X(H@w8?TvJG>i{qn>7BGr4yIL=$OQZQSEn^G$9O_SS8$@^F+M%lk?LECo zPq~sR%x|83lhjWzLF&rTh}*CW+Stf)_rkY0SF7xHbIkw(tRsRWR)zOh1U@U*tN*4>7Uh z^Ns((!yR6x{f&o{?+>FSsh#})-(IpdFx$9(9|k}j%lkci7rhM7$c-R+n;4#@sIT8? 
zUP{lhj=@liNXT$^foA~X%NfYE5#D-?a*Rs<^s1)q{COu*TpCT&7Co!c__OWmY#D~S zMK@Z;y}w&R#_S1e4*V4&TLi_tXmD)_RgGxIfcvLP22DWTTe5?e!^liulf4GB^~}u7jz4yO=(xrn?;E+;z|hyvUN)WA8eI4q{;Vpy1#Z&;L< zSCp>3a@tw_%+xG1uW-F^A9vY1>U?c??(9nD`G4(wcTkjR_a)+o&cLYS49NgE3W|zi z7!W})qBbZXIinI(vP4mmc@#x$5gbJ^0Fn`~$!Q$Pf(SM!4Wa^)VaNgkdv5E5-|laB zYqz#)cdI`Cumn52@AE$4-gD1Ahci0y{z`^|i~sb{-}C#9<~3wDR6Nf%bzlXpTXyM4BEDRYiefY23 zMH@5Pe958_N-VL15HMa2IxUHWp-uSC9~$b*l!_iCrm8L|OOsDXc>IoMK#|KF;9k#` zANcmK#ArVzRuA~QCV&ZVWSw~Vs1+Se*9#8rLph8XihhYj8c+74#Yx14for{_qg2USCUuSa_HNFhQ8ZDmH zXF+=;mH{)|9NNPiaWq>}w-OVw2u`e3o*7`}jq;cG^z@L|j^q)LO=tIdGQ!PEs(P+} zTj$dJK;=`*FG(s2l_s9&Yvhv-efebi+okWjciO)M2I?OlW9OZjB_NRUhAQV^9Vw!F znLSqD@^rY2Qz7GbN3$l|;iYkA=Oy-fKi8pe5+ACG*YEoKgMwxX`m4W(jRKeFHaa#| zXRnjyXcn8wBo+V!Eg9PA-QaJKh1e_D!2NxFM|IP>hYOsFhCsqo$Do#i_))$h`)}G4s|aYaGpNr z_!>JUD(PD(1%z)fkf1K*8JB_S|;Tbp8qa-^UB&aBCUr`xqk zoq_F#?QLJq`ukcHX8BDs<;T*!q5H{f4so8XJ4D2a>;g$ovc2m~%Gn;Y zEwk47OgeSkkXhB>9=T(D^&5BNXoKX1^-KPKmctKtO!+~>TzmKZ0wE#ml)XJ;=&v)1 zpEGOLs+YbM!?mMUSg#?68(&?r7t7e6^*xsv$$fwRs5DQ=sgRHmuAn-aQ}lAX4^@P| z1VL7cf%fXOy5OvADBhq)%+R3>f(>Xokz|kx?(gGfH(#xkS~)rCii7FU=7-1Np!DlRT>#gQU?kc`k;h8s5L*as@HhvtO7$-wU2h zZ4l9To;F#Z;i~A_6Pgr?iO%wY-Zkl0gl=?wf^IYLe4?=}5Lf5itOZLgEI6t!(JylZ z5|-r0%7T^Y1n5q%W4uLHq!1#Hf*(tu&uNb+;|!*_{en2YDP^NBpK(_F3kx1DHpjh& zr4Nilh+PJ;?)N`*b0Cq0fsB4VP8?mdA7t`&UJX~D5nmO+dV#-%v^g&r%Di9il80yi96W3JiP*AP%<$!QvxUb( z@*N!w0`fAt|6W!%&-1R~A52`mp)K(0qVh(MZR#F(^lf*hCdJ2uoF8MwCvFv6&=q3Ddhzk1u{lEFDT0Gpe^2;kk@RfhR*7RR|`~UvQdG5d8 z`0v5_Yd-vY=KOng{9oQW_LX$aPCo^?u_F*&Kb!gnZ7C}Z5<;DP&@~Lp;CKjXkwUe! 
z^ltPNK7!MC0yvopv1R%04lrRQqP8SSwJeagUz4#b{jGgtt{K|Zs~)AoWd9&23z+IOn{pu*9e4yI+$#Ae_7uN$<;JGafTY+au!9qgb5qJNpcn zn>KBt#uai?B$ah=^7nLcfjRC4`CmN4`LX%{mEoo&nS3!yU zLY}B}WH2W|`c9xGII!*Ej{ORn?corcLjPe|j2W7GG(r`ALjR{gidP(5J;=!i*^Tq@ zyuZ(y^cnp5v!B?%N%rnUyE_Txw4GBE~62g-(d=!gzcnf1$Ruu>Gz@LSNGlr(odeB)~Y{5+uUdkX*8&ZtY*FOg^~;3 z+pEr_{nn6X+DW<%f#S+&3RCbpb}R}cZ*u8FN3c6mE7N;4858(E=}v(VZ-56K;O$q?NA24XhF~UW z%-rnNZ^fuW&pI{bK*d5+WVPwltgI}OHYB%(j>f#aP0Gsgf!Fcvy?ImS&_~dAb9=IG z-8$WeJ7-aQ7|HCpLprpM9foE(lA=HdRTbpgg?{|;A-XB_qP11QROVi@M^_Ytk78U} zO69LyLo^KG>%*{A8BLXQdRzZd1sFp$bT_PJ5)G4{Du^I4maBM)5SK*}G-OZc{%))^ zfIPf%!_gu!(r!YeYvW!ISu^UDCGDTt5-M?^Z;y_TuY}J6ZaXb=ZBa0ctYbWT-xj(Y zsNH>ZBxAT$^{8KsNO<2?+}Z4>(wK@C{5vW>#^{JoAzvEmp53%5+puUe^4h{H z5>6u)S=8)|Ce><;u*2{&WC~C~MQ=_R=@rmAM5B+YIP@)8)J?{oHNT2|)qS|j2x|Ps z=o&rRVK!lqVEPh`rj`Szufx{@E!FL?d8yeY;bT?5DcQ1UHhGf(!FX)z*Hga{hG*KK zOS9pIWv}A2i82m5Odcr6m87kblhecgpy>MX64`*Dw^5fmV;ysbYP7RCA4ZYN3_-r5 zWxR5&XEuoD@nqkI4V;+}$RULgG?kjLuE3-nd({)POHqF-wNBr@ZDN!|1d$+DtMcUS zHIjBrIczMU=XoFh2n9~^7ac=?z8h4707gV%cw>xNF)x=YD>r;Jdq^p4g7Ec#hnVD} zcQ}=7avUNX?tLFr(YX7@zUgp**!n{mFrkq$ zo6!33^sE3g9O13~U^ zf6{vl8e?#gM{78ujq~36u^q;q9-VadOe3f^fCYNuP}XiRKnJ)Yt3g`TDTqjCAy3bob$8 z2$b0D=)Hd~U_+8y#@kaj)3_cB%j&iwfrh>@?4@*6p_})zJ5VduM!mi0!jd#%on?_D1#;G%YkI36UD0b( z!LgMTAIIzE^~2n`KcOb>7g ze-VHx8ZOM-fUfPZZlS57si{f2;Q0C{nG8!@%zQYd_x)A~!zN71&9U-U3)x$IB`zYG9`{1PPT+d`@X;X%*y!!m-V_2;9V+*lZV{9w?7ug91#xcI`D_cfe9X)26nn7!*PjMu~ z)WX7o(1I#pOT6^18H?MA{|P3Fy;QHVy15)!YWTLtr98rynf!v_@=l|naT^gEJE=;bZq zP-Z7J1#HksqauTla|h6?908FDkQe5kSyl z--XMkI17q4ofzP>TPsQ%8!gB&#@BIo74Y6zoX(Icbp`UHoqOm9cPJoHBXCkScKjv{ z2NUQ#JcQn66r{K1H#`6F#~)fFJq<$;Q4-w{`3(CQAYUn;G7spx>A;7CYz>v6xTNpr z{sDM4u$gx9hpzEKG*FlAm4Yc02PuK4E3T>=IF*jVboz2{Ev5*8IvPH6@DQSo72m4Z zUXy(>&P&J$>N5EoKMCw#W&*T4JXW<4rlF#pc&G`#{PGO};B=7JEyH;Ot%V@T23Q+d z*S^uF?HmzWx1&Xdi*;(%^74u*3qb~wr_Co71}m0`RiWA7S#c`valc9r9ijl>JyKmPM~#9z{i{- zFDP}Hbk}F-FrPpT4G;E3f~80iBun@|fB#!`wGQr&TKkr@cgz&RW=P^b0-MakRD9y@ z%8`B=w)K1)j@_f{D|X`$A(YJqfK(ZrdJdK5KyN)JQ*y 
z{p65WgY)mRpuTmQ{5Tl~z1g0H$AD+$uPwesyI?yv-oY;nc1}wVqSJu_CTt}7zb!`R zRDg)AyzfbIw4bwOeL?izZ?}gg7VoQ zG6LgngDejRa(X2l1Z=&7c&5_GSSg!>Jg_bzMsi8g*BRikwuZiFVJGJI8+T7v)F`0+ zrcS03%t6F*!t){_;$+c$GzCf;uokh*h6T-ZC^M5{kkLqGy%Q%SMBCzI2BUx4B$eF1UpV>byY{Dcf>_o`S$+5t+ zx~%2<;n{`79Fd<@b*XTGBXoVR!`>76^O13=Y6vW4ynXw&uyb(G4fo45s~uL=Ym@&^&33XOeUYJ$>YsBr0YUfzq!Dh0~26{4EBqx2cck(Sr!ri3rW zp-EZG1tp_sc-tN)tJrKm^l7Mk{$u;h;o?nbeWWDp4l*t488p0 zT7HTI7IRE9-AZgT6eg0+=e%%qNqx{$?D0%RI3Ob5fdGCIEa+0%t-W*S&I`jSyBLL! zBwK&L)whUXDl%$hHYP_pQ1<+;Lz?IJ12`x8_b$-=fufrRhdIcb(F1JE*NCy`wK_yK4oxGpl)IlG!l$KdW1>Y)oO%GDu8vpnZzmQw#DV0xKN|1cj zthibmYoJNhM$ElLh{Q-QTC@c*;aZG;do^10vi5YW!X+haz>c%jDCOW{04+QOldp)Y zk50Geb79ybsSlR5aC`!pXUz+zMu#A)uKDe^-!j9IuV;;vaccHS?D!W%8&-)7zn`gQ zgPf`mTdRp3R0I2mhE_fHIhwmHVZ+&mQPg()FP@BR-(*$ZDPUB6wD=Y{Lg^QCrGAh; zf04gA!{xLW>=oxe$Gux^yRtGk`vq%x#ros@DleAz3YtY6?H-^ww&!Zo2g>7{jEtFs zPX%NJ=NC)0BN(jc(a&)1wve>4wie2G!KyAsa&qtBPj-qTMgwD8Yh*;}JPYFE&yOMu zqjm9guLQf(qbQ1)o>P_!yM6n1U)@`!op5NnT#MC~WETb`sJkJn*3gk1bHhhWW*aOt z*y~eB>*M@F5d-8nhO5q>J&W%`MXFnIm{pyi_e3c;(Z}(o^2u?ihgNh@F{_lvsM^v3TMgCrEi(pC^Vti%0h!a{!4edNbYfm|STD$aBYP5} zFhXKtX3Z)j*B{x>42RZH!K8-pdju34%o9yBn9&&vi|;C39)OMrIVpA7c6N6=up7xz2n#9_K7Cj{KkC3Z`Jd5>ZiX{$T0OOEs@ zz9s#gQHnEnLD-|rArgXbN8EN8JFRL@Q)g!^*2DZVAcd8fx;O6Jxs&LC$ebh?W0mxw zqvt3mW_W}=KOrx+COAaK=mGGeoXRY3jL`$s%wY;uabV(c5tKKet?k+rrrgTeL&+Zk z^~)_#@-u;>gggg_@yM->u8|Ez*cemU>vKtXJ2W(u=~?qHsxJ18kwFCDI)_vx2rogv zErWk#5_jRxT#P+dV0(?sLYis{CQ>O%Joyx5t`h z#qv<-a2UTPl|M<6+awoE9GpLq%nZy8se(>Ie6d+M7=QDj_UfYdy_S%uBN-r+9Az0K z98J~+P;ldFVWLh8WgWMV&*#wrghA87Lqa9BX0Nobk)1EinL4l;w{wk{4mvCjz$n@-LYujI-{ zh^*etq{l*fI2|?dx8Y4iez4F)+zVJ$3#l$`xPaNtWXhg2;0w8`G7$rn9Z zlQ{d~z~NA6HV!irgtRQ%YM^-9gKsmC+>lmzWcY7Yo{z%0Vv32cJY<(_GC%pp_J1Kg={=3S*COWE|QMI+9PD$;Zcaz5vGA zUD@77bGkn;?!wcnass<*z+kCpwn;LSvGL@MxQ^=ao;!l#;ytgl&YVJ@|L)BB3(c+- z9DE7$PIn*x1w}<=xG3aoTJV#ls=8X3Mwm)GB6uAUS#wBMqus9WwM3RDPMqbnFBDG2sMS6weiaL37@0f4Jz1ByW;l=^58TRTtw5OymUZskK^ zI^X_xidTRAHJU;Rs#_%BNOn2_d;o%r^j=CZNzYO;8e9q@o?!!V5Kr?O_*mG9zoG$F 
zS9ncdO4Sn5G=^k~>V*pzwgEFxKE^%{;08d~4#?-x{slZrJXK(hh$cXRO{LCj?it zKcm_at1Z3lSVunQ;U~_ z;+-a$XCR1&boEaRY^y=YaTmZQBs?y2C4&HB+@^K?zTWO`6r^lOhFZ4Tmj=dpuT^;! z6*zhFf2}X?;d!?kEUG8tR;N-IW<)hC`!VRaR9V;RUu{u{DJ>mq+;~&r=)sdGKN}-; zBq-0@+dGt>;ME} z_B}iYZN>Yzx@aK(s=hZL`;2?Usw`CTHmqW|CsU9^;+;5>xrMUYgm{t%rvwH8O1Brs z1Azh8q7S}R3?{IZkCja#A4x6~Fi=cr=t71zLOMbskqH+o$DV{TD z4$2DAz&=lSlL*Oq3`OMr4 zRY2S9)Ypnzmzuz&hDuwg;x27WrQ!e}FPd@$2<{D2F-xm=QBMlY16z(<6_3g9OF*30 z2Do<&5s(}p8}fPR=?T33{ZBCRId&&BwCiz`unm~Cq;ZvGijbW|)?EC@>9-w5@ ziCClDGdDc8Q604|QV69C6ZnB9Ml?0ck6=^;RYQo^gvU=%{(T(m;rQGzsdgj7zKPW9AHEXM;PHE)+CHYY>`gRhv5)Z;0)u)6gFQD(a26NN{sX$ ztbqPA7_KKYTp()7B;P4m{IFq2@Hu$THrX0`_K@Brn<6(clr-uJ1vO1l?bp_vKYxDZ zqD6}=#tF69RDn!A8fub8d{3a2N5vY{TB=Y~L&0bx!{GN?=xl1DJW!UPbV{lnSp^%X zfmM=Wb5BnKP?P1zsR7_u;|)>_;4))x9jT2_H)VUu^W2DDw(hDZJkad z3E$?&c2hW@Xy5_qv5Xs$Qejo8W84%m6n)dSL#mWKU$MbPp# zl7$kKBb1VAR!ud1~o4R`P?0WF1^}zn?QY=JGT@C{H zmZRjhL(FAFRk0p;DY7<#9gr%xO6e$*+9Y|t{UyQ+PBl9EFYg*pF-`CbR0#YXiuNX`PLII&BaGGtVM5ujvRaw zKB3?<_JoX=i0I7>2WGW{%+)sIxZBpa4BImuo`(U^L@E3)ga{xuq-6aa#B9yq)YAWlmsG%@;sT1T$jCf?(Eqk$WRK8M#Or~Is}?V z^D3iXq0$erwo5I@Izz^8rov3=FyavXli5JYAmQ(u#OsAwOq3Xg!Sqqw`w)DATb0gX z`H|2zS^EO-V1|Rv+nRNq6AVNNS#cW*Bh|PDN1;2O4EBxqE|_Rm0_L*Weam-aJm5r=F-N@~?#6xV#XGaA-nc}2qGZq4@Y zjxtW{A4-gXl_N&-D$%2UM@NPSwI*!e?mQ;F5@oepDn6OiUa$fICX@I`s3+Lr>zR^} zi#>bp94Q3XtV*kYKu08AuhCfzIN=eO&klPC7`%Pj#sN64NBYBHB+O-0k>D{r7fNeu z(<&!$ehh(nAei1zTWf#}sfN6@+dBr1QK$r|Wx=KfL)t{ygNE@YY5K@p)V;HxKEqtb z>9{@^QE(y^2a5+>;wcCI<~C*hr~~G1<7@!4nEXV5xhgZNFzpe}BreXYAjMxeR}05F zF$X~eA`e_XN2i8N78URBD0!+ev87rIMY`gEKY~2~HwTUcEJP7>VpIH02Q4b7l%Inn<#x;u zXR%GT_&Td!>Kg*?{_+mbK_9Uck$cRrU!a~v&Jz3$k1*NcQ!e{tFYNDzL)mkvS8Bi#f;=KvSXx4BiD$a9DG5E1ho;+DR zy$3^hk)39mAyFt8iCB22^kzJwa(S=3UYOPUvhk#fn~;v{VFlD8ELg-i1D6{AbvtXG z%rP2RksRrI6_b>J%!B-ggN(h6cb@&W*{069?bL^Awawi+j3-^H9;@WPE@t0ViLBwpdmBGNf66nbgswuXh;ge4; z!h?svrA=6xIO@TuC%t|06Zy$U0d*X9ibKCjShTd)?}(DUuXPWci+JQMA0Rf|K2KN4u-e1E?o4`pw2|4jr3Bs!zQo2YW}fvc_y1 
z7FE8VU@$U{*B)U^|K$w`8*#7zAxi;bm#sF>mTvF3^9r`p=k@$$qwD9aR$JM7r-5&| zeMeGu!Yto;^M?1{2?hg*w5$=46lNBQpV$K=DaFoj*SE7~vC-~8Rx#3)sDf!EQG!Q{ zz0v;ucwOe{3`rLk7ygExU*@=}{l2fa`fjfhwk!o<)NGE-Nser~CKs@iwyc(UeEW3` zPbdu@*9kH??L)TrQZ z-QC>}w$~0NP${|0D{QY0;OSS7jROM$T#Ju7z>esWS$Dt6mC~Z3qK{8(y$~9b**ICn zTyn>D!KPU1!J#=D)qw1QIT6R(hUhGj4v>5wWKl=5Jw6)V?5;anJ`q59X1;K8 zAm>X>ddP+xQk~Ev*f)ka6Of6ghsfURBRM^9Qi`5X4u0?8B0I&IMk)KP$}e6ZT%dI2 zfM3AB)Y|DHH6Qj?-s8+igUxw%YK*z~d^eLN6F-m5=Ji^;aIvtKSr7 zL50Y3a8}H?aarA4oG*p1tc3##qMHt+f~F#x2)nGN$A^e^N3>(HD{)5_7^Qrmnw@`t z%;(QbzYW^9+hKFg%ME*!mb%p@&%ML?!$Rzd{f3C!7_1q6#8_9K3K$@^J%+eaC@a0T;Q+TW zYG8XqRNWTDoEY8c&%S;Q-UH?2wj6(V*B~$*JLJsz-8eO3-o4w;Kyb{QF-JU(IIzV{ zMTo_aNnSy2^6_<`(uJ_FFjS8=T6%%HLCNur;9&hWUn3(UiYM?k`;nAf49vUZ#kXp> zl#q{l+=J5mMq{J&G=&`~qy(eN1q6l&3CQsS95dUOg73Nu`wNI%+Gr_*H`*h@(ZJH( zFg(=P_Y#Zuz|iLibk21|AnsRV5|XIYlfE58h3IF)@DWsYCMSCROu9Fq$}vlH&6ZtP z#L19_y#0JopddpG7;Hr5=Wl^$`3eBDFkY(;^P7w&(QrVDYj~~p!CxX8QDi3`wEooO z1bIP1*uM=ck?i2uZl>S@P##j=)YXt>03GxD>`Z`lgyMM&Eg6{r7*pLd99WfLrbMe@ zEN^Oj^Ac|jFcKXd9p@{$MTN&67(xVGWqCz85^n@k*PN=6pt0P@vs*r>mv>>QCFz#~c9;>TV`Mbra}t>v)s$h+(8)Ci1=3sf>uE3iun1vU2~9ctkbQvvaQ<24 z@vanVsvvJumi?QmghSs@Ws+>U$-EaG#KLbTY4w4x98T*J7y$Fi70r{LM_XamoJi0( z>Qmat5d<2F0G^`UbR5m){d-sHeGj;TyZmqx>miW2T?7kvqQ3WU+kDKK54@mS%$oSf zHn^arQFv#$QBraT)dGOe>!DL~+m+V#_Grqg60`d+qH@;y#8>Gw@_Q7}9%6B{VUb-6 z2{E|ejjxZLHhsonfGDbWMYv0AE8;FStMcqGH+wW?&5PYIAWcc(F9iu$riD9)pVUa-RT@r%^j579_|z(I9+bVHto+ zNrxGz#5t7{X{d4mm|9TAjU|^YTV@9U@r!hNCTLRe=#hv5k4WRhW}ya{QPM;OD!WMF9lL`9nzp9cI2v+91KSRz*StUlWyhlw1pKE{|=Z=d# za@$8@96hSVQ4ud|frbGEqcA@HQYjOfZ}j6ZmsAooE7BhU%AbG!_uCw)<%q|B2`Hai zfgA-%qC@;vJqfXZ)wPIV^m~gx$o{*(^3Q<=c=W$mk^AeNraOqja0G1&!0fM4D-RG$ zIplsYJgPyy*V5j8#AD>Im-Fa;FX+El-nvwf7=Q$!I;1r$MXT}z46xXj=Bvs_WDeLy%?_0scpTE=?AU4q0Wq3W|LO8;ilBn0&6<_^{w(hrnQ$FM_wsb`BuQzT{ z&0p`ocM!>$8WSZu8$my*Xskp!OQjf4av;vba;~lQm$zsoY4OOHpvXnmnha9N=yyO; zkpq%K>{nd`>jve&KAuPDd%peG7ZQO_P*gMuaN%qFABBZ&*j&rFFE8W&9arEoHh6?V z##{pS2tdVRQ~6LPFYLr%%^aL9b?rpY`gQB}^pp$*aLMeeEAsx53_ypA!p^t$H#WGM 
zX%^pVnf~8t%KzHfV$TFX#$cwUd7gA2l#!jf2+=}eCn|^=?{OQ9{Y~);s_nmNkap$P zCwZL3583Sfk4x`23!dAOBZ()ftE&agA@Ce3p@99tj%oH-$q^uLY6 z=ph6oi<+D|iUX@Ud;TuANBQH`ZFv5{lI5^=2duH7W*IC~`X zwiJB$!eIDtd>Akg`DLPGV z)DjN^M-|xrWb8+Ise@kthL6r5%tT2@&Xm+rT@bbo9d+Dj89~7|ypC z5FUviLVfaye51B*2E}rKX7$>R zlDRKp-Mach(a)L)Ru?Z>!h+CE_v@tX%QQl@K;9x|khpjP7{EKoa)ZAK;at$?^AtCMX;x77SBVU*&%*v`CBn#W+jPUN1K=n?<{N6K=n zjbc|ah$9;Rp^Al(S5mh-N;_dHAUU>5DM#3Rt#^AMh0-Vi>;t-i-~iEw4|QrGAj_Yry_@8dzQMSSQ7T-Mc*eI<@l*q^~45*3@ zq5wU#@KM67+WS4l`u0((kx6(L&Z;Y ztpyB3UANe~b?_Suy%gierF$yj_o(K(c#ke~ZSjO^96}Z%c9+XjBnRr=4JT#G{SG+f ziT5DMKyL;863>~Y5_w)T!zq5k^1{WY)P;238~pOLIuZG?!Gna|o(?4-i0-Mej?g)9 z8DlYl36DZ#fkmegrm{7F*rYv>+dr>g0ZbLQ7L>4plPrsjQ8?3f6gl`h$1M;-lggqs4ZLvyD>hugaD0s`**OsM^%3y#Hj}5fv=8Z8c{KTv+{A z_@iJB&Am_T#-UCete*pP9RRnC!0g`BW>%oh)KWpBNndOYIg-&-M7f4_EW1PWI(ta8 z*m5x5GRPQ$v4~6%g7~%8q!0;s9cd!QsUT}z=)H3Pc-VelVf(V4$xLfqMvB2HsXqM6Gu_4;(WG_ zc#wSPJGHR@QA;iFj9v!>B>J5hvrBhwZ4I-#*VQ~Z>4MC}IKFkhv=ZOtX|*Q@vOz`U z-ZU(EiJaY1^5PI^=66P(}PKd_2rWSR)UTHli(z$ z^C@bmQg2kIkX8QGDV_o;oO<%8_TywnO;XXnSrXjiuTHC0D4K7_`wR1$N%+ zdaXDW@!P1Z3b1%a@~s)y?e5vXnSZ)!|9AXpD85k0hMb97ss-$e-2Ibx&}Lgb`YFRW zY3+=HFB8PuYVW``xiVp+A2W#6CD_QV;bO1gQ`xNq3LaP=#B?nW5E-Rn4vfD-P)8R+ z?1f$q1`dDtc82I>0k5G9tcpEguMQF=k<%~fC6K<<^#qko%4m*2Q8D7UrjQYuqG3fS zCFGln4N%?tZX)C<(QVWtA}+p~f%L5!xs@BWNe}3oKE;VC2eWZ_SOH8aVE zTKF~0UKRBn_5pSXZZQ!-BN}n3JnN61PwGJ(rmz$H-J4DNJ+DnqOuCM&I^5+_Ya$~I z`o~U$Cn;02(r?zgw)XZ~yz?8z6hQwcf-Wf`WfdIi`Vzi^?heSAP*pSlria66(0xdv z5_vNV;d!k4SKDgZ2U&M932^vQS3Q$t07Sped%FE1I-LQ1Mu2n!h~)_KK{$`;kx9Ep zm*GRPTK|F9N;ABUiYZj!zT6Qa`yj)Ii?#HVRkhPff8fH@EdrYjFerJ>tCJ%$if>tS zmjwN*IDprg4-A(T43tq;Ki=CDq?zHc?=kj1a;HSN@sNUcpa9r1wo}sv3oSLXBvef2 z&YepwRg#^q#+9PYlUvjoj3nc*ET^rWCaq?E-MO_3PUOCDxYPRKl4ehCMw#km&7M@K zTy@%bQ>zQsjyVl;pTrnQR71dJ?2u_6Z;uEet_Qf%`LEBfmA;F}kBA%S79#Z|)c7tr zn9i&)cs&}DJ|mP)g~FLXW#Ab_pjQbf{|mz+xp$j}15ZA4e|H%Clb3*eW3lzdpsi0e zwe`zZDvZ$NL#yp*ey<1!(X^kU7Z%H6n@XJh)Jt#>`1UkvzU5EMwE z&8xw^)~xjT^H%^ttUWeN(;0-@o~H6p5aB=m{yae5qOf{MND3jzB1RPg7V!KIxM;3S 
zFtG5^L%mZ6gmIMDPkZZ3dzx`kamsXh#aa|J#lXuVK@Bu`VS+p&3O6X<#KJt_F_Iuq zFmHOo^deYv*1)AyH|MxJ(Tlozh|s@OYYiT&3iq*oT*Te%(1T@vpT~4s%iE)Ols0Fr z%J9@~PF3wA_9Cbg%y8V_a`<{Nhc>P{{2j=XlFJ)keVZ}<(NXsBW|MJqr>;?zH21p! z>s9!=O)5VL4$37OP(>R&Vl+D?B95NlIdA%S4qD5oRzY1|$V*7SP@j`AJ2W+iERh$cGRsRk{p zu3pyDK0$d(^|D#d@eLc&cw1lOnK$2!g*q4SqZ5za?*b3RZYLex))$S69NXVn<9IBe7poW%rRU~i#tm!NOV_N(kXdZ0{dX*-O_a13esstY)*0^*E3a74YwY%J;( ziP%oFF$LilPONt9T(7rHOXn?}DEJA`FG{J&Z5&5;tEPH1Kvzc~ zds#9%f9f&6;#7vE!7EaeEJeQ&Ry7jBv^RmB`5U`1fbZu!XV>?RAjW2xPPa+-;65{Ng3dvc z&GiFoG)R)RLnV){N+8qA|J%Eg;;ylj&)jpE?#h&Q9AdgQiTr}Y721gN=0)GUDeC3r z6^%$0z(7c{P5FF=!Rcdf+l^f$9yoS>Mysza_6KG-qBz79J^er->@{4kp2$X5NYS*M z#Mk}4Q+0|j{?x5o-nhNnHH1VLuxZ9RAm}ViH&;^07)O|1_OU@dVoF=XXja}vdo&9CfcqnPIWA%f=0_|1_NC8QNPJ;1t(cNcA>Iim zldbK(%w=lcL#s-;AIaB99zi}xKjPyT{U_t{^=bb#F7&id9fz2O6a!PZ2B+lZA%tes zk}lZ}Psd$yxXXA5`kiF`i`rIa&dA&Al2rZam{|e9y%`__V8jk$q|_Wj zYKlyPk}lM09P=hTVw>JVL4=f9P&XkiG)T#o9NH|AnwNlmg|duCAt&CM)aigx5YLCY zqH0iOx5KeO4N&;TScF~1-L>F=5TOq2L~5gmPbI-T;OqK#6y0B;5i`~SXJxgSXF~+O zW$(>3zjgd7NZc_ZfkSc=Jfr7*PN@q?^MRO(IBJ?Xsi1UXi`&N}-Z)sKMh=I*vnL|6 z%G769K=T~2Z!%^te3Xu3BR?q+dNZglyxtT>DIt-ZfiY3H1JJPzsYhoiXi(~U&->k% z{eK4b|EAWMj879!`@Y17+`Yx7puqiIT|IK7f-a4FRwTnpYacKIRT-O`njT_oh;mI8 zVNl%&LQlyuN}x7m!2Vk3T7v~EC6&-&>O|GPP)r)FsZ}_Wp>`n^>w}0b7S(kUKqQ10 z@gn7_z_Fr{WfIh}*2loS=!!&b{>ka!&vKgxH*GRSj9mj9fj(mbCF&B-Lzh-ke@5TQ zYm#?1d1+&cP=SF`zPK#wNJh7^hC9^t`Zc2-%MaaCL7A2vb62&L}hror87E%$a@xV zrFn&Q<_<|9kooptUNFOfF0@_OR7u)+d7*tr9dMB1Nh;k zFatzIe?kvbljrRJ73@oV;F8|;4U>wcixJv1>BmPyS`&bLVSYt zdt`Z4@z5-E0d>T>BeNv(#6Pqmsbd%**ct>R!pGu>RSe=@4cb3Zgt&(9}g^P~DFVl0t2s}&Sp-@GU#n{c{ndC6$&eCN1NpN>&0Jns7y zUV+QmANm5>KB*KyK*{f9UY_%;Aam{#>=G0zl|qodiraKwHruHBpYb_oUCB`9)@gz~3Gcmlg>J?97I;^j26=1}*a+ReD6{b-_=794!)zww3~oLH>Kk?7p20Z9GuwXs%W9>U~$yAVDwlH>Gdu+D_t^ z3Z_@Ym7Ck!mzU&5XAr^kkOHcX)T5@+&3*B`4?p2E_e*^F%ou~j1bp*uCc#Z@ZEf2) ziO`)wD|s;4PS=*?5DSGzcgH`zK=rM24ph*#aX!AyFvAuG9cIpj9Mq2RIxFsNgPep5 zDWHZuM9_>XG}eOgZJ(+HFprC-5p9D={5C!y1L18l|9IP3L?7Bw!bRf>Ts 
zJ~163PLM>O`uNK*WK!)I>Hr{;mJ6YEgBzZ{0tbMUHlBY0{#PubhLzMWpWBMI;H;iZ zbtb6}5fcT`2QwTkIk#R@Et_YHD$z^x(Ncqg_y zk4_*Ee|8|ECBJu8PeN7?6S2*I_M$T&FdrZ3abN{5&!Dg2iP(y-S(K;7m3cMYeAnya zJP;OtH=I3dmR!;X?n?}|(3Q-m(bKY_rZsVKM~^T6wfYR!k-yx3EYkcBWndi<4jl9# zX850fB8vX0xcmR#e)Q*ktUmSs{MPw@`ci+Hnv*zvj{5o_LuvYGhN{Mngl+pz{TFxB Bl_&rJ literal 0 HcmV?d00001 diff --git a/examples/aot/matmul_optimization_guide/fig/flops_step3_swizzle.png b/examples/aot/matmul_optimization_guide/fig/flops_step3_swizzle.png new file mode 100644 index 0000000000000000000000000000000000000000..a04fbbda1fe19558a7113f4a7cce3a0372e40969 GIT binary patch literal 98623 zcmeFZcRZH;|37@HOGHGG9cfr)MUgElB^elDxGv*7&*Oc(->=tmy^fGmC)Md_*l9>465TNk6IDmkiq{QaZ@#N({n!WcE!@ghNNZbcFED%&C%{0kB5zmtDUpcL18gb zVX6H*wr*~hT#tx|T>S5E2s^u65P50h-i%k-d`aVsD~ZJVnD{TnG571yBnlGgn2M6V zXWWmkSB&&JIjE+cyG#`C(}ePd@}Hi_%yRnTV_>50@YSc#_(Il>iw>tb-P_BIv)uJf zPn%X7t6wyeV*Ccww0O+Q!)Nu zUooC)zjI0d*S8qIZ`xR+@_&7${PX{R=YJOD|JBvF5mF_jsPZ&qY|Qpcl^;h-qJsLl zbG+YNq#Z9_)XC7@+n#I2(cIkpJC|%=Ezb6{uR8L>hf}ZX>UQngMRET8`MiRHYmXmu zeEIrSC&yUw!Gme5N3Yc)I76z}f76nt>sZV!y7DZ&b9LTEJ$MlO_^|Wf_m648VPP9d z(L#pwJ9qB%9F4zXVP%y(%5E>bZ{H@vTvH`|eWq>Ow-1bsQM0hHNZ5W*3V#8JpE2eX6A&6?n(2`(ccMjKB8BCnvxWg)xxLeS`=8y z5>!-dn0v0OeD0#(FMYPsI|!eFmzQ_L+qZ8;4uXv zFRxR1&J0qLlar(F-rcZ$`}VHSpP%TaXr}9>(f^p7bl3fpoxT6Yty{Y#B$)E^^EHkg zyBZ!&7Z(?ID(xhfr>E!GHz}1z8d_Sa8XD9)<$Z&fmOM#;IXMSpWo7$^hV+v9DA?H8 zJeQ_-B`5?$7(P3@OF&@rnKNhdii(1glXu&9=7u-M2p#2(8v7=uqNX-D^8Rt~?b}RM zznWr%sGOahmzI`l-n~1)*U*~6cJO@vhbKo#ftQEsEoZ;I_0DBvj#d$lZCWtfOfKkO zo*T98%r%SJX;ph;^8=~Nyrll*NcmcYKdXA_YhTN+XkRRyttxTui`b|2h-utt{OQxD z7QcRemAceV`q^9gc(hDjUViLr`Qc4lSV=-cLOBu71OE8GOOTUkald{0HtyO+85tQ* zAD?8WuVrH$IgdLOe7SMGTrwWSr_#{Sc;jm3>gu}m`}bhwuim3s205pmot2AUTjD=% z-;tf6d)~<@Vs5nMRY!-x#2S~JH8_G$6A$M?3qtMebVrF<6m zTZ=#JCO+zi%+o@*IiCZ{JG&880eYTPvwk&R6t z`ZRWNPw~a0F@mSvj1A3(N)xIFr8*BDJlI;~U?}Uq%1X*RS4ZRay-js!G9=y2Qs$J8 zBUJ%cI;|sBXiS|)e;A9%jlQhI&b_ZEdrEM=_`md(Uvb`VR;JI_aQCe>H4RN|1eeUE 
zmBmTZ&V!PYs$a|8#V++L&X2Y3Zt?s1rF46&u$eM8>DemZht6E=lJ+;S^;wp#TyS%f zION=Wdu`>{?n8$TbuP%t$%$FNVep-PNhK;NdEi5SyVv3mwlepLo9*rTI;FzhJIctb z>-Jw>h zp1x#lZT-#5*U->#ezJ$<)lE8~Q=>GcrV)Jsf3Wd>)WpkNaWQLYY^0I%p09ZmwcEY7 zbnu?G{L50;Q5BoeYPPxAAD<7p{rrM6$c75g(3-4HH{z$!Z5gY~E3Tm)xwG!>-s22h zTsf`3pFDX&O-)^c4I+l35^ruB^VT{zGE&{wSC%-s2QM`9h>9}c4eKks=KA~l?~E?G zlyY($c6vSjDQj*mDr)zE19UimuZkQxr(bPV=x&DJmtVV(-v1sJ_?@Kd7wC%?)eabv-e0cSECMwN-<3egwQ-`XFMr-jnY1d7ER{H*k zI(CWrt}l;@iZ=$N-j;Oe=x@)^qo$=jdj5Q>aAuByn%bt3mc+y<)z~=Km4H7h+~->o z!j-OX8ZMs-4yNA1DylftVuDp`iWM1}9d5Gyl*znd!-i8;zFy(Ql5QxqtYYVGjFnDL zPuCP*>^X%#AlI?qS+L|cOzjZ{QNcEWv2h-byFF(30*7-@re$Y z6x+I0q-iIfLrGzk^L9(1=-;g#VH@*MKwNy=L`Tjm=jyea&#Kprvszlv7Ee4Br^Lr{ zuA11#$7hB6DdjoMvqRW47~RI+H?D&7h?~aVr7XIwTer@C|43DSWlGK1nEk$pc?7eN z!PrwL{R&pnKHlrR_uft~bed;PDqXO#p+q5n@#2LdZxj#Sb6{XV5o~z5FiDVZvhl-V-<#{C~>L*TY$u+MQ_T8D9ntJZ#HA>Ng2b)U79!5t$ zT^tOKiD5?J-^|Cy_u$bZnv8c$B4@FO6Il;Avih$tUx-iP{1_{9<)_8#U`BG-PSk)p zY`*^S@yzN_yTH-1f%pmHuwes4ViQ+*&TOb&TjY>mojyvJK*>uk2$@tY-WW#Pb2X)$ zYYC?uhb6Tu;>twF0iyUj_f{~*pkcjgYN9=P@+4}_QETfxwY9Z0%>9KWCAZSj_|U^G zY;1y1)QA^HE8M>3ROvaxdy}3kG%;XZy6>bQX}Z%qz;f_)h+w=v0eYUF*D^9T<~?+1 z2kJ0WYAi)TkM^ruTN|)`1K+;s_cu_IsJ8Chubz^L3J3=DjiLY_(bQ=-^6S4$jWl+W)#*mh@+AZH^vP7 z{7G+OVuIU+;I6xL9VfK3Xum!@AS_HzK|vAJO-1ruooVQLeqk#R-}=(f*4u0c)h>F_ zi92o~ZD-zo`0(MlBkr2X?rB<&Z{E4XV7ZI~td8a~_f`0r*SN&jTU7^)@<_NhkJGd5 zKV?3Uu-n#MXt(kB@#C$285tQ%eSg+(;WNs5PAl^F;F^x}oomr5TNbT6WQ-dN$?y<5zC-j?X6_MTP+V?_`|RlWhzWpR!oazkBdRT>Tx+pb8tWWw?doD3YKUw==TS|R77!3Hs`i&VTg!R>!m z=e9<0O7qe@9jULc-*mvZP!XW*L!RX;)0WSGu|K+tXi*Q2YHHGu`te>yWo`o4$oEIZ z?%um6cA=TGRoW~?lX?Hy3Ra-A9XXGqqOQvMEcQQku40UsTU~YqHeu@X`(;)Y$#)pg znQRe*uk`EM$71i^y#^3FP#Z+6OcW!cPSbN88OuK(G&C@9H9b9D_TxbZbb>SFerIf9 zG52wO03*tdjt;>I1wbWKG{I`_b}?~rs9v{|z@Kbr; zJxkOv&#B5^vvMcTRxOWkRQRv4D-QH(CnO{QUvpurP+-4)$}+Ti|45QDm*qPZ*lgYF z*RNHEOqUzO)%p0+lt({z*2#x=Mb;ph!W9=E+yNfS|Iae*K zwOg&Cr5;`Ty-;*A<(!R8%3|s4^t2+7>4)lo0LL#c7}2p=e)>%JQ+P~v2U)(l8B+YV zv2j}t70IgpwvNn|JG`i_L>uz^J+Fk@TuoA%bWK14()vj8lu{S(@}@| 
zm*ZiEBH;UH?U(4GL-hcSL9>9?woFXV(E;#T2_m$#ygYzV9b10cKw zs*`m7{i6pc=?uOye!nHqIJjfN{@MEy9bU}7oqx~Xy_;>Ss;aswyj-8Ko7DxOUnERm zFVtS?^Sf18D4~&}%PH?`G&!`3+!j7rvzpJf#Eo{b38$n{S%%>a{k*4_*UOd`?WL?U z&-dk;m8W|6f4lZbA)t!EnqoH5^R>`k)1h_pPp)et38Qys&TR+b|$^9X2${`cE#Kw2SqVkqmB83lec0czoo40$`B=Pbho2nX3yr zhr@!mv^~GjawF`8G;#IdrVbLu_3z$oL8-0-LQ@?EovFuffC`aVCGDddE4GY}oUQO( zE*NWY3ul)U^_sQLi8u|20t#B*{w_a5dt1mqps_6Xw{jhMI#W#a!1=V&wLl@7P>OII z4`zOT;AkNo?I|>n9p7aWelxMK@UhETp`!v=j+C37EQ{xaLyD)ot6JhUEF;cTKTAoeM$+*}>cSPFl35 z>v(N?&E!7SYqt0Fr-4_uB)dEA;8z$E{eNeSx#yac2m{U-@LsD1j$Sb>dH#HR%9(c} zQd%ndJMy_6$P{mMzcpD?&`SSL;ICevt@vCo@gj^241qXduUcB3DbDR1RrT_JkIFmB zwt71C7?0*XcP2@vFGj<20@ZF^jTs)`FY(Tsp5m}3o0WUi7MKbfevV8f&dQIzYHmdj zz83JH9kvQL`61tM2dw6!@Y&ggEKGF9ruR>Q3tj&CMLOHOdMDAt(AJ;n*k8T~wI^ic z$Q2KddToWZr!x|IYHDiB=MQ~YT(h>ax-L?+q^$1&b}L�D-841mXPv$ufOYTR1p4 zPCP!OrEl=zm!7xx5ik-Fx8?_;mKy{i;3WAC+qRN}K9nBacQ0SdWP=##gvut%x=?0v zG!u3&-aX`U)UZedDRUmqD=>4^|wI=Jby+_GQ zLlfA(WC<++$Mk3q2iO9eQD*ygZ|h zUA+enGfVNt=P@H5Ja^W7v_D4p2R(VP=~Hj-&V%RQ@y}Y>lWm&rlZRK2)bdC6I)2KG zQESklx7e|#sMsYB^!jqTe#YI76)_d`P69L&NIUurfL#N3!2aoWncw85lhvi4OI`UO zUIh(9YMYRKW6D6sF2SIsG_$%qC);W0uVBYM=KQ%JCH=aGlHMn0v=nF(|J?aHbK>J6 z_p^jrlA*_Exr`%8<~htJzy5m|lLRc2~=zvBDoFi?o3>Z zjg4*G`Xo-In=@3wBJfA)wk9$fplO-gW~_gGs@CH?r>{pGKNnD<;}G0WbwCIE}-XR=qH~Cv}k&%(M(t?7TX)7X{eA)@GUtAti znPZl`*v+J{zO)R)kl@B6#T$<+6mO)Bm{feVFjf6$ z5Z7Jn61h`OVrPa?d$df!$l+fXnoX;uzkK;3aU{aUgm>32rQ}Jbi(PpX_^k~j{by&n zcB|d~;LUw=4CjjVh#S9CU)6!=&!0bAHr+qKUUIpS(8-_f-$bIKqSC3~a?71-6^W1z`A=l*iPFfsG@zWLE;p)nId>eOcy%h^&&Rj^8J?HuEqL3? 
zQ2@hq89qHT=Y}9EG`;Rp-16Nn7QhQXuxfjHEGRoe5Y4(Zs2?OE_^08C3!ANfZD>DS zSIVxXrFD7n$C(_xWAD)x+Ii$4vVc@RDL89u%MYk^{m%61~tIC#$CnZbq-0!KApjwFb<1WZBd0dsn_y?IgNmh&Q)= z)2mk-0ZJH0_IN>EB{;tC(!i~t+_YUfC(op?erwXUx6iaKS=-;3KeMn^|FovGit zL{(8#B#DB-Gp|T=DIBI#)&73*pR|_;9giK``wM&DqW(4;`FFAXCmK|Q4*pM1cMX&} zEsd5ve8~3b(8bVueA*k{te;3cc!7S;wvgs=brpTxd@iGkE_!R3yuy-=VaHE44sx26 zxwYzP=?h#~cqPx&W)Q_VkMe&@C|P7%YA zMFOSaXkw$3W|&vYLs||R9Rw<6&~jI~8NNF(L}LmZUc?ww^2(E685NPxGnl*+DW9wk zJ5yIaWzsXZv9|69(J;Q42yq=Ph`I@l&Y)-P4|ImsG;QuUXuf@FXaed+My&o=ZC}02 zG364cuQ%#dO-)U!<_7swnf!2ULUXA8T&LPPJy=I;SqdB;)H_NX7Zz!dif5l9+wRu9 zdUdoNgymOmeUD(S0|X3hZEZ6HEuqVoFOzL9s;3G*gOatT+<7d@#h83oRKW}gGgUiL zxUB2TlcQAGXP&3NrVOA+m?+J0wjK3`Jb$A{H$y+GGbA`&!b&F3GoR~?hlQOp)s9#I zT-p+?6DL%5H!Wygetjg+S3Rx@>H^tDD|(AXCvu8gB2{=9DWgS9UnI?Co9Md{qD`gd@#EW5R4J0~EsJ6&2^v?qPZ&f!mC`*a zkSePlG{_l7x-{I#4EfnWqAlZv`;RW2b|0gj<$wro1z}AG@=xXW06O_Nz^z+1SXo&? z3MU`EMGv`{YISX`HP7;B%q9{P^uIeRmtvJ!L zFS;3?&PiR2=RIDy3*#WKT12U{x~!Mm?C!m z9V=0fz@34{HWN}b7JBH%=jW=%&mbZ3@bD-a-PhQ&d|qoBEuS(Z5~9F^2M>HBQ&y`! 
z?xXa8PF1zGXkzJw(xECIP;l-sdb`2!VNN#TSFGCFDmER^)Kx1pe*E~+KQN#(wq}rP z%2r%lTwAdj5+fmDpVU*92M$RYFfXz{-Rc4W56g=GckR|KTU0Jx65T}484S2*FUxl$ zx2vnGrmxR@?7i>G;zc+|vUS(d(@ zq1=olPZOnferbK;S@rbDdyb-o6%^eY{nuh*PUojegxtKU-PWy>RsC(HXdO)J?d2SVZKX zf~KTK0k4?RyG<< z3_&=6cQW2lu(Pp+ym_NxTIxcnusXd_eyM-sJG3M|u2Xutx(v+BLD*E50F^I1CLdi& zgp5e24pW7@35gIsK^~+R(&te>Xi%>4k%#y0rTmzt-GE+jqX3ltRb!)O@{wx)HDjji zNAiAU9T{VE?*ZCGeU~n^y6S>ZIQA6hoa%EB6BAo*9xyecr>D;=E*5UH`ljv`oeg&r zBB7{N{gyAV%-qZYWFfdR?%WxHx||G4(Wt^RzrN4P>+QRDDo}Ayu5Y2*SmJz=`vmYl z2H(G=Gy{Bz;SfVA+AS^3c5k0nE%vO7)`V|=4F%p_%P3bbEuwp>^Na7=LZ`C*2&h@I zd9^>gb5{5xNr%w;N0pT&yyZ3ipehOOh8~$YQ-)4h53MUS@6aC|Cn+BQgCaJhee6NLdq1ptQLNMHumX`L!BZ*{7{MpF2*QV* z{#ian25(Z8-8+P0Py4cG&J@aEGcnjaq#U zXNFWTx*yDn1G?q2dut^QohSKL-iEmxReYF|vbWgr^R;}dhCw(CcQjMR@SeXHC&Pf{ zlh4(Kk_tq0!0|+yMc#GKzJ24_ zPWb37??r}x{J7m5FEiozr>qb)(X`V0wEiL7p#7#V@;=Pv&6;+GxB2Iq7#Ssj!zx=m zs!Rzy(IdOK@Atw%QiDhQGZtLS*Fa2?R<2efL!)op7dMCz9ilJZ?gPqu1|Y?WV9%$j;g zSP>7_Q*c29|E}+0C&QK;_)FYrr~;uQ5XV_rCOe4`r*aNT+6wXTkN~v=h#fpry0M@q z2mXeNuC8uX&)I5!E^gmx3ZMb1oR7hOeos6T>}wr|9)AudG3&wex7Aj1pnpKJ3>45y z?Z>4oL2xG=$t^o&30d#vy?fhr_4F?PS@$EH4+&-^_?HB@4-cS?QwZ33i=J!4o%o1# zY_R?Fk`Z6{sXl*BJs|zJo$AI7k@&k*ewF4I6)_(WY7sMH1!vk0>sCaN1Ei|kT>(d5 z)@$}304g1y00>wx>u*Ru$j-@W%|3H*e*6Z`g~^@bVV7f4ca#XM0@T-RqzUFMcJ%r(oraQlAulm~=cQy(iK-NR{*bG^5saA zjzUe};z4gV+&=|jRb{uu@r?-+=WcrD8sb{0N!4&3U})_?S8C7k!~9rx#zN0O!9 z`VX4Tn_q&E1Pz~zKk@`6d3t_c<^1`5C{TIW#g_gjd3>5bq7!D_ejt*ac{=MH^gd?#L3JEM;Z{qltqu(F<5ME8-as76D8%+RH1>37NoV-s7#$SBQqhV%+RVFi;wu-ek)-lIpM zl(ajjoA7B*v!!Gm7Zw(V?aZ8T#q9i7GPzC~$oN?_$55H3+&@ee1Br2Qw*iB~)B(N- z`J%?2Z4l&-8r|1SVSj(jbGlz?pcs*iUKo?UC9T#~w0=5{Z~7{{^lWnS6`ufphxAJm17#0^7$^Dwj${S#zXMeO}-B4;ukN+Gz_}rHxmN2D(;hY#K z@(IGhc)+>4a5E15Ti3_ve>yZzI3LX?z1KzrCiTQaUiK%gAkOD1`3;QruJImq^346y zz|i7e7slvPYATWWAUJ*hIhj+4x70y6g6Fzlu<_w}m$vmsIL&}#Mn(3_-6hVEaHC%} zG*A)mjea2t%5aZ^r;6Z&d|^qdY!Ky6~&l_^$OdHCT`v$&kG9sm@@{T%@KDD0?qEb}aIp7vdL_<(|9YiQ6?Jd=+uQA=&va{c zA5%B~5=vE{=&kg5GD7p|zDs{Hzqd7%6;6Z_7)EFzB2vMsIXfopIUj#tr~!pk^kTP! 
zPH1KYr>rLfil#E~jhJ(UQy)R&I zrhbNQ{~9BsT_|Cf#{8FfrF_|tmFTy~h8>eQ;Q@wL3+$r8_K!cjK-m(V;`8s>pQfd4 ze6wzP9C92^%rv2n)Lh*d91@}g!QkIsGEy_f$vNB0q#b`G2)gGJSdHvlFA!t! z|NWiM(yKX6x=uXy`MJj(cVxVxSk+0}V;Dg-FI~CPIo9&|b7|Lj%Bq#V+0?UZx>fwy z6SQA|#b6WGV9z09meG2M=s;~Lw1iJbWQQOR*5ZJ*=9nBtXe|A!8jL3F%_U%aH=~sw zz2j)TEcbD7F{2eBp{_g>;~)JgKGU`DIABcOgk!5b+8m$nyX^9%%>DQft!SNUKGo`z zqqlDSHba|qq$m3maj!fR#`OLh5Xh0PCShxWK z4x+IzZrgSZ0BH!c%P7y{>h#Y0O~X>YOQMK?C5a!7mp3vpvX~jHBNqUWKY?;jxSLQR zBcK8fq0AVSdmK1&<>&r)Y^ky=W|q1_xaM>4LdafU!Lw&@Y3$!r)gdV(*Y?NnReqfwa`r9iHYYD%}<`cP15WU|vEVaD!3vsEnSQSy2C!Ok`mJ zj$4Rsxp}#V7VBjCW^AN^xo*fy@FfV}ME=jJD}fA=Ut;N413#Vlo|c&TwDUCT!A(fm z`b?{Y`xMN$I}bQ(6~FMpePRcy;4QdV1W_}Z!cstBHLAKgg#8qvI}iYpbmaZ-ByEtJ zrNtg+Rf_VwJW;|Xx0lJhYvkmbaN=3oW9nsj35%vD_ z^275B+LqBZL|V*j^c|Z1aXvj zG>p?joN|JOqty&v-zu+&)X;~pYRHj-cJ+nF_?~JY-U68uYn^@SJaQjMp}lNs%2Hg} ze-idL@z=?AWSctI(!3)cKZ2nQ()4Xd{NjP6gF(XN2vqsZ z+uaF=op%x}3rke{=@Tnc}IP4!u{p^lhIf=b{d8${g)f^2X*R=E+ zGIRY9Y?<1=ju{ik6M>F@sHnoc3n)Ibye#_uJ~af#FwFbX8jR9WS! zpJU8SR7@i2;bQLSxF6;C`c%~l{l-mOY6uA`FpyL!c}PxHmSC|oO(^n$EJx*<+!Rs#WT}V z@@e)OvV_Yw4Ky2FiAY-~Vj+<-Oplh4mS!I7I(~ouOQPi3AhVfmQgU)~rRNl4e3rO# zf^={3=}t~^7VM?`@1}_rj{du8PThvA*5G0B-GOQcl{**R+UlxAZxtXtc^Hk1x$I;p zxb@!O@ul$FO*a~#XiF|; ze*n}OC{+%jg8H=b?*_Ek;sVe9U1jQj3ChqNvO9R(z zcOr1o+S*m-E<|J@U~GoA?O=`tO&)+*2w`CzW=7Pdb#U9vdBOekjdNG{(>8oeK{**n zOZ2bB(ov9zco8IY+Esbq_0>}0thD>(7bFE4%dfO!uC$b%D2+95}HG$xvOkvIb9_f8QH}D zmKQ=GH!ADCTBl@2#c=Tmx8N0jEU}Pdj0z_w=X9^n)WExk2T7z1{ZOV&kuhlp2EU576V|Z; zIvu1l46X;7u2(h1H|#Y$G&H2z12xA!u2n^h6le0=Nn92Yw5YRZpvzznk-eCN&sG?X z-TZ?Hi7H`!2w#@FhtMz)mLbk8zt8Nqkmksb=r&IVwpt&aW%hAZQ&%6tN2_%~A1uRSsFu~TJ@E!SO~=T-q$PfO4sBifJCY4A|HWDhDC z5u=$#R)JQomBIpfDFiG({;m~lSs?3mH@gqh_BAHXVCI3Y=V~c{(^B5QwhaM}AI$l?c>Fo^!d((pIXFU5z_{O$+B~Eh|7I@6t856 zWU*U!X*y3uMTH85h1L;?(GQ&~$Krt*!;Xl6ji_cFG6!@DB8etd`QnKc|EBsHRsfmYaI>y)+}gj;eN)cSw;{r&pWYk#TLCBKH1hrg*JET z!v+zkp&@N#E~!f-uw^58x3N3?LC5^MW?O`SY< zG$u8W#uC~uB;WE|V}&w(u~ulj-Ij1J7nT%o{7>r5Fk^Fi6qa5}_SMPa_r9}(rH(e! 
zz`(#b-2Zd=m1m7XiKs>6lWR%G5l>sfy;5ztWh&f&II0SZ((w%m6Q4C}OLIM+^gx&v zVTle2-tzCe<@xWsMa-huJgu0m81mBb7ZGZ?l}jr&xnjE0HVT@8__b5zl9WpyUt$5;lRSwDZYz6J~!9>HMO*u z{rpcuYlv*&5eO1b}9nXNTS@3E`8KI|KI@1ALJ(|LMc z?9d@KRF9xxgl~1c_3L)tg4BVri13pULQxu!LAJx*SCwF~fqDVQJLjI2?NPIh)e!Z` zv>WQQX1u(9&)em?{u;Z6q7J=RWIuw?M6Xo!s30RrvfH?0-^E9>WqT!a-lL#SFU?q4 zmO^o2PkW2L%Sxn2%1BZ+jnTfNxdO+~FlIh#D?GtnR(^P5-z${6i2i;HxsDSuvYp49 zOLL9X>smaJ3P!K}g5v3mD5Mh`0x$ zi7Go9X~=>9XKd>tWEE_HbRgC!u4h0IgwBU)t*s=2P)Z~`SBa|n=u-OPMX=hz-tsG3 z5vXdjHK}Vq8n{@QL>F{Ulbghz(n=YKa%V}n>1yAJo(J)LywLw<4FW0vlXZXx{1Tif zaAJ@y{u2GLIN8pHXM5r%kSk!Hj-f3u8% zh&m(t7e2C~iU18~I;Z#)-R(o5Cp0ejAqdp=@uTFraO+b)c?4L|Mb(qnhP$c*6bL@H zU;BY51&sA}d+xyT(TzQYd#S+tqm(dikePnvBX(&XkbTat=hknxwZ^8xn1XCa4int* z1|XPLmP*yY;C`boH_$+bYLPzjg(2+SP`(hL4Q-sEy|*YN(kM4P-`aHuGZiR9GhpG= z0gZM(p(Yq?QEh@bDA+axVk|N=adty2Jy8>p3|+Rq2wO+iTDYXb}R@2o6{v8zOb(XWY7A z>aM@++i?Ad!MYX%sH97`9g#vAO)#>*M?IdqEj6Eh43XY&RJVPOdx#9 zgrYauS8Xn_5L{An7*@nn53}L99|%XXr#-LooFOyhZ7k4>o%o#}y$K2WQcVxwWqL~t zACdQ~*qA}axY2{zw=QUORXs4rtUwsZ1!yAXX$#(SBXoabhzO9TFpIW5}y$dHn zWzN#u#|J??k-jz4YJan=nh?|-xUQ-;s-MfHMZjs*aod6v{33%L5D^^X4L+w?Om0J{ z9lk$J69SRKvsUBA*Z`_cL<%ZHFKzbgPq2ARH{D6k2G~E6W1__0TzNl{(6N#CQnqR% zT7ksvBcQ^sAStNYRe-&uwib88VAzIX3W3${2aC?qLjobtf}7j^Je*Wc<_{h7#>HK+I>7oB7AGTCGSa6*u9*8O#5fl?*R1}osi9^V%3dWnL z-DZuN;Qv5ay&xdQpC*X7CuXBmY*4MEzdxZqAD`;G_f&dQU`YB0R z$nS9EI2HUk5eOj+Rh3^0m@EUT=v>>_(FVbVxU53k)(ylw5dffTJgqVW9aAPqQJq{PI7rr(eO>{X383b4cE2 zaSJi-0f_kby28BQ$JjujBBX;G!-KXJ5WOpZd5UDCLxi^VL+&B`TPQY4h>0}*-n@A; z^_DHKqXqO-!oObW_2TQ3sKE{{^{*E&|KYPR9@D+Rr1vV*b$(1|3!C^q08vc~yo#83 zA)@uD7NP9?P0EOQ7t_FSbz5g9<`ElQLN9hJNz zcTKdUC1aJa@NmgH?mxpgALDK{YP&*aI3b(4{g-xuy6ugI@jwTMQYm-k5AHCL3w6(v0nd1q6-_9g0frTWJJgp4LXBlHM)P#jtL6V043N5Xb_{B(A)j5 zj-S>a92%MdVIWM*ARw(w!)NsHLtrcHK6p@XN%?eKim-01--n6$DYN%Vrs2~KYEkD3pt(>qIcjsuXzPPixJE&D;;S6^J}6>4&p zH^ow|Vti-}H3g$d6xLK%!BIkizli2@2jjQ2w0g(3UUYB>Lt_YQ>&v$x_Zr=sEUs$Vf+^IKqgF{koirnTWhqTfvnplm{*3fY%>RJ}`UzujYM=FrL|8w&ue 
z_C7fp4v?E^kVAthIt&vLbL&6?ut~2rt|J&mj7KHm2z&h`F#i%{I08rS>gU+L{W``s zELzE`$ctUCtdt|p8PGkhIIHj?j+uFqw0Bq=kzu>)S!{QF2TJ#wB%EtY@LSuiyj?`V zh@B!D>xW++Ev@7Mpq(#KGOuax)7l`GgFIujQDF#ZuXigmVNC!rbN zK+SQ8CZEPgDTpno84*xW{Wbi{NWYiIz0p-XMuN+L=(8f`wN--#h_A_ikRcJE+bMhOvq3H12V>eA)#fKxGhabw8+YNt+}%ICz? zU@B6B$MQL|(t*h>iNL^YlL%%xF`E2qagiA3CS>{W4iQ4jQo@-0tQLoZvvVZot#(OC zby;jY*EqLidxTsl<%PdT8Nui)OCu19=(*R_*wz!z&Uy8hI=yi5&fBO6ko#c$VK6^8 zF}$xqjO4MQCPx775#ykL$Al)NWn_r>1FW_@wU>RMc?00Ks_^jVxaw$0kP*f(iRiyP z%mNOenYAqa{@puKc*Q@6QA8P!8yOjP2@^ft-RIui*@fxkBqhoKBD0m@$02ENeTmBx zaa!V00)_UUH1(^z7wm}*q+KM=3B8}dV|g&4FfPa6>DEM;lwV2@Vvgvk&U6XxE4qq2 zidu?J8VO?&sO#LsoDL+rqV))Sm~EIBBxqvhn$u}{1FRsC#e;0z+}|ChqoYHvaaX!F z`P6)ym(z9^21d3!>FNB0S3x{x1Q7`m;Ys0<3)2e=L0MT@2@dI{Fll9dml$wpRiW~u zQd`>F(_oN9N=7CbTpxC8i|g5tyWg-sh-YXZ9)x^qIP2p=;fF>A~@F8U=-FYG7~ z9!%sF#=VerOV>%>ghz7Fq1yWy9@HirXIxxOm>R9652EemVw ze;gbfq@<*XX*aO>%;sKg;(--@D~EQvz*9PEkUGNw$7@_IDJfB^y0Vp1Dv;2p zjd~%5%fv=s0v&&i>dydd^z`W-^nM~SN0d@@4%)(&xD$HX39>IyzK9M=ES`x89>cK% z#?;Mc&z!3Sx{HVcP24G9e>}+}5DwwLqOcSg1w%$??1xcVF}4ln^f?th)bFECeCs)O zV+-xy2l?!+LAX80%ZpG%P#4BZ)l|BVX=|&4O5VZbH&40YN+89?c~c=WIeDq{D|!;) zG_*+KZj$fboK)m}428<0eTyD-8;vG>k^dnd68 zm;t33)GA;=&>(ct?iwCk=!zN6T}vkU=TviaX|}~cOlOpWj)d8iPe8d(N7?ZWonx;( zc%sg)ul+I~Si~(|FxpOT8vwmjp6;xxqXZ7Un&6;9`I01x-`#u)N#FU2&aEV#dv9T# zZ<6Z_){qIMn?;e1w!5CL-}B<~Zt;T$DM$cCw=jQ#KgHKxMm&FJnHZr#M(md26ta7r#Q zhNVk*L)gCK586h9$)OLp++b`v*pd9sq4D8Jm!vg3Vu+y!%T&Y=Cn*Wu(pXQ)GvZo^ z{8VtT<{b##+YdW^A>H}!!@NK< z@KzEpsAb&^qp-lz|IZg$_nJTwUi^Q3X>LX{|6l)(cm99$o$Y>xfi&9t?(M<}Az7R| zw;@D@{P(lqro}kow8Cd2kuXxu$`(_({?XB48-BpZ_}$=gkt7>Qm=b3RPb$cqvS;}F z{TNMT0|0C~u1JP*^*s95vzAoT_35!h0_ z6R$@6LEWf1@mBbQf4`O6v+QQSrs7n~ejnri{A6(M+5h?w__H#@=!;sZf&2dFzTlq( z_Y#?a|NZsx}gNd;R2s zAY$**iGDf|^#FGn@wHjH4MJ1_*RNf}LzbA^vi6d6Eo?~i}kCf#KQ}~ zQ&EUmJGxZo5Ud1<-f6hyUBPbH|8^oo9Lql~-A)5W&mX8TN@?~$_ui$gj81F79xHQpm zfJhEaUf#QH^Hbdn&1E7CPUK;g&ea9x17Rbuql!74rF;AEkQ*!?d%)ri+#)F{R;1Sj z2Os^Hs2-to_-ntl_EVUlRorG3Q!ez!^H2!c>XR6mOr|l@29#rE 
z%V^fP42ltl)alUL7Oo>qcyp#0LYV~f6x|2Ik>^NKd`ao=P-2lY3BnK+!gIY9Ub|OO z*>5E$CwKc*d(Ej0d6NQzf`YpJ@N5H#&Wk7)r`5ItQF360mAruKo&LSxub}cy_i}tVx5@6)`gk%HJxEqpcQ;KyXcJPMZqql7{C@FyOubG7WZ__B@kc(LUTehU z*#LCFStXuxMeM4j(a!pM2lU^G*V zsXbzH=~91g7;z`R@}2X2Rl)6Ra_qIQgJm>&<+!P@Ff7o)r3VJXG7l~Cwx z<2^7aMFs~aiyInt`eMxb`E^_?=H2wiqlTci<~@J@(f)?RS&F*V4R?veOG$}gdEb|7 z3+O7)H>o(ZK@T3VzOJu_<~O?H*-m*kMtwKmzJ0u6VvM-WdgYN& z!881vA<%M&oUJ&;A#1QRvI%xV5GlerJ$(j zv_M#7q@lp^W5<->3v@?4#sr7`#u_5>3DT-+gUK9aQ6uR)Us8^&QEu%@rf}*QG>&!v!fUa2P?2=rG)J$;0Dj zd3kxGt(m*P-56!YNvGq1NZapc^=LQ0QFTXRBzqgle}0S+TzHS*J&ireq%+dNj+kx--Ft!& zD)PkyS+5Nl`hwl{xV~*6Dvxi`D3&C14!cNi+qR7-Mw5}5dGFIcgDoS*XU@*G!s$=xJ2xUBob^b?n&NwPQ*@likcrO(p0_ zplG`h6i@krr%}KJO%~PcL?1(EqoldbQUYf-T>YLP)z$0QuODw8)4RB#q0~JO0^kFg zE4kgR4bn}GQv&W>DiU1baVByuXS0YQHJAcm$Goz5ZsNcI1#}PnT+=g!*;+LA|LuU{ zTZrCl$Ad>P0&}y3_a8*{hww>9+*sEU$^+TN471m!h?rPB$tJU4>&5GD0)HzOlofV} zKimTI!hVXdYPw0&NP|w#4j+$k8PXx12EKn(A1Y(L8z=3J(O=0UEv~)0{|{ep0*>Vxb`3wqOcg?r zB+Aa1ltLxSOoMsO5Q#EJQXx_zQ=x>22J=`UWQd3m5sJuIAxc7|e_d+t_x=9k_&&!z z8g}(~p8LM;b6DqE>x3gsanpq9E%lAp&b_X-v$0{I`ap;`SMM&%SN#ETfLYWNxjlQ# zM}O|dBmhG&FUUT#-MMMHrZmrbc_pFMbm+bQ-?L)lMe3V+)r0Bp&~Ap*L-Mc^1_xGM zhjn^};R~R1XQl;H6AX>^GqerwtNQ`Ia3Km_iKtkExo!9qMkK8Yc0Mu|J)e zk^@z)($zL`jkok};J|6L)%Ert1AXfhPu&41ivFdtLa=7MJzWMCoKU?O6*?tEuE%vL{<3%HQW7KFTR1!F z55Mt11jAGGT-cerO<+HY1WQN9<>;F~l<~k#5O6ek&uT5p720HN0iY+dRS^3*ykRm7 zx_j?l8n0^4NCn#8z~X!;>)PHlgQ&~F%D2l?43EYAXLf%7C95dXk41(5;fp9R1fw;# z1d!#R`;%JBBO?qBq;K0^y#=?X3fO`2g?^+ef}n{A28@r5O=5yUG;t#HgeL{jqCx5t zR_^rpt8|}-?F`1^r7&`)rK7Wg<{)|8R*D;i*yha(0HU{pnKxu~=3ZD6Y0!y-1&}_9 zAMmHN%qaryw7D+ydcb%!zlko6Oyfkj|VT{nbTuo>RUopW>FerE2_&U+y`%X~cM{(Io5km0&` zcqn7GIko!;5qn|UZN=f*l=!5$(4BDCqXI8pB<6;CB~Rf z++F`<&5)Yj+rgLmw+$zEs%kguJE6S9f5V@0&>*FG{ZR8Z{3fB}^1oN6o^R=G zWm8n759~Ur<{s?}8u~0~9)Uw>BVm}M z!@#ij{7=2@*NTw!MCS3Lhu1F9`H?cFx|)CEMkT$cXJ?$qd|S_xRN#Bg8gOj(balO; zB+xSZ_4v=Zluskv|6P&COL3=$xF}(KGmw$YS7c2_%0>5|Gjh3q>` z`c6Q&uXeWU85nQ@`QfL%a{b!1Z#^nKYW+ZK$VE?eI&h9dR+)9A&D09)3bbP$|?{`z0vqB~KQ5?2mNG(S$f^{*} 
z27d&it$>_(EA*AP{u^!u%mUP}8prTcLFoD&oOF?S*bTGcZ{qi9nX7?(Vj>IgH<)#R z)D-f?7mz5S?vtR<*ti@+HW%Diz{85`|2yN+Y~<>eJ+f`!B05ySp-KRnr_I+12(+$( zOc;>((5=P20NaRJn38~Y@fI)#0Su$!=$Q2)8VunQun&lIN;?+e1d*7P)nFk5vlw9T z?Qi~zMD=8n6GXWf7-)sY3GZAHx01xfe#|~wPd^F@s-wi-yqTJ?e+>mw5`1`OC1vFg zh~I?WKr9jgcACwM3=iMe`2koqS_&5oJu~@Bc#x9kL4Iv((j`m)T>bRrYu>$kx6Sfj zOhAu`gl^j-cn(#<`9h9akguB@s127JP9CcP=VDq!(b{0%%Wx3eE>7cmBh` z&B3v;dnfV&9x~NU3bN3hLncB4#wl>xYRu(Gl~Rf!%+Lg?IPCh%n=1opVI_11Kzz#0zD^t#+bAs5 zf8VI~l^Qgp(e|h?YHFDaSKwN`lvc2JbPPocbO)tp zG2bDH8yN}iMmVex_QORxkX5Y7*~!}D!XO$1V)*D7Y_AOH+Qr>H0GAwD!3zE2lf7`G z0`C+m&qnvXPx>b&!a)%jp68`W&xtKB_swgI{2LilHxqu3tpRlQNCwQ%Rr2rR9L8C| z82ru%FbQq-Xp706mv<=kEufdV85frkwU(Qk@V(h%Q0SBX{VI>a&g9P5uXSzro(UB& z;6Mby&;i&+75JD#i{Ne&V^}mv2D|>r>t{HMMh>XViN%encRsmiembG@_bJxXS$Z#T z09BK+!s-lcjxoq+KMEi<^)B;@6>nrBQhAeDR;-X{tw{MV?LPYXbZUCK2LDPQjdj|! zXcz;f9axsIN)5781dpOKQ^&Gvx5Hlv5ILEqM9<@i7N-pc;Tn~WThAdMMN4|$zYkgJ zcA~qezl+Yw8r-QWHL_OjXYgkDgg0desQKdjw63ZZS4akdOpmm`e0Uy`d#~swRQ5_X-w?HK!!@c zy0J-JH6cD;%*3MSTy?$TAY$R}^FKLIt2I8lO(cL+4Pje`3qCTrlWmpsi-#shm_eIq+u z;VV&NVBS>cQ3g>S$Pde^Qms5vu0Svp2wjAYtt}M=X<}-(Cv@z@+>;25pwl2Og5cv2 zy#vgm_B6|0`Zr>-u#t~8j4q!JdLU>WDR|+#NPIv+FKuLJM7)E~G{wTA*Q}!f^&N07 zGOofrhC)FM0i~&V_e>&!0nypugx!SY5%iw^2v%U<^ATa>!S&34Z=tp}Nka7-!AZ>o zSH8EMAJ4Y{*efjPbU`qo%x;+#jn)X1^+g(gRp@L{U0eGMDOoC3%XTjrAAxR2 zKlT0AtKZG3mVOD;OJa6(U;u-7o<3Yd0yt<1;r!9yNxh7J_N-c|`_N-dKt<-ue1FTs zC@2g)BMoRYHPbHt7D3;D!z<~1CLOd4%z`FEs*9_*8xg^WsHUsB5{`Mi}S|={!Vn7ak*?rmHco+HJg%d{V+4S5iVI2{1)SZlO}i#6>!XA*)|CV+C*-^9zwody!HqRY{)eS9au+p)7_JlY zaPuQv!sLEdUfy_T1Tr^PG#oO$zvkf&0S~6}^SF7Pm}L@6Vw7Vkn2~kDNA|*hcQtay zP3l`g8d-3;4wC5e1mXjPp@qbxpwuG5Afxn6OeET&iakhP+Ng8K7MT9d!Uuwli|Z05 zRQ+Il6v6Zuvk2u;uqyf?mN*W2NhKe{--8nwgOV^=M_5dZ1s`%%4EkcCtZ%AS#nrlW zoV`we5D>a{uFPfF6ydNFljN)G_wwQP%J?ppva@h$OHG#sfUbCYku?pJ!YxZOSd8cC87XIVcpx65E$4m1sR@pA8JCPFL)JKG9mFDNpYQUW02LiE$))&J+u^N`FQ zX2=?ps#lCs2L0|PvH71rt&t+UGs$I)6v4d*4KEqka&B(!;o`lhG63yVfnoXT{(-e< zEP{Oo{$2>Sp-Zp>OKrpAjAtPH!TgGf0>MaUk!#A^kaUuce!?e?z)bywHUJb+^7JH1 z(nvr 
z8-<9saZRgHSrBVT)MFGsm-1vXJOn_=IlRIN^s$#OUj{!~B<}-XHbzUQV*}iz!Cz*N zF@59^2$C$CI`XG{Nlkn+N}qBV!x!Is+e zx6+wLdDJMIRLN<;)EPF}T+ndWXgmJ7X9BH)Dyl)LZP1xk!>%9@OtL%s0-1hhufQ$S zj=a_|WCTJR@z{plOuG39m}-Rg2f};+{)aV@h?{BteW@k9LmN`2b7#&QULeP{4q1bI zB&3@v%vp)n8uUp%%vMMQF}9Gnk%B@>WC$QMhBqQ%BL5a-TM2Hgk%^oFyYKUc0eG=G zesm?4Fq|d&67ot2hFbA5Y?u?UzX6-6tlksLDf}?n!eI^|zWMl8Dk$Pz4HEv||5HKk zH~?1&tqK4pq6-bkr5C{Te44RzA0=^H2@&_<$u4#>k0Lt{%6~7eC4NFn_lv@c?+cb4 z;Q){VIc|sJEAof|%`8W4zx1WS8irWGZjXb{uySQB8go34%rN#g?A0e1O3Da8Gn7w> z8gO7Cf2?0W(BF@t0Tp=3RG6V)H9>TQ!xR3L02&$^B;)k2OM4kuSlWKdkhh7iYErH$ z0B@Vc6L4_|*Rb;4J5y4wV(uA>vwTFv{DT-YHr%ULsJn7Dc+SLHSXPnOBR`O1Q4-!^+oP+lDwIc!LRt2 z4ou<@{52|(5s2BhC#LJZ7Wc#eprNEzRZ+QAe;)UoBt<=u#+?lDs}`a)?vdZ3)U8~< z=1-K>=3c78*I+0CE+m<;}1 z1h$dD)~cRZ!YX4ppSL=HATShmIODG5Q{N~C*L_fK3?z5E|1R~6n@n5mshSQ6d@`K zj^n<+r4I1|!GpZYzap6G$ba?gsmmxJMA(SK762%#Vmxg|#crG6)5v!Mm6np8C&qJq zFp}05zR@rQV&0{KXKfr7Md0Iw?n8it^`(`H>m2`FDW5K2mNjp$Z7DrrxtmFfV10d4L)9kBO5noI)*P{b12mes0yZPU# zZ=@6P-IVeo3~x2HpxoWWiO&AD%AZRkpbX_WvLzeTBcUXj0yiWc37~yHc>MZ3W~^yC z&;Qp;q43~-hz{Hm&rel8I2Xu;IpsuP!KVEb*HqH7f`V-w6cj*3KELG1J0-7*Y>8Df z7oKLQ{rS9l4z5eDgwcoPAHwiucwg>t>IMmw`MhBWc@($XAXSlqj4f<&^-kSDzAwJ>rz&#wwuA2l6%;*E9#MaP7#+=+5LxD-` z$0Xhb{8TWsP#^s_lfO$t^2?RJ(a{i?`;340HJ54~SfNPLa5mPEukUz7RF489jpUPg||9JtjS+ZCv}vrx*36)iDy zqEImA#!@T_Q3}8Xp2*-)>{|jsCLBg2PQAx7U%x8Csi*DBJ8V{RoI{mL`2GA~(f~mx zD~hU~J^PBO9^tSNGbY^J2L-iiHg#azZ3PVwCO{Ml2Camu{pUf|pmL-s27`u&F67@g z#Yt1)jC^)#tdk9s3IYx*&#c1^C(w~9kiowaFFBa#h#=nd@(~HutQQ``n^Lw+n68r! 
zl{iBeP2g?RqJEOJt>++y6aWF}e9V+K)xcYrm5)zC487nwqE%ZeV3#rq`9XuPiE6sJ<0wF1 z#zs!)6v=*vrDAXC?2{j&wOA-8>N+}BQwY|QFgAgTnBoT(T2-Cef)sekcTg8l{P3ah zt&}clh(rggG-JQUQCtMctyccjB=P+F<+!yf`3^|MG!#sr+ur-a0rK2T&uNr$0T^dv z=?bxUG8@@ah9J7c({h+-7H&2UPu-4cgRIWLOyDpVOuzxfe9klYb6)@axM?*_1rMG< zLW!j!hsU71>U2|IS4CNFexcX$WRV5hG>&13Scr2W~HsIevqny z%}v16!NeP~fhbhq?DgFU?9uF_ZH-|(!3JJ);?qR7$SKJ7#IKQGP*Hv?VcJv= zyozj)ueRqNyk*+c~h0iwb=!!Ff?jnM^#xCBW1YhX%@v=)+Ff z;RZ4ea8M$ZL-;*7RHRUJTh<0B;4%70w$5{`dHymerGNJx#~bva|PG z06NVIa7RIDx|En`apA{xL}}b+O22kF<)Ys=&`{f);&hJy`lVhMmwFe42`ZQN)6)Oz z<$^Tm=>4L}bAj~@+aT;9(q6JYLU{5ifwPg`k*^6NdddRj1QiA4)dek~RXzVeNeYD% zrOV#!ibe?FkkJyR|NV&E#FzzS0tXubffA)f;8qo%-PVg&+g z%P%y`SXsdgHvd&rj90#(d*Q@ySb!Eoxs>6uGPq8GFci>XkaboOSZuTe&-_pD(1@p* z%|bb0g<^-|2W09fxD60EKB)HwTY|_{sVHzwCJWMtvo{(&;=BT5781_Mx1tb7TewW8 z=C4N+255$zxNZ~MD7czFgX#doEade7s7=vV(m~}QaQ3BA%@DE{Oc3QL#0`@0g&}35 zC#*@JqW|+1ErDbL>#cvf2@@*YeaawRPo05MQd~?J=Y()VmPHZbFZe~66swY=9@QU# zq*PQsFEfEyoPwNSkL?ZqGV#I8V$YM8n*<i2Ze*=ZZWefT+2CCISJe~Vb$ z*0hl&AS~%zKt+9JFPcC=^B?a#CvWZy`S7P4VCaO$eQ5%`>CRi$^E4o$=2v!bUAh7hlt$*t!c1gq)<>w24Mtd1r!uZCdn7~wcI+|ljXB( z$}K__bi88@J2@V^`hLQo>ec<*sUI9NYI)e%w2h-z(Ka!Dhdt&hoEZw(f%uovo0mS8 zv5AQ_{LX;usX|i%Wa7K|CX#ccSobbYvucLD4>Zg6w$SnB@>!f0_9_0(;r^tpsxIcG zz`l<){WYq?tK~R2&(i*TV zQiB)A4hat6Ms@+f0X6yedVygsJOj*ndd zZ0-@3U(L?XpNKz5I!*MgNb@>u9a_G4YV=e_fRpf}SA@V*q_UfMSD{8)J&uK-WRIfB zx*32DJUMZo1(2QvF_Dt5N)ICU8oZkdR4P`0;M?=AuyF5vfI>-#w5^8;7KXS^ETvN6 zpYb#U*HB4`{`h=Nt;1{i5oL*Go@VF#mCHVJ9Jm)0^qo8of2Tvzw{&-G->#>o$&#c* zYX~I+>IP-1cNn=U!AAw*0uXapeVaD%QY500uUkoxrE|>6ip<;Lxa&}w2+_g^&nEY* zG!R1x_mXfi@V@oG+<0Nb!lZjek4v7sRZ;R*D}a*{b8o_lAs+}wmV}?bJq(MQj){;@ zIgfUd8*0e=Q&k_}THL!XLh4}pEH=F@oGjwOyp|yFilU z*&kzUOZXo~oge?;e|J~nBj)t$vhl?RO{OY!hikAp)!wt=@cylCA1jx-i7nEqeoImE zfy2t+{==~<>X@-?gemTem~p<6?Yj;8FOjvjgphZx_*PiA^wzCx{)O<%6`6qsh83j< zM`%-{F`@Ge$T^6XkH{5vq~nc}mYi5Kc$Nc#0oU@tff{0&Lbep6ZVSLXpVWfrD^suw zgeb7Fbu3pG7Zlh*_d%Pn zXnk}l>*N6?Yc_iOSctwfzzP?|+xrM5r?s{9VeBqOb*Q^3H6OBZvS%m9qFe>lxQ&$Y 
z?b>f>CD&3UaOCdLRDipJ&}hhgXLN9oR#jCM5sq+kKwj02oiWtLlS~jD?bs&@c@p+K z6|JYX-%ZBEyr1wh@HSo1kP~80=52NRW#dpt3_>xG+%6yXq(XeM_oF#@>oRh3>9kIqG(`k{1uUfZ z!wP~rpx^dptM{|>@@BeHH0K^Mtyt0Iejdb1g8o6$qW~%i@=hRZNf@athB_Xj8U?t2 zxT_iPEFJvN$Ss3=fU+FVEwzMF`3GwjHVOPOi825RAgE!n9bSQ?4_TUCcW=CBDT1m|&(az@ooqaCxTl@|y~K`5 z&pb}2-;?FbW`|mJvt~L*%uo9`v1!MWF%;r)3XBgOCc>{=(H3slGM*2P$mXC?gdcE! zq^M70W`Uk#KYY>aZ9H=jY^W&UNk}tU4$)TmlSW^fOc5|pO2$VeWfbA(k2I~@zzd@<`18O1Wg^wQJB zdH@qpvOgKuz1e+mFN6tvrmaeM^8nl~QJnJ87^Cx9>iKm3!NqyEMgIAOpFD10wSV1+wvzj|8z{68BcwWCwO<*!F~6&jwGFbYl{7~?+*FqzwlcI!iQnHry&5RGN3CX-sMkPN+wFD zR*+pGJP8>xr?cT?qf+bX;UVr)rpfF`U98~AT9liz4%ZZ5>M|lJShtG{Q4T1%$Y>At zfo#{2K}Sb$8+^n(n5}Kd?S`or3BU(#ZNHF^V1DPbifso_s@zkbbU;QS-d-t=1hK3q zHs@#!s3S0R$T4AC<4&NpqKr=O9 zu`+M)qxmDnn7iM_78Yi77u-Gx51&d4j1v zphJM1WP37zD>A@E^Q4G#6P)r^Zy(xvvf(JXr4;Q78IoblwwhUH^2q$sw zZ3Z6+t`#&OaPW7WC#?YHKLr|dpmZcudI0mU*1G=#zJdD3F}ud*s~v!8hugE-O)c>TF1s+5~Ghgcub_;|{;U9u?9%5-C0k0E|ZK=cF5Q z2^fmt4oDZ>(lf-wz`Y)V+aZTJT;k~%{S7&oKw`N2`n#GXqgUWO6(P&v337pB<%Z7r z$q)OjEL90_A)10tsIXohGFoy-K(S*-OvfkzN)^!v1X#3Qe@DZ~&1qn|WPKh7)T~4N zW`RsGG$5Ft3EPN69SByqLG301kt_mehI+wI2800c!w4)#kSKz%M*5?)7L!e%tCQp3 z#W3Zw+>V|z>MQ<3UH>6;>Wu>s8!Y(X`J) zqA*!9-&VTR;u-fY3Hi#!jAnf~mG5Ud)~%P$s%pk}-v%B#Sp^QKpaYvT^fS2Nz`qjp z8Xeos`rC5+?mF!Am6NpV<$pc7*Mqk>MHnl?88}U58Jo`{v$B;VMG`PbWZlc z9ETzsz~}uPvnrbofkNUk{F1v)I{x(jyaDwUI4S3_JuekZShBhpqHq2^d)5J?C4Nqj zq9`XSBa|A>%X+|!9OWs4FI2*-#WAhm-!xRd;wlO&owNqD?@A3&G+;u;e*Tz<6r7Vt zk&Yff`wZq9gb@nuM`cS(1g=TCv`A+w|XrnFVg)sE(jt2(P=L! 
zGob|wBbg7E++pO@z5ai0G@!4!fRIjRPvB-KV&N0eY=UB^(n>){wVi2E4hm|t!SG4}LPIUBquh7Rm#a6O zS5c9aj;yHOJTLFFbK!DI-ty~DU%SHGMd|qi#uTq-DeSJlk%D%<;GUaMSiVY-ad$#y zf$H}sok>qJ#7oUj_E|IDj7v@2|0=>LE-vZz<+7(4ib{8PqRj{99g;iN-|r8NL<(xE z<=SOR$CR$o?NJ;M@%DHNx2LlJ(=-+R|Pdake zz3;=+52~JQefOaT=n;`T0@l8^Y?25vkR|8Zrmkv(uP)dm1|$;prd40 zn^E#%r@QPfO^iYz)Q!)(6ps}Y&Trqk!@uj@xYpH;&JDlcNeJ%Yy#HJ!w?N(3G}|dr z<#&6#5%p2Q9{OBao_CWRc5+U!T4AeQg_9}HP2(3IzN8+1{j!sxJw8i$YRbE$GfCsj ziJP%&K<&PH(jHMz*~>#08#YMER^d8=;tnBkVdHAptAE4 zYzjpq;gULZ;F9aVixmTlRl)O?W$O`ZYXO{S)Sp{Keq%OEmYZb#;N;`WW~Y2<&$uD^ z`3)Sq5xfJ_bNWa0d=zVwXX9a?%cU|jEQGv6#Pvi=K$ur=)8WPIxiOFE0KO5oI-n9j zI0;`KH>N#|-}Cfo_>M4Ny{hM%u{x@}eD9lV^9*f2b6J;(iHV#7Z&vLWL|vZdfx;Y| zBeJ8r1w5xz+N}@U(tpOp#7mdp7yUY&508e6A0T9!4`(+onUk6X!SyTG)<8dpF31st&?-0e{ttW!m_K%5$ z>!6bp_B0;Gd@`2$I6tKEqHbUFqmlOacE=WW z@Spr?ak#Vl`?zo5JFZ|`q38Sf=gQrVTJ7bk`t>~dyy@nc*fr4~D^es{UF~dMOb&Xu z{a7fCl&yTPv^zpbV2`drl6Tk5xC+sKDn_lRJe6|9MY>F-I`OF2C%`NjhCWpvr5tHC zVtiK`aY$|m>$K6cTLFs%aHMgh$b|~x8VB9(+n~)U{P_49c8OR>wVVRV{U#=c5ozmf zdp9}WevOgW2XNi?)epmAc*Eh;gQ&bTi~%kZ1u5Bx4X*jVC&Te=jmXpfuqk-klBJxr z{oKK>w{MBwCsjWq=IuNvxWv4Y_F}uoaj&^=_Y7)Z*3})3bC`!Na}Crt6j)oBtWnO~ zeNATMF`P4V9sj_QYJ?bh*Ban3fhH)H&=LT3sDL}icT-`INNQqD^jF(R$)cS6t(lg| z(RTiN{8vxC?Eau0mb-SGVy{w|hlNyJ9_|Pzi{|Fd)cRm)OTTP%?Dr3YW4SxFRaFI? 
zjxh1FjCvHiRpNZhisAdP5xTZPP&4dh;qPtvHg!=idSfz54t?;9St$3gk-BSl%sDQ0 z)Bd0D#-(h(6%I++*wjaE(Cs0J5_H)eqV3LXtCzsA+(xoakYQ-`?AK zj9Lln>Cwy{uSp6~L;y~tP%zj~25d>V1>pAR3=FL8-~9juEg3*yG~5C{VRWx6$cTdA z89ey9*hc3^MK<~nF&^d_58T#2nB8mwNikV};pVmvQ^WRbT^Ls_;Mpc_W{v8ZGU>A@ z=8_%T?oAtNsU}24Z9H|qE)<;O<58{$S&Ja%1}Kg3`7&$*VxT}bA4y=-jc#B2l?fTu zC>up73i)(!AtDt;)7#Q-)xW&9VaF!H@TmGvLFzvzJP#Jj4+K6Q7RohbcrRxX`*iu( zt;&|46ZSlQ@w{CWgkjB-D8DvgCZ$@M{tjA{%`*md0!cE zm!EaF>KuqL-cTfVT~JW>0IR9cS<~CAP8zJL_E~$;fYGO6(Cd2Cr8u^)KYcV#iD(J| zl8ZTh%GWhgEoqM7Sk1In=mx6Xd5dZHw?jZ~d7YH1?Mr>moG8G}a#sCD64{fC2!?Q>Avq*?Gcj z{UN6m-4~MJuM-29Gf&@R22W`D5eC?ovEMc!h++Ibi_XsQ0uaUOZ)^!iLBTI4#{+i5 zsxROR{>6k=Vj2n-{A2Z?`pu-0AdWkL^b{67Q?LFH&xFiVSRy5KGDI{Q>vYu9c(2EY z8P)VtWZ6W!J&RW7t>s9L3L)fyr;h;L)IkCnQ6%%)8I?EKzs^H1C2J*3*A(#8hL-_ac~H|rjB zaCXXO^Wmn$7jViESesOkSCH9k>s;%xpOF|MN`<1&!BeGz489hB$EUB6! zB<%C7s;Z6NuK}Kev}fFhKTZGmp@PIFn`OdnZEw#A^ovMvo;2Y*Y{l{cXq$#ws_}eQ z{VcwxYslvqPgV$8oS%*+G&CA4VkNX*v}B@vg7B0H!pw<%A+cBqJBp9Dd`Bu(r-W zo?B#ywNa-QE{W|KTr^+3dOkiucK&3{#rQMB@!POJq`N$Em+mo89|;rbA~##4tmvoK z^xs`xY>&#;O^M~io3}N-QY)m&h~9UssMPKDrWkjo(k{p1TA3~r#jh`qB*?Cw2A{ot zVNcsv&+0b5>EP{xyK3>Xn78vWcN^y{rNrTuiSN6=LXp4lgX+30Ken`SU+1brR{r92ZgG5mN8)t`YQ4fY&() zIUBPM8VUxSCl9lN)V0%l=8Jj%`GX#E>quSx>eXRV7{IAa5h{%3VJ~855A)7^8T!4a z9QB45tk7i}t_4@ap9)b?6>o$2+>xA)0YG5Zt}=Gbv!$)Atq~CshwpX}Xc~Vf61=(z zjHa7Dz`j`p#m!-$y%-viXPY88i{ye zx#O(V4b>RQwWbm`Dpj)mLZhkuTtjYcdBV1jmG=0lD&}LNBe9<{g_Fdcw?2um_OO0J zKhytOb3y(XowU+xDUrHWY}#`W=ll8m`pMmW{1+j>WBNDLfBI0ezn8bS0Okvr1P1cC zdUtb^j2db2uCQOUnnG*1b3_Bzp%8xmcVXmnPU3?+M+&87y+`!vs(U z+xWm%ipe=lMcn|I4Ia)K4(}6f zsZ=LUjIux7CGeUVtLwqAh5&hUlm9X|=dej=Hgm=?p6mm`i|&j0InYW+o`PS*g3-n# z8YvnIWW~&8D&BzI>gwuP?N);G?)J%DU|R*Xq&D4y*PkX^RM&5SHv6P0m@{O_hn0bn zje|oLU(1zZx}N!F4zHuS`C31JHfB6Yin;OZHTM&dqOQH-{lgsI#`VFk)AD@Ons8L;Of-5y!;E)UiFzjNeP=?ZUy9L_6v=-EPP{J_@60dP<%e zFVe}!_f-l%#QOaWatVJNW^fyx!=;6owLk58hS4P^Jur%ORPU8%jHt;)T6nd zKY3mbHqH%v_)uxvk!I%Ave*G#2)%Nj-AuEO0mwVVi8tUez;Hs|7GoIsRRTO??k6iP z=^yAFc|z$4FJtn1SV#zTQGC{ 
z@WC>4i|3L3mo8~$u28u;;WA5B0|FCN!I0)dUxlksw8a*Y#o^6PY_rC=N`#7oJssC3iv43w z96E478F?DA-z%A!$yq`p&I#0f zE7r@$=S&{3(J1|_q;~-=Q9t@e((n-bL=4+1S|V5PzVh+{KSR)gMybt{DnrKEbBfmV z+vNs0yt1O*W{ck@2b+4j76#)IeB;1yC6lRi{@Xd5-8T$2i3l`3k6CeJKnU9ozP~Yb zChaYjy|0IGB^2(E9A0XBe{MFgrkA&w z>jJ{X$*76z{@Kg1L#iK+$FD}GmuwaV1)1ab#79K%-`0J_vKap>2r5%otUd?Rd{ggM zNS{gS12K@KDfl%TUMsr9#MOgu4{Z+v6+FIy;V9$L#_7L>{$Pog88q0hbSaglXGWE2H{ioyCNl(SflrPHVEB;cKw{Keg*yDL;F zM&!Y9qc-CQvdVq=w>(5{G=S1fMEnoo_1QZGPGWy+E9R;VGv}ZwV7AzwWe}pN37Bbh z(X}D)7FbuUy6*fV7bddm?+q>x<`1w#fnO}%KkST~KGQ4NDhhFIcTN-wHpzO96Gfrm za`IJ?_6*TJs+c`N%rqS7LOv#tS?*nSBmvh9_S5j_;k3S-z-j{P}Te z!-tsF3w#lLdud){+e#H`o2l`=yxiQ3k4zFKW&yvD{dIG1F%JrP_3G8RaSSl%B*fQY zOV;VrGu9)f+P(+B&R59PJH+ch%|6B2!fd~t2L_z#3G~`&zs`L=c&Fja{WOo+>9IWv zKW6q^ml%nSx!1Vu=+1RB2wp;V4x!)Vq__{icp=pM?d9Xih61yK@Pg%0OvM|^FS^?6 z3*>J%BVp|7C!9bid?Y`=6i5!v%={sHuQjhay0flyZf>p;M*J2fM{+Rb+Kwqa(V{>g zgAo-4BMY&~P9z}edF(2xhO&!nn+W&;AD|Bf&GOrhYpSVLAoP4d1VSBE_4cith3}T1 z+zRRrD&&Zjq@GwYrupT`3Um#=B+kP3>?!lr_y6q^qn)B<*=QTzzCVeO8SeZKNGSR~2aB$x6!XVsTq zdRVWQEQ^}Q6GlFT)BDmZu$m{-K&X_3K$;8#3A*CN3oP6S61!DA;cgNc`%L3x)Ox$4 zBl(4ewfVLU)iR)JAwyUz%1rd)=U2zKqki>%>htZ3su%+`3MyiR2P)Iht~YQj07D%a z1yeHm`Sk^}nmQRb_c8{FBmr~MN|4jP3L7_qOQF2oRIsR3?S6wtB8Z=2@um2JLA%*< zrqb|h+d_H_t#}=yQoo&QHs4F~Mq^h%$BL)fk6lvv%g?#vbm2>6f=ovcD`8@-e)8;S zRv|8-)U|#tHyUbcBg?IM`6gFFgTo5)EMC5O^Y~TbO1Xp;C|2o?tPh7W>)Qzs8~5MS zAwxSgY=44h+>aE;Kj7C;(=0FLs`g8JQ?8r=ieo5nN6oW>4q~ zC={Z!>dX@)UeV}UpY`^xmx_IvI&tNrF-lu~3SoR}HT06w3UJw5C2{xGGp zeL8~WRKRaHa0RwINE2tqbvI^ah`ZewHg7}tLr)<~qr~V>LwkVpB6N1Io@*XOORp&b z}`qXZp$8~XmgIZZH+`oC|t2k~^&8vtj z>>K@L*%-2ekm8i|lE8Mxf&mtE3aj3c6^qEv3U>~x-QnZ(1&D!<0{YSrgaJX$A92%8FDHFF0Y;9t)Oxi@$4fyn|YuAc?nPVo- z6iV=oxO1M2_z2qPGe5qQ(xur2*SR$yX^CH-QQ)LOJ2+$nzN`GgQryfa^Sk6CmiQBeqr`?+`19wkV z-#ZZzk*&P%36ES^ckEv=^2EYUAyo0`bwzC5Hv-8xeREu*kdk+8)-3^JTQsz3hXx@<o|a6&fgID-4*p#~e>4o%4SB+1{c)zE-2_wf{q25!($p zGg|SleGkr03H{@;tk5x32Br|$LNt2uPF>ug;L3@dw%(5+l{qG(rYpUu(0#^2om zDuxSH`DGdx-X2*YR7V)SsLVwbStadTH}cB5DV@xK+L`pe%a(<03DU~iZ5|*j5_<6W 
zL$5g9B7H%j4V*Z*HlK&ov^x0bVq>4EbDM0mP1QVAc6aw{W`!7oa52X&^JWq8jCksY z0u!0XWCz{)j+l90`PuEdmFfDC9Oh+fMFs6lUH9b7GTStNg!La+_#7ZwHeOy9Y)~em zK^d~@73xTI*Ay~bS@J;7AwGU&2H^Pip(4~@L=*u-s;G?c!;!go&JK=u=FuRka1J1h z48Y3enqW)s1cAcFapE!H-ZOZM9^Cvz|Q?qZ|9;A8xNt@g4 zeP6Ecrh6Zs>})Jn5`Uv0i?e&3-ehjG#K$MNa!$_6S1!Z<$^77y_^Yt?wRKLZ-#u?y z29y$I*5#h)l1q2}5rqZYqUwezf83h+xj8H1*x~MeT~ahTxUxH^a(6gCgW8MM;BAh} z?LMTt+w~2{#>Oo?yct*cN_ORlbP#4Y`A=Z<8L(y6Z9v|Vn`cDWCqdVf;)+wq;4 ze)X|necZdxZuet;J$kia|It$U*X?rKhPrCS@R;vc`L+I(uSKmZ6lfI{ zZkgOLebu;4cV|gOhDvEsxYLh;{Drd4=NgNrJhP>DkLkPQ$2@C%&m^C{qCeAo=0VL7 z!(E(u^{v0x^0qc1ik#0e(tKZMQ_<(yuJP;bpZJ$YR1qiF%SDATv@?E!aVe{h=@qUzZjrU#bALvxnykR={*uW*Pz#n(j?4-Vn&(F@e zSkKiZV@KCzMc2Lh_{#fB3p?k%aeXGSwQZSgI|oL;chy*BE!tZdJ-BT0;PMaq7NcJe zk7qVc8Kt!E&*<2n)v!nP`pCj+&ZL6*W=F#l({-)K3z%D}S-xrS-J=pWV&;Cgg)!)0 zz0xV2W07Hn2K5EZ&&1Y@xI-9C9a2swuK)Wa8-D$azQ!6Sz7#H{bWab0CW7og^$3vm z0BqZ3T>2=KBqY#(lOs`j!AXdA;+7a2iKh`*9)i4n5qW?@!7d7a3_&#GqivZ()hoMc zw(xlkO3ze?pCWaYaI#p|im!)m=r?ukGK^1)J61II?Rs0Ll=*#?*n+WduZ@djx4BI< zz4d4)*jSKIKUp5x)}}|1{G@Xneb3fMs$~MpY2CHPwR5_~8HxN=N$EZ$E|P#-i3t@- zQsQ&kCX5^Lyp*2CW;Ls>6!Y6 zHq9%ZcYn*6}y9UNgbv=-%y)uI;-WHDf>;ClY z=JvJkOoVG6?)h-^85TEfxJzgvIQ>8!u|5* zDqS@@3L{2w-10AW9M_Rxl6Nf+EC=fgmF#p>z1ydXJg7V@GS~_s73HpndG9;oZao1g$lPo;@f1mx5=qt zpMybHc8t(sr_SE%j3{@Q-K^cbC$3#kGk_)Gn1fsCSUq zF6NZ|;DHA9-obm_>J1|1Ukg1v4BMtX*_n&^n#VFmZ89=lXqFudbX0b1%VBs3}v$kKELrA%Xa5{zCo3J=R;;j$Jk%L5z5j4dV8H%#?B z6c22t;Nl)q6t+Vnq?5WK-euE{Pq7UrPihLUiE~r`JpDlPvEHvs+ahoFlvpvo zZwcw@>>NBn+>xQ9bA_J=ObiFl$^%w}vq252P%6^c5b-n79N*fgsw+I&)fwYYXPNBQ z2mJb`q!l%UtHcAtrP)HTpirWRZmrHZUhiwws}YYv0A&Nn5eor|74(YVPO zc?$W}`iY9zu!{V_eOek^>{8RkHSY(tHr_vR_n>s*r!t;No~w+mc=yKrC&zI2sQAsZ zl+DtA+PlgyAJS>};z;zLJ+=H=J9YmQN9euHYi_hXo|^gUW`0Pf(?~HbLQ(hz{(FXk zcb`N+^u86gO-IlCI4T;*%XH>PUQL41AWV}nqhj>uhi4-#F<~K=E|As=z_PjbE2!Xf zi;LGWYXLXBpnGp8@3EtCqKkS$Qb)nnPzEaxtZt?~Lf|c-C#0f4i(@@l7fFrk8jNVs{QLHpm6To?)ToK;!4UxUu5M#6vc-65K(D1U`t)^ z_St>{;9U1q-tvR8MCifrT4Cd^f>Vjc!Lk!ljrV$b3nX!8_1y9mH-QPht1BqnebMuw 
zc5?0M2VTpczr^*j*T2}+=_#^lteb`Td0JYR=HAtM7sXO7<8?RsT)Soa-QQK+)?wPq zw`Yz$F(+y*SFk%%T{C{^*tX-KwiRk2RWHOk&nXRtsTO{c0mxE?O#B-(WjU zX6OBuoMy+hnBF8_sQ?o2jne?hxhBGM@tCoZ`ir*R;={vZ#c^N1Pl&mu4HrGkw6@s1 zg{hcj(k1uOp@UX4pL*pFg_OKM^jo4{`kH*>`L8IsYnm=}Sf&bmIC*kE9p}}=S{

ic&kOG``R`zG0p{(hTOpK0hVM8EK>98Zts7hQ@U0wT*FnS^2=9+{zA zn?lh06`mCP(*}p~gU8TDFqxZ3AIN%^V>j`kuzVsKQ{_M5Hh}xd%EW}SxG3Xf;{s)? zl&EJ7Ej-cYj?06K`^U_2*yN1%r$wwCng*I$>+IVrYH;TkE=Mw+&TGO4DrXaeWM1|- zm-hc-+gztV`dzZc_b8o)?kMXInMY>6zbAK^2yEko&>nJ4@Sr)3Xt6--8FXsPn_N$y z-g4@(BxmB0EtYSd#u=TJOI8{=G_Kt&Ho@>m&6!?zN^YI}&yRxGBWZRLQ#3YWz5$Ub zT4Dl-{wdu3mkM{T3)`fhoYUMo(qji=#Hw)J{G|3c%eNAgooCx$-@BQm{i5rKy>`dX zPh&k}M%fJ)wS+_q%GQs1?Xq?V_g4NX<~GU0!WOIZ-rv=6%d&H4H)bmB>_OI-S+M^n zHOlR{2lYcaDxNYvxiU)V!Ni{XtCQM4)pR{IN#Ns2pk#TO-5yC?%xYnL^~pWF!AD}k zwJRs3ov}D4`hlY}L1h5PM^jG+jyIhTj*1 z>hr3CD?o82^ee!W8~7k~xeACUfA+*ie49k)^5BFUd_j&o4BWmu4Tp%6+R;2!io5>% zLCe1ffagyajJ2zpCo`~O3DX*Sl;h-T1wOxP`Q<|LYF;Y*WcpFBwxFUa1>Kxh#~Kdz zsiyah_L}(RsQ8*(ow~lE<3;68@4HImn|4gP3gf(izfC}*)#}}#V%I}2TSN8sv3%z*PMj_iz>po?W zDRb7I)8w(!b)`MJdwJrw(giZ@jO-FkWx%BDG(5zOcRzV%6}NMLTavdfNC)vdCQey~iJa5}Qjv7lSd?3UP&7Ztl_A6(ESH z-{g!9n6?uLF58*SPjUazqcrm52tAI~%VFvFe4&?sfm&b0)s{|D)?mz-sQ+_IE-=k~Aqv6w(}(Br=3bDxs1zNg5?- zE=18F%_RzHRGKttrrkg!qSB<22n|A|>3^@i1CrWGl7{Nv8wCB3r7rI^fx~*<5o)u zrDLgv?nVq6KHDm4GuNrq?h0z}cApoy4@lOi<`qs{6HrhfJGqIuxu3Uz<1nxY^|C z(7WWW=$Y+&ZdNA?JJ!b1ijd^u;94_Msle=>D7ydwp6Cy&sRVcl__Z?8e6?(E^!9_T z1Jy~9782FnnOPxRkC!`WWHr61SfRRS_>|P)xyAE$?f&GW_LeK{PPYVwSet?D1?S5Z zm*>N=>`f^jjkhZLLYMjm1m$a@ zfY!S~k`k?;5Q&%@9kx9vL|SSrvJ8#JVe1E7r-M%T<}07%YVEtVpVq4GjxcY|G}cbq zksz>O&g;gNgH4~w*+*JW>fa7T)3-UYDL}l zn?w3tnb7ZVi#!p+0fu&!RC~8z3`bBC^uML1x%y(bK|gZgrcW&Y$$$-y5;mE?r~k0K zdgd*7Tvg(K>v_$)Sm%=C<8eCGScq4M^A6@nBe_iT<{`Z=N^M8v&dShD#S_mvud3E8 zZC&w4!MNQmy=%Zo%-EcMAv!ByxqKOROt20|ZCr!8XF=e8W&MaHzuz%~t|U6Dw)~T( z)h}o59c*$6N*o7YH}0#c5&L6qp}y5LUtV^x-P4wo-McxKqh#ktU5xW&HPH@Z)FH&_ z4SmKn)UFI9wJ~h4jd*pY=XZcJyOHFj0aJ@dMcf??0}@w(FdcMBKN#6^UR3LP_tb3hbT41*ee-KJ=?{v! 
z`tFK#nVAax=kkbf0QwvZBYXhwlp{~MKP-0RCgFKs>vtP9QJKcgm;O3E;J^C&q}bW{ zT3QiCFI{d$J(O}$LIJ|RM8sJ^|0!O)*z51krZsm084E$xCRfQitOdZ zrSqU&TD)Qt3FN#aumJK9Dk|7l#V#Z6)t|nt!8G{-P7PvPOY_NyKmI0B-qA?Bar*n6 z)He*7*=+UfjXavWkC(H?ElFD{I2p5Dg`5mr{i06!|>P~Q0?2!pco4++O{~Sr#dSzgB zRN}j?rkv5C*3Qv7b5>=CW0!a+2H(@N{U5t97Zqj&jx__?I_ol>5GAvzh*HvSY7>(1cuQS+MP9;3=XaV_#U6!k5sV7(ygr**h^p8W7-gF2Xuf$imgvQ|9FeaPDErIX?dnL0eDFq;v7FTd*`p<}h19rcl%(zvm_k%`t%}zLnNx{@}8&H*mk- zO!@gfs(O#JCQ)7zTy`F3*P=C{4BoAWH@^Lf8da--%vR7~Jp~9u(mD+g$?WRE z@W9C;=T1<`8Xq@}xQ913vxC7=r_<4$0V`L~tqy-+-N1R#f3Q5)cg$==$l_dK{NoCn z2VD>3FbCz5oZYx#;=LAwpj9Mnwv`a=FnZl3EK@dYh~oDXzq9?YTt*?2?cGBw>jfp4 zY;4seH}*<9e-!KH)O_{g^z&%F7(@nA;ON<|{roRV#?eDivUY_6Q@kp8d;TFaydn?*|rmH8Ajsy6`R(0Axtw4sMw2k^|vW2v?H1Pr>5%AV-A2FSYP}tV6@s@$M1nzrTgt$5o^7K(^^Mv7l#hM z>`&R&-{q~rmpOaCr7@fV9OrLpK_PlP;&u|QinV`5?fGjHtZgBMQM~J_k51Uw_lj10 z8^6R;RHb{V#US9bF5~TfXA^oEa%Gd_fOZoO^FOkqPXQ0hkHEsECIGDE4>&yb!eg}H zw(te!lCvVT`wFJ`7+TZH0<=q@4pckTFI=!7S#x{2pX$=N?ha$J5vy3aT0cq~8Fs30 zze-KL98LeWTBpDKa&<>=H!-oI=Hb*OsmzdDMoQ-N0brF{FCGydB3fb4Kv<&RI#{F% z-6&7^lLW=o76Cc2blalSoPikP;KFA`x6L(%<&reCgY)Y@+KKKq-?wxwNm0~K=cL@x z5g|T_;#vlCTuUxUjJB2g^SYiALzceZq^J&6;LQKT!-(?vR2r>Wnx%g zP*}J>s>MIyBv1a>G5!c6SvlZDo6@@l9f_%(&ec8+uZxqdr;gkglq5IfGr02do|6UK zUa;50&hYRukDf3Z<%O*e1;)-kYBsk@fLV*!tCZ_We{Vyu)a0+G6NI!=*$g92U|cwT zGA`?J1eQ2{`7IRtehtOOvLLcZD2fc?{kr#nuEUYLpssWAwK{cV_$F%H7O4WM22LU& z__Rdzz$Njr29oQi6Ov``?nLiR`Y90H~8d};{C zMPT!bEf28-P8odYv3}l73|kJ%!_U#vco>$!Z17EI$8=uaee4X6xd_WkimyHo z2F_C9WZ?^55WYa{M|BpJw5>r7PCZmv48^RuJ68O1AIpV9D}F={f;P;EbCK|)A;1k+ zxGm@(pyr!>SxsuxW}v#n0#}*5la~lW1ds{1FUQ5fuN+^3vtF!zEJiSx=Nd>WaLTPL zfC7?xlf@14kz<=SD|NUoY+AL}-^TM{-k75^@0eS&8pgJ?pJ+d!z}xeb%WswmO3oN9 zXjJgXsz-lI>r0{M78HE0-L}&fh<4$LKUKJ&g>!ws1=GcZ*D=AL{3`xL#Xf+>;d1uh2NN{PRd zeKBxVse?s1#(=zJkyjE;I_RxZpkoQ>=UHzsR*C4$lWQos? 
zrOew;2tOB}zW!M^esMOeu{;s*Vb3W2!whP9l$5DY;|9F|-iH0$kN==> zNTueT-`tjEX`v>6<{~g}b^ND*Gyxe|YBwKO|;TVOiw z2BE?pf)%JG)ozFE8CxgFKbSGjv_VflE8S|F9fn4WWsfkUOC--rP|y@Edb?1@5TD~L zF-gf+f)N+5JBRpW#Yl8n>Vzn}<^So>ZO)J~7)jGT{Qu>QcC^u4PpRdd{MYINLPOaN zUB3vWyvbZpZMVH@Z<*naL=}m=UzQ&Og$6hIYFCtm(zq~JMUO+1Ie#(t1#nZc(oxTm ztg_XiwQK9XQ9E9R3cQW9hFZzo&;==u;B{vdIm-_v7k3Bf;!_ zZW+6K-iSrsecNYyV^eatgfP=PvthbjRx%$6@qr=V8OUO?z9&Q)J^>+pal&hNiPH&Y zmXaQ$j^9f8Y+e~ww=#6HT3>%0UJxKdUiRpS@##Eu9R?&kE+XP`_`Wx$=!OM)bv|uu zZxZ67R`vflu7)_21V+gAE>j;hw?YW6C%~dYP$6ydGn)enWbBu+&$;J`Qp^?Eg^mJx=&kV<@#eQYt z=Z}n?KC=isq#IOPI*f?SEe7?q!JxAcaw3IzH&no=5&n+B@*78l7R=$C^yJ;BaoOMI zEKla06Q)1y^!EH)oKdCIe@OhxkpQG!(Yk0?SJlWy2ZG<@3D_$rzPwXn!y+fj^J=-$ z%e-YLX-kfS1AD=;eW2x|SFCJ?)C%Zdxa69V#5aqg@+K5yZx^l0OId!_eS0#Gp_S(T z|9l*A&_M#swJhjA0dV+$Yo;1?xa)dHKt<*EqhMFCwJtx)qxAI%)2j0i7hbuw@i2LI zS}i@xUhQ5+-lkz#5J<&tw&Db9VQpQFdrGcrK)ZYGoiVEHqHMDQq4j5OpU; z3h?-Wjg<-j?F5aDaX1K0%s)|m^;aK@Um14A=HJF!13U{k{b_@rwLI9sMNCP*OQ>1} zpplT6;LAMG8H;n5teSRZShv%ZDtqeK#x#!%j+=)a!zF23C0ey`wlyl{Fs?%s9?GZO z#xEb)IYTeaSU0axJC8f&aIC%eqUqtD&?i5LYH*wJYaVEmr2oW z)r)${EDafj64j&R4L0M{#ntR0V*&8Bsm`BY-G%Q@Fs>%4IZSl%agQZ(tAzb+4!3_B z`Z^?=+;6+?Z$o6Jj)v$mOjlb5hGW)j*yTN3B>v%5Ah-8#pHdCoKN>QPLm%^K>%>fn ze8w8f+!g52I82e0fD&WRlYqKIIy)8S&zxQ26nFocjbq8mEMsRY=PfOSHAMU2IfSt@ zY=D_yD}dV^-D_Az89HbtC_K1CSzR?kd4g%>1JiCR2F(q=*A|=LnXgEU z3IQyvhJ+d9J7oGO1HTx8y5bJ%tt+u!e}10Ep*|n9^*yKtK&%gFGIV2Ev-kStk`j-* zefS@{Vd78;aWpJ`SaI@#^U4qLmO&g)MYd1OpHdF#z1_7(3(^(&|v|GG#sb!eUOx=5IqWS?p*9j@XfQhy8)SKFHxf~+3FzvyXp z&&*DVk9ltLx~2`S2VNCkPdW!H0Q_vj343q~urCo~P$(!}NUaLzM>c-`>(E~iZ(0JF zm;UL0;671ELYu%d@?4!|(epH*qtBm^@T`wahs&k%haOZtZfvsc-#N z^Eu;2ru+*fo4s^sUzaVM3;m-rc!DsHTb)<(KL%VXD42iQU@l*HiTY|Xuh3vw_oy4M}cnhRNVoFX( zD@V%CD&{M5i1JdwX<#u06xQ;+t2{0fgI|oWVCb2m9EXeJmb2v}T?pjJ?!Q}3bvwW< zpbDe}DOfHN_Ge~RS9X5H%c`pR=)9julugdLt>H_$u9T1YC$8r>bxWGGyWBTFP0efK z)p_E?81qCw<;K62oe0-#;%^2|xuva!F1;L+ur|K&{DtX;UXP>k!_=2n`)cb}#*6ey zyEnx1cUTRw78?A%l|wrf@1No?(~G%?-q0A{e&c>+v^oJaF(yhTGRM68kVUspc_~Q~ 
z$r@Wkfi49Bkm9hFgLm9zD385IK;XXymodUl#2F;x46%egZgn-h8b(U^^@QQlNp2I& z?McS`@M|@R{kKC}SYWIiO54QTVpGHhw6)RRGVF@qshp&wGiYl|rVDDy~dsOYpaYkPWZXzo|}T zsF#U5w`)%JBiAe&dY3HG!h{R9VZD3XSgW++_2D%Gk%M2a5P@QP&FQ)y@ zreETkU6r2Q@!0WPq*=OnUjN~ma{pr5hX3?4sG27mGv~(=(zJ{0%|awox=;h0+&GGR*M8Ax$TI;DBbd znudmtO8iBz?tubwC@;;N94j09bS7DO;;5Z$`K#a4uh-IfF}sCx5JQMOhn)f26cu5J z^m1p$wXkq(^yAf!`Qq>`eNW;T}8Kz*$xxq z{KZS-V6)12qomW$_$+h2%OK~AErTw7pTx%Cqs3WBxfKzC6YKbW)7pZw=H7keeFbOF z54_wSm>OKsuC@muyRhpCUI#l-r@>T!%@pJPGz`+fY7KdrHnmZa|IqKt0^(w$ZpXzL zRn8~-ux>r#t^JT|BecZS7?q8FM4FVcJP+x7;=?kDSIqRa>Ta0BG1^<5 zZG9_b3?Zy_wD$;zJ$U(dz|5Ur4{{Zw^Yn36G#(VA%-!c+y3}u#>nlE0@4*JDJ!guL zIjrGeWxeCU+c}glRUyQ;9Z##zDz1IqFKIt=lCOArpLNdB4MQWZj#wOhGp1&C^kYVD z;s13zRDE-fv)HF?*rqle%P(az7Y?&?G423jw?Cr#cC!x3sd;D^c^{Q^+*$T%)M_qQ zQQr)2{>S<|#sBB8OTyh1rErlP7eL*NpzjTl8-k{2{5otY=G`c1va+&L?^oHj8J&!AAp{KfH9I_!Lc>)WXJy$am*890!4eR`@5Ys=rzoe2K z57(I8`bCR0>}86)AYP8~YL@wn6|mk@ayoqvsvv*2F*1xPH$D5Fcaw$DtJp)D;sY*b z9mEdGv0q2(_s^e|aH3yZw@-inmleemBc1c9ovIqL%?hcQR+AbF$gUNzBEn28dmy7f^Q-@G z${YOAg`bDpH+(BuX7frZtz`T8AkS;2hsX;nlCMj7X^Qv+U3fm@$r|cTz99N*WUj!o zksU*b<*f+0mBfwud`_YG_j{{k1n5?{;P(@u2!6v5HUu6;;&p=5+c_w$b^_ZD{LxYK z6Na)`)-BRfwrIP7^^xyhxGL`Y^v0J0AGQ3NIW=q9j%~O06|psVP0vYBE_MN18TFzs z;ACKb#7iH8jNr9gSghIGpS<#-^6VBeyZZehS|pc!YIW{84AyDOLq5`a79Gw4rlCXN z#1gU{&G3N=Hgp!qXYSR7j~}lP{nC0znNdAfkq1T9Qyf%|(BxO<5ljq1{?!^+unm)Z3($y65H@~W6Y+J@^}WX0^Of4q z@|+s1#LCopzS6(bmf3Ed;VBX(eD6olybnnsAt9C37-~tJvok~-n!DTMC7|p(_BU4G zH75|l{iLLLSZ$H|lsNV>G5OA!utbRo!$mR#`+%1SAeJOnV*@BZBqap47@IqI*M=%U zJ`2tA8X~>ijqiiiu;S>zI+M?sGN?CQwILWOwdjR$Qg;xs|0Lf?7~xGYzG2HRpK}3~ z46)YI6nTE0&v~qBjeBWNYDJXJhDc*~qvHYnI*ezxjQ$~g@IAka9MMT|-|{c@n1~pg z#A!Lzg+$E_ z8#~l3X1|I?(i>yYeB|xBFE_utzn^^)59Q?JdwPi~zFQ{jx`F&Ie*fTci}9CobAmJDNKMmvYFf#f()>4h{wG<<9xCtWm$D`M1Uck*L-rw&lf7 zZ_>7;`coCwyf+dJZj-u^$)e0}!<+%)L12 z$#9m|i4$gWQqrmD;!*)H#I4tU`c`%&O;K(QWbI&&tp;?xF;tQEP3s7gR|Pp(l_x6? 
z<1G9V7M<(WPOU4EV|g!JdW370Q%Wrj>_h{;1vGu8rx;n#{`Z;7hfFn?4u*yk;dt-9zTxx&nMy)I~HB6fr2_J@2i4#_9K%R)z#Hp+vZ4O zJBN{O^hafKPMPf|oqD2O*%T)c!K0brRf{^%vhT%flaGH!y(C6_mC_DP>&!baj}6Mow!(|9*;8GTX+96H$>(>*K>4i#V0TSGpg!{#RY9iU7y`qJaf62RlAbP^DV!=!> zW^uFyg4c1aTF$z8DUxb@RTHH5D*^Xa2^!Ss(rISTIv~!d^j)8cce};Ym>aYf*Y6sb zrGlhJ_@8*mn?h#^2D=PJZ?qs;=|BR~yvjgsNc%Tg3P`vkID~zUBxk{6Kwt487&i-+ zEb%ERDQR%1L2YX_c%V4{(<;UNhyA}p^V=+)x>D(jKCkLLiQ_aBf+JG9f9}<~<8WNS zgB7R(p`$v}`00JMn#JOM%oV8QR%hmqy-0+vwox>b8W_;c>wq&@<_yZF*9xYHB zppF-;0!Kgv_zeKiO?k0pLA5v<04aI2B?x2Vj%2i>Eued62T<*fzbEnxG&n1MIfh7Y zipu`F;`a#;i&uvA)e8ZT`{)}kQSsPDyDUFHtH;Z>`67gvIQXEW!CS}xSF8{Pq4y+fiJw_=1UYi;_yY=!Y?yFnc+W!5pu1p$7>9@BO z*&}}1CG%-73J;!H@>Oih_cbakz#)>s!1ML?X^vLN=S6v&`52EK{fFd=ELy#K^#W$* zo!9~pn3RRYVM2<670WJ|d20EUH(cZ4F6bBKT(xksu4AgH4J+T|{WXS5hNNE?(Q1c# z#RW`o_aOb?`0oeJj$bY|o0|$%`#BxEqjr2rDScYyHL!A3@OZ-rb z@4M~SaMJSJ*ypB2(BMMF<+-xMhdw0`yvKYevwJWHZy@#d!drLW89#eyH8#}bk3Z<; zwDsF$!8Voisk7&O@HL#WR-LUA)LW_Hfus*vAsFRU`VwQwm*{f8s#U)l{Ztgp?S!BQUh+`HtW6k?Jk9|x2 zbNHd&0xn0Pz^;V|Y%pCGH_Lu<>f@*7#JeE?uO{dF!N+gd#{OW$iIQ!=D7~^xiG{7P zv8llBGVNBz`eOr2BKYC|X!&a7;kaQmB+)kn+&3#dF=?CU%7*!54)S;Oc=8Lukg=30 z6j0_!R40-tSQCDXNBR<~`RxsSkvFf@=bxZv-Kd`b{XlBLhY6>fcTSCp}`zradkl%r~g&!G?Q4m{FocL)2!jL(r`6v+6WdbSs zuhuKUNk)NZ*O9oVp^a@9on!iK$6PSxe6RbT=b8YP``JB@Y=RnC!Qy8y@)IDKJ`8QCoh+s+Q^ zlrH578htnE4drq3yWCi=2o}pFyc@6l=L!Mz6POZCI+b~VX6~R95bBq}@rhchDtrR9 z)bXz_rd`JZ3*Rp%Ydo08h#^X(n8^I0nOW9DvSiW0EHl?p0QdzM!Ntt-r^&Iw#PH$H z=>r9moV1NWYaY>dlG7)k6HFp3bm2~WPP%lRqWw5zsG7HpnV9hi;4l}%NRLE-o zr@UPie%xBh6r;}KpXGOE6grmZDrEA^5!n(w{erfdw1TmKIo_iuIMY+bA{8rdZ*pyP zhjcC%%d>4O(}vr#BtMUQE8FxtTz~A*v6UTH=nqP#0#5M8NCq&mK6T31@$0*a+70=- z6X1hIxqXPJP%7PmZ!zimo8NP*8!P6J`}Q?)E6#Y?q?+j0|H(XTA4A*w1RbDaJFE*# zK5DyHDR)!q;YAqCKo9$bUv^)mle|@vw869gkA|hePQXE9l>lXLAe`cRn(3 zK)HW%s11*7Pid`uhOnnN+)6fU`_R6CnF!{z9I@)cGN6!XW-Av`j6f?2{YcB5*$qn` zw0^MA_07%AZ6+l(aYAV9e5l3>7nsdQT9b5|x|!zM0i25=u8{TiaSHp2m7iykRc0=G@~Dmh;b-G4;}Z+isoZ*tBLwy=Htd zG%guxew6k)(AOnLiaeR%aqLVYVWd}pRz?}h|6$l!soj3*V6Qnl9^WJ7HSJD@D+r_7 
zq%`&O5S;TlUDHZ(@9AWm1iSq0IQ>hzic$_;v4=UF`^?lRcuEc!{;xTq?+{{a0{Q4Tf(P$PMr3>gs;oN%gzJWzaR4H$veIOf$Fx)yYgxI649n$<8N=~joUOCUM z!d>sj;|ub4J*O#!{aomq$^MJ;j6_E9)sUc6KmDXRObxGm&u#4OOm@SkES)y_oPWV6~?8 zO_6q|_N_2j6Y(sUIL0Z1Q1?s9x>@qeGLefz&5eR9*M*+s^4hYV9^EZch76Vf!y>x-Le*=jHEJQd8{^!x0Nb0+iuUI>Tg&Yg2`Ej|!(W!XT5pL`_U zqLqSazx(4}=|+WZc=U{=tC|;!O|?r-1^)Tpo@dW_w$olLE%9mLgT3!C=*v3&Mpz0a zZE_$K3mB_Vdnc?N;-Wqg*J-U3TC>0k_yu`ZHScKt)F!ygdMjdI2^zHc~L$Stuh?3Z+MN zx+3F9R>nIYU+>M7|N4+PrIo~B54rM&p5rII6YiY&*j+Mra;VIUDt{+rO_gK1jo&8v zb22%b!Y&{kjwx0JsBjdLBR4iazWHOM`m0yh_VxAo zBE85I6cpliaYu8#-#J4cec$0@89OA*s^KCLvDkvy>SV)9`V)Tsgr{6b9D8K(#6S`0 zbv-AC_u?>A=@0YkoiH`Ns&>eK_)+YyJK8@yS!maM&+p~?_hjZT_HN4vYsZuJb*D`x zPvzw125N%tjM_anVTe=DgO-PDlnCi)EcGeoIO-9{$4@|5f6xKEGwAvf@RX&=_o1?=t|m^&{YbrgCA zN+b@;yAXV6Cuy6Yys#Jjn8i(aKmgl?X1QR=s3P<~1|QfD(UdjQWmMc1RQ z&5aS(JXB}Pde@=ROgYO~`ex~=o|lpgyDY1W5})$&9XO2{v3b{TQv$gk=xZhV$3+I+ z;Bk^{SOBcbJ9l16c`dx?{LlwwV%#>X!)yGqJVqs{`kRLwwVC>ox##h_2uV6cOj_uA zOo}8b=5p`}sKnR`xuX(CRxxF4tHfzjSpZe^B$`*1=fSjRSQlB#T#lk`uetf}Ms% zX|$6Li+tc(D8#wQpkUQHv!NJxUmtV9IwoSM3ty;_6+pcs|qK6dn{a~ zA*(9-ttqN?5$zctUMjqia^ms1{qAkxZuIdNovP=DDW*STFo9kHo1 zNDW#w*++!-?g|bI-YOqAt>UqsBjQjwtu@A|yC6C?7q)KR_33!>kjw1M`qO+~iJA0g zJ+XlvL*}8+{yoXKS`qcp!A>ShBQQ~s!(Gpt6O)pN;2sAk{==>4ED0?Mv*P(Dmlt-o zzjPT4OZ2k=05{)l^^3Ueo%hUVe5&q$fB^BDeUq!Mpkx9GT%X z$7u>*i8LHnstMUO9&I3l9Cfq1nw1p4`P1CZA79tg-gAzC+Eg<*;rqbVv#e~!A*cwx zG!P8VRi+G0p3HD}cV7x;HRYvf0&VNg)$Y8F`n;H(pE7?5-&J&@`=hjm7hJl~GGV(j zseXFYF5!l;W&NLJr|5(5lNWo;pRui68Hh3u_%I_Gm?8(BMM5G&aP)4|BJQ&9sFf75 zY>dmQ&g4o=ewstuh(zMlk5Du)_@Yfd|9^E9|H&} z_|-_A|2u+69=qP*8gR(?VC7gzWNJL=PU#dUExoRSXbN#PAOvwDkCjEQ0meLYjy+1HCwNOCHDCP58NcxyZ`9b@v%HseXuQOo_XkR(6hSnLZao?~51x3)TI1WZ6` z5WHqDUfay5<)Vt`js?Iz&jdTr-&aJ`AB!Dsjg$+?$;o~EPOvi^wg(&NM6(^?QItP> zmb$zpIKe+Cs2X`lR7k1bQ~3qdY}&sp*j?Fccq4jD|G0e1Q0Hdy;aYnt>}43rnO3=9 z$nW&zrD!H;tR%xR>ie9d@o#$Gq(|O;XD+cyER@0jkK7e+$A-}*^tbv4)hwR)*RBDM zfPvOtB%Wc;KC;pQNzvXy*TL)?f8yEG-A$o@YF7n&tCtYaz$;jw3X_m4swcNhte_^E 
zbxoMg_}P1`T1R6%&c3K=_HF7EBh1o~&d5g0d?2;$`tU(BZKQ0b=Y)aR4biGKTi@;Y z+HYH>5lY+tM6cGGQ{=4PE#C3_b~QQ;k)bG%)vg8MkZ-`pa{Wd!%mEBUbyA*~=_XU>(SfnlQQ++HmYl$QUv*V#pHVj_a@o`N4c)ZsQrBr{Zn z!5>q*MYqSgIoKb!+w1;5{d(3L`mVjmG<(h&;6h`tDyL#Z99J=d9`^}$Z~T4CaF)VX z^8=PF(!ZFHi$%GhpIdO$a*t4GZKhGh9MYcPwF z$7fgZK4w0-qw2JgO621>upQ%UN*3QXq)_19rK@;nA%>|519bRQG^RG-f7#R@pq6I5|L~FCciz< z6})$1KREMcS3+2i|G=;N2k67FwH5Xb)JL<^RbBXLwza92>Wp0K+;wK1?16x4$8_)c z;f3Gi$Xmu1eP+HS1<~m&APIF}_yrb9!P#EsSVfmPP@Sx!5abRt3wPTSP(9#m;_Q*N zc^@TwT994g%y%~03Jiu9h+oX@*+m;1CU5&t7vv~^f?1G>G58DtVDv{O$Gw?Rs(Al~ z&7>_Yb&-l=s`0J0v;wWB8x_R2&!4Yiy~ZQUv-{5GQ<@tW%=z~18?o*n8BY@5Q12hV zdj0x*3S!EZ!hktZ?N(igN_?;a2s4eJhkji=X7a7qA~5K&;>?^Sv?~j@!vo>NE=fa` z(vK{n;P0#G>%U+0@|nbzL*7tNT+Ip$=fm>3sBQdOO_5$gD68kqJx6dP0{?z4_TO9qyBg30Kwx3MwsA6nkCVO#%Q zpHmIffpkoQs%idq9fSZRG`g}6{Dp(5g++Z&J0zNIy7XW9%rUqs6=c}P-z z?5p4NJLezR9RHE--O$U_#3YzQoMiaq&+G_5B(Av@EDCkU+!jc0QjnU?KI&1^E4^@* z#(OV26XvI)#~A#r zI~DB7C6uB>6d~l_a47LAD9--M)(gek)yiLzvg)ABiA@<`j?qO6W-sM!#kYeqBIWwi zLMjirX#i_JeEg`B<_RJ=7m5n;17dkEmRK%Wb?R7rFJ1K9yEj^t!2`-C!u^D!u(G>* zaq|EKi-cT=Ck7iGd%gQRIn6O_2d#;{2j8Q-y&db@Z!bvXZ#!aWweS5V#3ZKAxWt;j57~(eQPidaE06iXKV@ z&rX#yPMNZ*^fFyXDLU;YCl{A5@R1|MYYxN42t!h$Ac56hFsNiegNSMXOv`(K_r(KO06ggqdj1;WEouQ}pV=q`0;N|EwEkrhL?~W!xJN_Q zQ$eN9!!B|q*KycC&+6db4+duUxLGaq<{kg)#w$f*>#!DQ~EHR>MOZ zmub0};j~2F8*Tt@A7CYY#nEP9s)87;UK03#=dt;)s@ZA~sRWcB>gS4p^tZ-Xmg+QM ztBB#)`YOBp0It>_rIHY=8I~aDErSIK#1*}X9j~dcH{&(AP zpyJwyyzv|?JTZQfP#T-}P(`SFV*8WIf<)hyRiy?cjZ^f?qZ;G#X!Te{#oc|?tt~Am z5@lo#gfG)ufK49OMqOYnfq3hSHt_kt#-KB66!ZeBi?tU)p8W5k{BThKG3OxgY4y%s zyBd%w4N_N5r3V=yS%2k#`x#}&hRCk3cPP3W51b$Op_N0=RwrPh&XJhRrChhiGJ>;n zYVaZVQHT7eZW7~ak+hv2N3ssS9pD?`m(Ng7?{al@y*4p1c(-+I)IOCv$Rk&F#h84A z!a>^mk*e#wXhU#t+N8{~;nRg~N(a7cvA$FNy9U^TH+YaH-1S!~cNvfgXVi~sjZI&M z){XVnx5fCy=-5|OAGeH9*h~Lq&KNM(oi z$8HlbQC{hlp1a@~>rsGDtejhk?0&!0a}b`&p|J_N@|nM?wn zziTTr&_I8a_dK=FIk_KR&#+hw4i6W~OUK@c{c@w0*IwHyqp#!ZrEE{US-GoqJW`gv zd{fKS^(mZlE6h9`&Yv8YboitW8kzqv-mle>$6hWw3U`y7-|b6H*YOG|eLu4I?;#y> 
zrp;OpTt%WMZYy#+XdMG52N;B_HJ&iA`JB1Df# z9d6RmAdv$Q^DY0cDUN&0%l2I$PjP!SrmUa|0O=PMajD1rd5%ScAaM{4D1tpLfXeV> z2^gxJY}!ny12!_9)|3;GA!2Ljg?HFfvw0t|c}1sMi7-9VPp=SY!KfQr%}B}kZZ-9z zrVYWGfYfY#o%D77;wIT^@yH^OWC}Ae%>WM%k1UWc*Dv>5_ACGEYev4u!9}-US+Nv1 z|43#Qw;RVJh$1l)f`uF!HIx0HK`pfDS16@**E#i?iP0$^amGNM6LFS($=W7T?pp|gv4}rcN5Yz+#coS<;z=I zc;Hzk)UnIAt`$`@A?ZqaB8P$`5fT8TCx2CsnME2mtIc2#kaCF6!X@czNhC<#HUilZ z=n(_uWaJcOIYyRrEHgd9b*%0)mDi8HdJ3KuQvijU630cN>BsnzmM4Q4C|>YyxzP$% z9J4E#2Z$hBBUIfC02a<}3ME|i3Ru_sZfOlC=epLlM1sr~ zFa&j)DMD9Cc!pYg+>xYeqCY(p&(%8JS|$b7i)uT)U(qr=377$EkZ_SvYFgyBOtb+3 z{g8#383}46>}5y}uyZfP>{v7VqsihUS7 z6Lay{)oN z3|a1-?jtNf?M`MZ%GtcXjF%wta~s?s8iB)<_mLPC>oTNEk-`vx83e)-(|@!Br#!0P zD=4I9LABG@en=ffyaplY6aNpWx%c1J$f`^*?v6&}Iu?*DDJ(%bfdY;&)Eg~xmC^gZXaJ>@ zd>gb$_76`vb`hnqI($9DsC|C0YL?#6p32fgjpiZG603P7HVTsz=8;} zMH7c@O-F+-CMTk`M#6&^3^jyD*IN4GcVx)#+bog|w;04a-~a zt+~6ew$X1m!0^}I8F8c=zGK|zm9v{PVLefauy;>k!nSOj3kkSJJJ z$dL5(#_%F$LA#BJ*Ot%Y$WQJRhv5ggUw z(zi0SS9y;ENC+|Yn7cHsaCS%$r@wWZ)p+G1ozUy8L%51i6)5C%#c%e<$b%r@>>&q2 zkV_%wO&$9U2X4|&g!+?@nb%#w<_0hOh4|`t^P7uZWB^6$-&Y2126yE5k4`r~I?ax3 zTA(L4ZruvNpN^LPE^Ytwvle9sMAsu6HxbR~`tvgbMIMy*=st66Vqt`qtMPPL6gI%z z!TGTrX)NJ8ufuIZErz~@hcDujiwOch(F7j5v>hZt`x-VI@CF2P~yxF`t_f?NwNAW_61%AQB`o2efe5X=K0vl8+^_zs@83oGouGHaTNn%LcFYJP+_tEx_EoHlYr#=w#l; znwDpJ31OzS7lbplqkbB?(;k)J8;%P>E4R+?`m(++^q>;SK2d#p0#NsG?YcNhO>D5s zx8^MEJ_~h-*{Xp=WIBF5LL4^%QS3l63bug(_i8ANXi_NV;~~$htC=VySPx>TLkTw8 z$kRdTHIE`7CB+6RzA^@~m3kZp@fyP`;$7!f&juiYq(9m0QfG_Lkn)YH3ATvwlFI!cKuTe5jpWdx-1A_xvn$ zx@L%fhswMXs(`@PcBhF&L!8HN9>NIVbih42LC|h;Q4opwbOr6B1}3;i7L% zFU6gF5UrZ3JgJCTj^=~6LM|IsMp5~#5fxPrz+YrVz*OJM09m;Q(~o^8h6@>gAACH< zcvJnp<&RY{7*CW1KU+>0N^zD=(^5@5=g
TbjGA+@q#z1O(hs+(1LPJ@Ot37FN} z)GM$NesA>6&|uw@?#%JH36r+$RGALje!Y9cl4VJMRZ`T9G7^L`n}>pb*@sbe*(JLc ze}K*cz~&~r_OAp_*^|^#R07y45&NL>CFc=Dnt|Ax?m`W9=KOgd@CJ#z6qe9Na37#t zz@H6oy?EBx#MG4BM5%`$W_5UsqT>-M1tE0u=c)VLazpl&F@b8OqjVa8mLbQaT6l@ggND-wBBj$S?#=Z=nKkqe}6`bvm0-|QN)`Q z8Z)1);aEZ_55zMIG8d$JE+e?F1plGtmFzB&9^x;{-*j~Fi-e~deazcKpcm**w~0vuHl2mr!17GAI&ScS{bp2~#Yj<++tF)W%*^-tFq zhCHG_h(}|;BLAa|gFtTm?d+S9mf1Yza|;Rz$`J}e8G?OuR(nN-vZxQ|`MtlP1!bV% z)Z1e?-D2<$Vz_fh0PUm}@SnkFQ@gFpmI_r76?(+9?cXkNcOvF27jX7<0~pxRS?Zzo zwkdXFgZvpWr^(LTSh+v>LmO@U=({S_BzsP#@7~|-=`M@ns&2fAgCtRFV)MM)8tVoI zL!I79P)JMzvF7C z!~?NI06$>E)>&Fws`BMc7}@g7%*==v6{hC%>m9$p=b&W%ZxPjJ;5{vCt=7UFpbAmP zRA-{B9mz>6x;<)et?Ek6Tq*vyDZ(N}+^4qFPosN$d~#}Q#Kc~F_CQ1gh<$s`CLXb# zczkdd{J(goa%nB{Rl`c`<`BJ5DB0QBWS5KoS%`mP8)2XjCyn@@dMmJo&@FO0Jo}11 zJxbPQ1?3L-u;86@x8{e#c-4EU@5G!=EW=Lo}{z zW9un>;XdrTYgPQOgY>7sr+Ub7vj0Jb2?h<63%J4eFye=;rOLI5MXapVC}?OCDY&Gh zYW=AD3Wn4e{&eLS;4I1ooG>4}W+vjAOCjxoBO^I~Nu&e;u%O(+DaZ?$6bIN|TpZ;N zXh%Sx%}blIfjePFL`<4Us2mE3?V3gBZ7^bAjk!0DL-M_`Et~!vJuN~QLc^%#rDA1H zEwQ!{v-zr6Gq^qv}`$ie+br-aRoX*t`;Fg+0!hSKrzk*VN3Aat; zSHZ7=&`s_h*T{tD#m%a?@MuUqQ0-myTDB}Zk^w4(VC5qHbcC{`>MSIsB00jbnUXOKG44Z& zNGKno`i~Z%;CqojV|N{OA=xGfvD?H%5ZPz}5J>(UXy-*;NG8P%o$(ufK{JjUJeLCa ziwD^!Ve;|a*EH(NpM`drwB5V8vBO$hZ?^wVR?Mli&k>4&GMD7{Nk!C}C6s>AhjL}| z-E(YZm3cFzZ|A#MR!U6zrXQWdl+2#@>N`5V3c$oGFr?dJy%Id z!jSn3o{w(Pc`=f+^7&kg_lCWD_Y%3iaBV&wC1FYg2N7=j<%R63zit9oczS8xBGtFP zg+$ZuIWy&C-Ts&Ze-dV6p$Z8m(O!~sqi?oNMV%_v>g?`dq}8i>oGAE?rGmS{Yh9*; zpmWg^rNxb=rQz!aKXjMumg0Zcv1+BITUaU`KDzf>`ZNOuKF^3#lq5WulHXcd+_{^mD?h#L1jTh>);l<(HciZW{Mtt+j zy_Kf*;TBS!53a02jTtOXC;p`_mA3_FZs?3V6o7!$t^8?($=Vd1sgA1}PxANfq89l0 zZWAuy*GyHS7kL2{_vaZ=;5;Klig?V+2v3><=+z!34cN203j3HZ33LR0MIQOg^k|vI z+tl!sLy62Pf3f=v?GGbO{i?G_>|p>BF&eI}3Z%;+*u4!_EFxRZx&1Uh?Ko!roWtT= z%N3`>#kxPwvp1(M|NE$GxYhHR&KP1d*aNJp!+qqy`sFR&cW)JiI`H?Em~I&yf#Cum z%$@ju0YDt({Gu&|eUZ@>>6lD6vsQA&pZTsFf*i%OV$NH_ZCBC51@3i!C}4mUOhcO51$i2R?|-xn;~36KgIw$Q2aG!!cYCHtX<`hv5h$`CG%DNgoXKI}MTq 
zSTxnxEnmUlmN1pxJ~(z{_}uug7h6n-P}L!~M(;q5$ZzyI`3ojku3X{<<$w|)7_qe_ zYYG!?wYPS#jqS-dRDQYJI$Zm@5aZ3A38JC%-qDX0zum7J?rEo7MmR4_IQ`ug( zNE|hq_sLty)+Fl*bl9am42<)24TeVz4HP`LI35jXeRt-m6^nBd-8?ax^?Hn&IleIJ zPznV=J~34H^z=*^aDusT0au)O6+7eCd)M<#Qzq#3m9(&b@t$8T9zit)q<+DmeP>+L zmvd1tR^tZt47Z_tQEi!26yZD5pLdk$iZhxQH=vDC?@ptCan+1_K`|KQ2Vv02Kmj#6 z0AURYjUN1EQ{22WrR%DQA*YA!YI>PrUL&}u+)M%U6k;-t_zN8iHfPinh`E2v{oG8N zY@7H$#U5-@`+jbWo$oQ*nySG^5B>gx-~Kz+x*9-*4-ba*PlyGG=j537XylE=s{7xM zQW0N7Cx49yo6w$y^mb87=tWSiNEWZGY+kwG8E_AAf7VoC9iUz(S}-HHcPAYaq=B8E z#JxXR#8<*A;y)jE00SWKiElht^ZH`yI^?LRQTr^gGdW1a_I$zseTS`C4rrPYGin_B z;*S<6!h3J++O=1+G?KiNlGZ`r#N|VwDtkf}3=oVWXrmoGPvtG;=gJ-LBh2vrd4k@&pynI%tshuzqLYK_Zvx5IO zBAZS;rVAdfChK5*;oe?m-9EV#7pd}ntMC@~H%@)F7e88_x-vesLRt84!DUt&roC>* zrwvI9_>e`yc@fy>-KMWpQ}oWFsjw_j3%ase>3~?(n~r3r@Q_hu`Vc@b6;)ps?mwBv zlXDX~uP1m_8mDF}Cv6`U>ljF9i}`H+LSw-jvVQEj;(A_q2#N@5AI9fo7V9rEJv|}M z@l|8*Rr@4`Sm(lrk>}uVNPe_Zh?p0xt~d8FJ-vz-FJ7$ouv2tmB_2)k$)`{LwD@Qm zHmkQ$hf2-$;Vr?T97vf>$pkp6Xh!(apKsgl!gI>JSS#rHdHQeFO9tfB{STSPQ37N> zVPI?b{uQ6T6$w>!XuOO1W&ZTEx zhk0yT`-KoU00X9)y1M19#Pr3>YesuC9hO;{-Su89xtntH{__dMI1*fgs{sMaTgg$5w>ti3 zS%2foTS4ked&n08xi9|bEZv_oEU_OtIo&|FCq)!XgEoZF%p%}H9iDcwvUKTu4 z6K^*8+i4(Hv%MelT^rWFmm^(i@6dqqZLz(*y$MyJ_7J#PYyHKMyK3}hGO`F z<|(oNrn&`i8p?(8@^V2Ahrce9o0%{m`jFw}Fu2%jCZQjcRTyH?KW`WAT#$B3B`L1a z^?U%rDE)rAc4WzA0cr8$0oxa)dAllUsC=PQhxRdpu^HMcV!K*SK%m^(e0mOY+vo@m z+<5_tktj>rkC`Y;Xl3+IU@>QPSr5EZ7JE?G?R|Z7z`O0VbsD_9AeJqBtJSohWPzJo zx0Pd;1q_{M|FVt&tiItDD6KaG-wyZEDL-z6Ns>}%{l}_kJ(|Da?zJ3H6BrQ7jFVoI z8)-#AI_Kek+q&N@<&)$L>ga^+H#Dt^{R%<4nB>9ldaQ5T3LR&+I;BuChABjiQz-BA z?Sg=ZME9S(y4c;}vn_`twntH;Ml2*Ms3>3o1*C{l zRHP`j7ewhDQIReny(o&YMT$rh1f@vtz1L_cQkC98={-^f1n%4~Ci(6;cZ@sk@4q|7 z$2iA)%3GdiKYOn-*IaW|K3+i21VR33sIV94`l%-EE&7Y&IgGAd<)3b}Wj#N%E-l_L zb!jDT3`I5jFG#%us04zs-1cAu9KPF2-|gV|!8dPK7?fFid-3$vbj{MdW4U_TEUEEh z|I~XPcV}i@`v*P6NFf}eV`|H8p2zsm%EV_4vFdjjj1mXCE8kNWT|x9>{!dewo0Ei2 zc@!ltgUIn4SiK1|9gcf%mNZD7$RA+dYb>|&h|bGgEv_W(;TJEKTDaL^u-kc;ovZS$ zqHP^D;>5DoW?4BJ4h3YF8%yrv_RRjY@j|g 
z`P2NNl@kpC2hXDNx&dgLj;(zJ&6aPGQqq4P$G5qtM#9uR1KInGb0K)$f8wS$x1iq> zGphBy_KX#UX`>WNPp){mS-4+(8s;+ zTA4eA;=cSAbTI!P_rI1S_UApwTEM$t+%r5hVEo4)zo4&pyz76`oZj_pfAEkxOVe4X^X@kEs-dPv?`CFc`6ucGu1eqY z&@tY*hnBuO+Qga+E&S(mFQAc$VHmv%P}<}!cJQBjtEz|^-HI`IyW@lKwr`NkM(FoT zU<&eju#8{ zw9-Y;+aYO|s*vTbBMGtSKgRdn7#Y6dOn34h_Rw3881O@0y!ZuswAY9ZZNs3~Md_{5 zi?rHVFiksk#`+fM$#7EF;uO51shXYp_e))ySbZBfKut|ez#W%M2iMR*4M}V^bI)&k zfA5zmn(SVF_w_iNluN`^Bt}+l`9ie>rAHN0%`BkkjJy%@fM4W3o-i(EUk$~;jW&Tj z_oP>Nk#D5l2(Q>jHgxmA6Z-bhDQe7i*M6;y`a$PmRQ#K9kBisu?)%NDfRHBx3h}8o zem&00+Ec{D&{dsyNX6!$oQuD*)gSEfj4M$lJ)y%^`wLYoXv(h@9Pi>rU6ef0iUGyW zTjPc+N6$XMNzjBR_M88Kh@Un5UKL_!=cTrL=Uj~Ahy|8sr$3mt{Dztvl*anT(zHcf zVUf3zrmrL2K(|NTshpAHP!t_KG&^OVt zu%c?8{1e;?FWf85$4oPA%b~gt3cM2Zk1q3e#pIy};t#cFUmsZgU9^p~L`_64 zF?6G^=eLNEw7)|z@5|>Hg-`=wpdp0Z&>JQz>f~Rtswa=LZIidw+(w8J4EgoG>ML#O zvX5chkGLlxAo;5@xb?=!xXIJ&qcKuHa{OQEm1SwO?AHGD)AlG8%!Uph$$j$3(zQu2 z$427Y@m2=m3)EBhc0!!TYV}#(iN)m%o6gU&UJDB);|hD3&gchg)9!Oz?ZiJ9-NgP| zP|0uG-I!$(g~Sts_MbbOHh!Uw=CLKw|x1@I{LLz73&gT9%edc`cGdjXl~g zCFg227FmtfIp_vlD$NUQ*1IPbn(1Y!ZExxJvt9ON$VM9cyy!D=ZHQj%cNt{n9v&KN zn7(xDSMu+^&YX;#srRcQ28K76xC~Y`)+uE4rBxXHLVg+a?)a|2Xd$1GW8n|)#tg)X zx^G(Cgn#dfKEG-weM@H8f|t9FJ8M=}_CS5WuRqz5_c$h}aNC=W@vv!C@%KrOG|CC& zib&t0C%^J_TYZEx)=D->>Amf=X{XF}tGm)GR}u8qH))B*c>Yj_@4c@1b=hpW>Cd=V zlTUMcfcJz}#NNvD*4xRuY{+aMuO7MbI~s-UPpb|Z)HppXwp<*l=MUF=^!J+{k~fVD zzWEqz@u7IiqWz4*o9nsc61t*&+`Z%a@3lltRotR@I6F^uIK=hJ7q$1iT!mkY7j^w* znQGoEdG6e~W`_dw2F{~SG?K67%e#+FQ(kFlT(8ehO?2qK_JM8cKamf7E@8s+%1w!- zA;9;n_Pm&;U$jH5_>bSwJHOvp2qHsmmPNmlR?<&g49?It7|Fh(u!kkqsir7(llDB; zZQ^!!`4IW6*W0q1t#dA4{dfqoAZmlt9Hzb-pW^wM{A$A7c2(1jq~r74U1#4`9UN?x zKIAf}{@y6cNdLu;)hO97)Y>O^{$;>|K}2%0MU_uZjC1(+VKv3++sCqW$cM-z9nYm{ z1CglRvR3-}z1_!GITyXncsX+HUG7tV=kv4rCgw5Zb%jZ#mFcg(4XRe6bn1n0hMRNHh|TqQlJ`I8oDHlEd|I`;_Tfo>rP{qLe=o`f6P}JI@B0NT7WuTZ ztsc^de7#@!V^1kuJEcNTf5u)ny_mbz^wN6!J3~?G5A+ z6{wZBySnj{iwfEs%BJvh=dI)6_p6po*%Nk{(0Se$p!@NSkv|vnv_#17;N4XlGG-XB z>%y>>yy@{^h8&**DsPYMVKbQPcf 
z`yNGU3y-wlDPU5l>Aj>qpBC_|xrG&95(-fGNg*Gn+&3y1?huo0RFfCiBYBnV1jqfd zG*6;t0Q zX(y@hTYnW*Vef_EL{ukH5l9NKR4;ioictzQctP=$Tj}%-rQyYEb2fL%pX3v%tJq`{eJH3!$ z?0L&qIBXuu#MiH8b?QM|bG+QcY+jBEg&WN&eM+ z(K|uCT*U)Jw>)%>PleC5YI}JED0ZFvV^#f>VfF9kCcDU&DBRobC_8p=$zD9lZuOE; zo?2-ewADZ3qG%Bwtg@SEOo;cH)L zu{Ugu%>HqtynGUtPDN+2lvv?Mg_DWId(v<2N z7)fiBmFwfBExetHJ>!rx7RlEhFKabzsMz=?1G!_GGWoLJ8v6Uo86urD>&QQ-W5>Uj zR=Fqcb1a38!oyQDjAbt>D(X>MdA+lZcd{!@uVzvEWp3)f^j}!Db4%J>pe^^@LMoqw zssFs8YG>!)AFr-&tYzmj!6)7kU_u+!tKCoiTE|J8;in&}(g!Np?zkG5UJ=T()bsuE zfPDYRampDp*rpL}ypSGME#yePCg>-2n)lc5bueJ^(6izx&x}WrFHIGHEZC9q!Y!_f zH4&?_GB2(<$!~t?A-R)63T{P$RkQ44DE)eu!6yQ_Q{z>xl`CGbPywb`Ap6ebR^wyY zcPt(^%vi~+tnmrHCtLS?=@xiuC3>RW_a08Iq8V@dWB~a{nySx9Jko5G?w6xJMRMQl zd3a6~b+%1Y*C9QGW>bzZv(MXC_nD4`3kzZ331?qo+{xfnkN|JwPT7zRfwaJWq{L-u@oBn)M>oZ+NYQ z#muWeBzIt(wh_-)GV3+RIOiCa^}smiT?OiEbWTVArWq%8_iDkYu+b$kTM3EBjbD$g zm^nwMGOTR8@h{h`kCCe?jLP(Dr#bFM9C4OyAm6BZy=HYMgW-=2PhR2}vvKXUrpj6M z%R^nwtlIO3v4aR5tQg7HegPZJX~>LE_%_Jy*+r>>%nZl*tiGfmc)3 zO+V++GdA>=Ph|h4mF-P^!ELLzG%iO~ExKMW{q7_XzTQ$l)W~qow`o81x8#qmp>wNt zhP@1U6_@dH>{xir9lL>Mb%`IQ{Ksd6(74F?r+xK(UL3x&wz`w~;C z3ST^UjfTdbx(p_^zwU=+roMt^?KQ75FK=1NY%haL1r3MKjG25t{_maYnhWKu zqW^$Fv1#Ze9Bt202KA;Mpn{jTcbHy)-FXn#51Y4Q)SEtV@OkiOykEWyhw$3qbK$X% z(CqFR9n~DLMoGC*FVGCxey(0o33G^kvNQW&O=h$jj<^dYPY)O8Vf# zuUET4yLt52wE96mzk}&pRxT8OI-XtCR0cX*0y-SLZF9g`Q3F@Z2hkV>t@pY&FOA5K$4?v;gMy$pY!)A+E30R-)0ib{CMv+y!RI-CT_~a&bw0&<_6YGep=rbLN{{Yz=8K82Nl?LD*f0+;tf{*wW4TFh{fTW=zQVI zYZurPVm}BuUT5nyA9N}xo(bb({&#D#3+J!WScgKShg1IWKAUfSxe{ml$<^6Cb>O*W zJNT}pXK>J9a$Y%lmrS}ao#DD~FC+7+1`NfPoE$}u;A?oQ3jVpCiVG}Wn>(%3l+f6g z-0dg0auph{@Bg^43*;u){U^%&?ykw)1k8$!0_8&`iBtVn7^e_p753U0ip}C4NUx*z zXT)61%dUJc)+#}q>z!GW(eVwI#kMFvF z&0pe%wOvOUkKQC|aQpz`QJAXo4aE6{K?LqEWz%N%6oPrWX_`1<5Rl@=H(tV_Pg4_w zJQ(>4M0I5YMIFkzkHkJ|n5iEatHe&npY&?2jsN$ZUOZf>jWR8xuyrh-1H|$vHGl*o z(~KG!Cm~8oTGgcFs(z`&Sc*E-zXhO!sEOfcw3{~7ftp0Wdv^>7-~+Bg9i_8Z@!DGS zQU_uDfdbVWCR~jJ>cn)?@*)~QCMG6oz{6HdGgN%aY;m1=xJJQ)-=z7TD%azL{x2T# 
z-zzm){&_5pcI-6$C?|p?VcqM0zpr8!YRARCm#u7sZNppTd6_wa^u=}>H=mn*-}q4@3VzfYHQE2nMZGaGsv z&DOnzBf5Wo1q}`RZ!yiA|NSdX(|WKm{`tARbvre}Tlt4Vip)wJy7JdUr&sO#=an?C zfBNqm|MgQe|6QE_?wtRgj(^O<|F29(rhhw)-VwAnK;NaH)CxQcQ^hDbw$ASe_CPx{ zfP`QS*mB#qZx4k;sVedU`D@pLLi{#q)I)kA0<2Jq2f~Cj=o@*eKZ>cutQ7v+UE8p&443n&025KZ+o~HTF3u!ocpE#z6fc(h8$tj82H0{qn6KVfCgz$Fu_G)oHaL34d zq^JRLoAS+@p$t44kw`yIBb?iiOjdHmxzRc>@xr8g#WyLB)v-s&#U9_VoAn$3(4Q zbYg=%xp2;9O@Xw3o;hANlb$D|0{f(pn^l{XI zP!fh#FHRnw%g~yNmJ1_IVqs7;C%2Z-X6}5~!C~X60FCD6=3*=BnGscPl|=FT-@iVs zdVPZQ`Ey{9Pot*I8!(z95T@<8bC>NaXh=rj!g!cJdB}obkJSJPsv72Lp-VRooo^rN z>lN|r>oslK!!R1I0)#)IQOj2R6`bd(Sa(*N>H5H8tj;Nxi#`zaalB5B*ueC7v2b_Cc{{pt)13`$*s z2--*j>N<$9dsqKkJl5h9Cou_u9A9)T4nZV=JZCfjM);d#Qy?*>fNc~vB-yLLYI;S5;=@&eWKmY%f z1Kq6r&0WCag}}1SsjO1JWr>E=^9^SWK9W2Mc?lAaL^mWJ+Z{(q`-oC4bUKk`c%HN# zi3H(g7|C>_@OW+rQary}@k5s7+!@3`oxH!4%WT{AVE;qA4)Logy3*ZqWr>x@4#**Y zBXUQuGZ_u0I1R&~m)BzTrdx~>qL3-DwJgJiK$D%_zhE?9yJU;B?~&8!+u}(S0fAuhMWyNeKkkmTJgO$96M7W2d>98Arl<9Ch)v7aF+4Zb~N~RNV&j z&5WH!_yxe0fP3=>iy)?)gps%$C>n<~EW=bHl|$UzafLbO<>lqsdbfK@y=!e1?mYJN zjV#!esj#urW9{nLqxKN_NxS{|-CYSwCWx%iiGB;htjC0)prF-_Nf{n9uR=gtSCy-J zx)nt$Pgd;Tn2&V4ILGb@qmY7ezUnZosH3B^Ycg6nAs#soyRfi2o`&GqL2T~G0NXDc z$WoFGA#Lp9>ROMJO|~Q@TB{(R>|OAm2^JHrmKLq(k8ZdiG)1Vi@9Q2Li-%8&hRczp zP(ESS`t}_=4sjgnvA4DD5st@IFeU)IApXl0T8ADzpFamf_iygow*l-Ga!!)Qz>Ww( zN56id<0?TWEwhOZ%~8AxGDAk-TjRXDnYLvnL98wn-6hx}sveNx;~}4hX0^wHrX8^a zQ3Hx%Zeyc-@#60opp5jHQHSa0?^}vgq%*;qUv}lad ziB^h>Mym%ChN^ul+z)$oMP%9wH>7k;>*?di@FEo0$vVhdy3S>Zc`9f3`5DE|50ovb zVyhm!+jg21peZw2Gbf$rYHlP1 zNy}~|T`ZfM+p=N9rvq-Mm``O@joWAhQA}buf2{y0X-uC@x99jZuEPo9SKTiKifVfB!{2xFbmH{w`W${7 zDPt`*y2(!yNZ5z!m9^3s?;VC6C5HrkTm*JYP30F^{ywReYnucsuQ+n2r(>-HAUi`y zD=M+<#OO7(_I-Ie`A3(N{x6-c(rJTfoZLoR;fEPm6;iy*Oo+H;B$g(BH0lKZm zSh8Tv$zczD9c)eOLB)Wc8((Ad*xPZ@dP3_IW~4;))TD~r^$*)kCTXlM>AW0erj-TjA3?S;^>oFG){knVArjm zCeZ!12uI4uuP^hUGsv_lF&Lc^IA==q3!lXiCDhc`PU98~p`0}Lj38`F?WeHf)GBu(Gj+dvw)-XKF-U zfEQJMbn5KL3WUv8*}R_Bd&HL@MtCP0ZTi$-`uegXZ1A6v|CRK00MPg0Ba@W+HwxL2 
z$OZ3hgnd20(ZB1$f%7ghz_^|7vfy+fG^zUe;$u~uS{!wqOAtZJFP-U0w)#CAOA=Aq zZ%Z1FSP{v7xx^Ig$L4?pFv|q8m|68l!0Cs@BuvgKH7L(za^> zjf)EqNJFoZ0GeeAtrGTJjy?jM>EpI5r><|(PKT<8$hWSCg2w7<@aKV$3ziv}z^!(E z>pP9f=WXrnfYFb~T(6!mL|+#8o(X8qmMY7d?D%z&__xfnLhy|_RwDa2Z`Y%DJAM)!}?J;gjJdLTdo=ejS_bFw&bajD& zNnW}{GNX+S&nf3f^eyej|DkXC{kwOmWOUgi1h^7lvRiTkeYEbW5*QI%Hg0v6*RjxQ z#QwKx?=>W7C15wIR~{bZ1D9MMQ!M8CFk(p!N-IfrV!G;C=3!r3Iy*bV3y%W&!$L`c zPyi_kwH)g>7>ADi8G|ipF@{jD8_DVnCffGB z8TruIs0{q{g8Q(>oJJTTtg`yFzaH#?1EPnLXX*l)9d%42L!oZlV5`~0zHUaLty&fW zO-QqJ+tNCtEOW|X9w+s|vs5uHa`LceEEEE@NQ;U0*G zh4i9GOt78fL^38lbz5O`8)xTafx>Ipk(-72V}~fXIpw75@Q>C-V?xJ{$s-jy67IY) zo~`%u=j*jH!(cI9l$1<1+8z4(nrf5gpIPQlIm*3-_OvET-?;Hs+PE2lJs9B{g@?n= z$0v*P3dCCQW&dIW8Fp_81p98Me&P=34kNnWp7R{zh0wIjc4CamDq`QAtb30HYe=OJ z$l$&a^Fs~v;+>h1Po#i{eXPpGfQjMq1hys_37EnRnWH4pK-4mTV$nzBD9CI?;r*n@ zfx|g)VygLh*>hpf!KM=Qp}90dSzOR`2LprdqY31oJ96$_mpI!?Fn>58#@e-;?SHeMh$id_T;qz=RPh~*6H5XB%27KbT!CiR;E zfVxx9{*j4UV{5CH;eaKlG{O;%@2tK_tTJ}GH{2%d!G#SQHbkQuAv!;BAV%-Vj?kc9 zwiDrP{YU9TBlVUP&f1O3#mR;|scZ$5omc3vy%=f?O;!kZ^aV6MW}4nL=n0)rGqbtb zs=eE}NNyk*!QC*z3LdlFtjNmp6k40#@-NE!S>Sv-Iyqx6vKqZ)R!zA^JY5$u=l3Si z!JOEX?UbUpZ7ql*le4HQu!@K4{aoHJeIEl1GS!YG~Tk4 zP@=59+(DxwC3AS_)tSem5s7r7GWO_2%%JT-CRR((4s7Enh^#rp z0|~~h4xaJsnfbl%aP|$eBk9CsPQo@Ey9)U|wPU&!F97ui5Bv1!eKC};doxa)%O0w8 z$<2NtF#0qWYato^O@03Z+Q>}tfWtz*n5MMrkMEdvbucy4{KJBVzsISNL*tXLC$@Z2 zWKb4O7{J-LZss+IP~fiW8jnzvQo%bIZzNf(=Qu*mdP}s2C$+-Dz7KcMIx(WAlcPOX z*8ik=YbG7j$)3nQ+;GsI7Xro?8qUg=`aM1Xn8a~HeAQ0xn%XO3ae##0_gzmne>jJb zB^dK}viGQ|n4#l&R|2nQ_RU&Bp|;x(zuLoSB7s#&Y6X+NYG;|0M|{DOed}&j7eo`g z6A-&seAKYMS^lQj4#|%J)Pn@$0ICSY7^vT0;-rO+a?JJ5P*Zhk*M$cD^IKbSQF~ls z4TO^qI-P0Y+Cv_=*LvE*$W&x+gIh(kIZmDW9=);014AX#wQaH5~UDptQLQY$Mz^LU(?6bEo1f0*4@2_1uS59Aa{T3B^(cVGzPj3qSK6nk1wgv3Rf0`=ib@pfR%3@ z+NA{@BssK?5AHVlt<@x_(l!r-2X4*m)WT_u8j6evu#c``B6TqtHP~2PJZQwQz{4(B zfFVJ%CkC<&sKqI0XuJl&657*|_-;5TW%O(o+neT1zf^Rz|MKh*`ZrFFcI2sh@FU(c zMyL2}$DIU3(BX)xA=r+{b;`58y@Ob2h-MhicJCB0^dev@;a+%5VLGY-&NV?(;&aAM 
zg6+!=xh<#$ktKv`P+VvzE3mHmF=I19IQs^`0o)vV15r8S`Ra5cQ!gNh%mI-o^h#+E zz5&*Vje=DuxCaS;|C#m_y(IS#!Cn zwpJdq1#6F0p9+B;Lu~YLii%1Q1FuvI*;x{Rl31Kt%7su+0y=wRKF(%$KNsc)`$CxI z^m8awK~7WEzVDUZ6p|&12LyRzAUaUCZ`?)z6BY{)*O+eNAU+B)%AK|Gtue~Hi~VBD zNhiDrhqNWrEX1tw9`_J^c@rD zNa8V!|If%)Kbe5McnB5WkN5~F{vJO}NbAF{Mz6CpFiU8krzs%1z9W#QL8LuMZydxM zH)=2gq9?EMJSd$!d>b>GXvEL0n64i?+B`{xiy>wAgJjB;D?(2$8#X70Nl5%k3)q>9 zb}8MZo!& z4CHluyu3o;{0QVqz9ruHe@ z-JY4$+V!6}Ae}iwf6&|8Ge&-~c2;|h7+hnYefiJd11D6a?@suchCTqm6&?lsu3ZM! z@|zHdD-(NQu(XC#7@iKfR^^DUf(mx73}Va*6H<`q;xF}6UZ1S{i5g0F>|D)=fBfT zFoE3xwroc@P7pf4(}u0-aoEe1CgzV{fX)HiYk+W^;E@=5_!jX#B9{SpAOi9cm=M7Y zfhLBUQzGzJlUZxh`#KNp5Z6JH5~XSq!v(F7!4u=6M|frJtt3(F8iEUD zB7rr{p2w??+@>!DasX=(g%W{=53g2L#wpCC!j)punhUbbaPX-lJ&*?xqREGnl8C~^ zCyyWRO3$ z1n}VP>BL%~PywcrkA?GLX6K?C0tBUmorwYVIWaY*fL!D`vj~C1C~_zhpbI}XGLoI7 zlc~ai%X9rKA`_(tP9iem8_((ws0HQAL=t1|VB+bLb(`HueXNTemF`2caM9{(C!km| z@JcU07PwZ=n91Vdy{MT%=ZyfFQ;3}jgwWg8;`9Of(-rN09!82wI~%Cq{76bQ2vMqC^Khv0n2}irmY(aL6LQe5i~*XecC|7t5t)Q2 ze6GW}`0l~MC}t7+NCZ7YwGnPxw{B&nXaS1OdOCr8Z%}M`&J9@tJB85ypvGreEi&t6 zBup;i_@U%*2Wr)@i@i_u$WMtWUgfV^%=VwY&vY_@>CjfN9&1MqzA{9NQdaT3^8|Dx zE(JgME(z~P{5&*)_tc+?P{2rp%{z}2`ik03gkrjE`RF3F4@0ptc0Xu)|6YQK$2{1B zuVY`694?ToJ4cO9f>^(;JM#KWT{CNCi|wV!j1qRX#rxIt6El{zAX5n>N@#;fERgiL zj|@L?X(ZGfxP`LZYDjXCQovjvK-I*;Y17Ql>na08ay~9%-xxI~>ky(}Dki8Dm6RMl za6kfy9W5PQBX>sITlmHu3)kP_q{g<~#;6%e4Y2enY()fdCVDOxTI7iw^OrPz)r_$moU$t;0Y|4#q$63l2EVjaE;I5=R$c~0?FSV+NVzBFC|BUkWMc)@M zg2=6c4T(%i2|974V2CU@2DPgpfElBQhL&5AmXjJY0Y@>IMiJ)&Wj}%r;A{>87EpxR zp=2&0DUlQKM=5a4A7M_={va9`$?+f5|74X3UMW!BGnPeyNna3nVAG&k_jZbJ3kwb1 znZUjF&0bHqZOQT6$>*rkPH+ob4S5j*K*~G7jcegEt)Lp}qrXh*IRp*%^D`7!hHW5_ zF7f`vRG_VzQ}E`?vmL^p*0c60%(dI%a0ln7BSFN)zUH* zJ1Y%>Ohif9@2HLw;y=l{Vgx;MV6_Rf=*Pfu4g-*?nth#!vp4|R4Yi~jk3ski!z_TF zI}IwQN#&E2!J!5BA%hESUr5f9dU*Jiz<8?pnjehl{qjZbtsqM3K@$gz7@kwpG<$#Kv}H1rvZWxcuWAYg_jnD=8|XcwpO3 zeUzXhVlZ=K;=C1j3*iz|PDnfk@Ky?B423eKPUg%SMCby*6?JajEQMsnKnWD;hrwAW zwufxI5<(}C8cO%K5;bsX)pFacYYAg(vihV5A7nh1ivxQNqE)rsBsy8vo}= 
z2WhGv#H#Y64pXbNf*KtG%}qfKJZydw)vpguC76}Z42ORViqZW&9w$OwnhatfPO}~w zUvlFq*bxL~W*shEtdZi40{=j8l+Y9jjK=2xuE8Kc>dGDaGvE+WBqi~pBr;1N?y-1~ z(b(2S`6a*qdf;Y~iY3xdVs1!jDd`rfqNLnOfT$G`Bz$HJiV1>g1?m|iCW@vbal9p7$;z-;@Sun3RO>SM(E!_>!!cjVZZf_Huiv9@3z05E`tmr0f7 z(j}LzD6zCzR2wXPpDXIHV&1VM0%H)c&eCA7)Z%jE(myP|2;GaMER|sHr3c*fp~|7e z!jHzhLE`#QZg%Y+=q$J&EMdUL#x{t+Txhi&PVf||uOdTBW}M;15g>eQDV1{)4dUtxXAD7Ru&Cd)=+%MiM(}g+R%vHFOl3nrwM$Ky-=S zC-KC~HZXl2C9f%zTXs6AfPzD4H)kJwM*>1h+OC73`52(Sr1U_;15_GPQ_Vx zP0kWnWGuMF*Ui_OFguR4ht(WSF6&*asY#hCPyb#$kyKRO`qs*{bw0Y!pi@}KB5<+W z+e55JY4Wx_x;o1in|aXiO!7ezCho3&_o1SK#4o9+0Z=?})KHkUY`1D0gfVK|EY$Xb zH?#>ZZ!r`Y<##Es!7|;0Cg~HhmKlr(eHmrbXx(u;D^xZ zT^8}}{V<1uqArzsZ0 zm)gAHHQ}u6m!?MwK0nz_1vviv{nxdk%LlOyVP|U)NK65YKMXcGOiVo(D&VtRo1N*& zGqMJwLhy0JaD2{XobWn?x4_8RMw^l(a9%jf{oyq}R97dIS`jW8SU}pg@sO}?*#h=a;j@k22xD~b9wFYDJhh^+M(}vZQ3Lsfje@BGtX^0j zuLH^9#-)BI0FR!g?n%e8XZwO=b-zXpdLMk@Z!rD9!s!b4T;i1(B2xzVNde7~`6k(e z2kmTaw`Y&Ra1!9C-K>O(z!sXVW?#R+7@gZy3xFimt~FOeA3#w?)3o!((Y0|IkjwT@ zf9B$%fX7&$WuXVEv@BBGBTt@W+kSluk>hA6nb(K!izy=&zT#nm$oG`G6P~=?0!$5b zh$Ml@6d=MCH10{#`YlXii%pjRk#om5Z1#dpS$ zO^6W$CPUGx5lEA3e|=B(?a#lFIwyj+*6l90h|7!V7w>ftl{cGs0vg7*8;#Yw!=2#s zZsd{}8NhB&pVHI|V?^Q_`F5{fy&^FyKZc@0BS(d>#mGrS7&VCMXj&9En*au`iG2U- zz0SR)pw2a!m-mg)2QEqjh!1>3@FV3!=yn4?#BG_ru_7nc9NATv0Hbr;L; z7dfYzDeLK&`L6qx>CMVL)RTeQ)cIFv#nGP+ zvhH2d2wxw*n4vJPT=`yFY*W@Jvud4*3C_PncGs1Pb#h@xDT2XJpYL!-y93U23PxTZ z;#B7ofzdP~4kQGdoK1Y*3#Y<0&WW@}vOjLO^Y@9{_gC38yd2qxTQf+4Mc^~wF`*<9 zYss}sVI>uq?B18^v12#VeGtbP;8PjF2KcoC+&f-ih47p4O`_$kEJ$>%Rqk(}-4yq> zX?WLlJ2wGj%R>^(>dg}tK;dZ?_8Ij7f^myA(7GwzO42~_b7*0+{)}L6EI380^_s&% z=Qg&Ev5xfV`s~w=?X8}#Ur*+SCI9rxG170uoBP*qD9?uwABL3B!+b2XBU#5;(UCW< zjgF^`bNo&FV-CZCq}=#T4ugX_yHCb}r9n(GY2*O0lM@v;bV`IkmPs{yk>McvkDwFp z3M^K?jY9<88wY%IGPf`7NyOw8o~SSLA*TYbDDJ$oCx%#X%-5v*FZPZ62K&-2CiVgva)jCbc6ci zb(6V^vJf>zs8SU?%KsyjW2K;_ES+Seo}}yF(a9-xa3>b;I~v~i;` zpW6L>^!@SY;Z*Bp37?c{Il)|^p)(RMxB$#d3D0j*4Hx5Owh z0tFMe+W+MAg|f|kpb+r&Q6pQ_A%bnT^^dy2m$4s?x%;I4m_$dvaPPfttQrmb&#JX% 
zrG7dW8e&xlP8ReQjg69{=sFb$zhFVnH}L|=TT;oCetbzbKX>30MRVmX+)7QqDBF$C zr0JBt1x$|3UFnNmeSt$=B^RcqIkf|GcZ_pQI&yEaOl)TxKXx}hcXzuiqu{MbkUgSc zbBl8^_COY656(YWw1D45Ed}U5{DZfYaOG_!oSs-W0%&?i zjuy%y1Vk#HBI!^W1A{5j;G%d>5Ql&%?5st`k*C=qjQoo88|iD|Sx7qv>=K&TTrr{x zV2?~e#Gy|;16NPGd9y$`Q!|V1a7~!=S9<`F1UPyY2jE0pN4z0b3^2+Ify2xMxqua> zdCx(5htwVRr`cHwaqsM)bc!@w-T1_&rteexlu}@6j5T{a&=U2{!a6ZXQHl*un!czy z%+6kFOB#v%XBH8;?o4QmQ`eKX<#EL;cPzXPCOIG`G!##X%(iL!%m(mjKOtq#OE@!? z*3AiVP@;vq*HtAfL<`zYeRd1ozz75b*m`dPch+e7#ZuEVQ45{P zx;!{ZNH7#GZFl^vkQ}MTYFr?@|$y5T&p13y5ckXKZrx0vVd@1wcUhN#pJ$ z_pN~{(MFAQOfv+`TE&G92r@a zGl*V2@BZ-}0ZlF3zj0azg4Vdw(uS0FodiJyrwE zRLdR=C*uY7fk~t7kCrbyTMMODQb{9K9*70UqLz_|L3r~BR7uf~12uHi>j@Q#Pz=B- z1WF@CbdyQJ9AbF>&vTG^sbyP65zB-0{|aI&cQ6kKECJfD`sNbCF9yM2M`j={m2h^b zvwQ4w{>vD>X%je$GC$R-euA~8^)%#-8= z*$-(^1{L-XFYUvwATtj>I~4E{owWxkV2pT^W;C+@2;hh;meBN&z;PTrc%C#k6EZB2 z5sC+K9=ek_myWcL|MR*1PSC`*w%9;<)IJHp1ibH{Cgb6{s1Pi)7Gf-R%Qxgpv1ko% z&pD%Iv|k@8!%D;3EhEwq<>xOi-63Sdc~DW=C;+dIz4H*sMa)i`atW25P~pM;l6WvY zH#cVhCgEezJDh~TL-J_By!$vfQ}}S0C?z(-j69eia&n2civ&g>qIWal+z(J%;HFSyF;dH6>P3pK2$ui`Xb>JA;c&31;dW^fNvwiMo?57--z*`% z=JE)LGIiia$6R2xI}0OHy`yGh4TyeJ&LlP9U?dlLFGWv0lSnF z?gJYhov`-UvC8;pX#;hP2qAQRY&8_6WwFmm!3$vGb{rn)if_+aBrxg|>9RYsmcS-a zBM-%dp2NV;0gx*pAS7JTt!ZCt!lay>oN7Quq2IAX@3k1{06+{$9X63R-@6%@S1Nex z5XB&0qZq9~&M&d=aK3Z%^9-4qI+~!P4FfWFq2aYBShGo0tPjIhSR;` zbSkr6dh_dlwj}S>GuwiHzAfzuLTCDZq}#G5|C7`ol55Ve_SzwF_Jp0^diCPpo;guQ z*Z3go`0_t&e$so)(!*7C`rID^#yrWoMTJ)XvlqGPf?Y4y6Gu{&`|pg_+-*Je%u`hkNW2=Yo)J$fZ!lh7 zHk1FnoJQfyc1P)Cf?~>vi#rl>ao0(VXi*ye_tM2>ZPz7I0J^hL5PWl6;G<%it577!yH>{79F_9P! 
zm9r9eRnL7v#3`)&6jJ`$e9g1xUXBp@Jw95Q7tHzRm|Ta&4o8`l;(<{gL|?2>ifP==L75SM5a-YF7jgabasn33FW@W{levy!qY1m)I zm^Jn1{&>gzUUz|At6NwR3wjVlZ2)-V+IYkV%oI6P>&xOa$$M6QI3XT%5EUmf`2TW}^f)mEcZCkhUgNj9|0WFpbTWcXwnvmX$)xwwdYYCQNWh0G$GZ;l^;h*$JMvj zCZfCo*z^LR4YJrIsew;000|I*5PuF;zHa69oEvBhz1vX6769Xy^vZx!qg14gP$k7* znFH))Ie2M8tBg?7$F3?ieuGeO4K=%oa^t`!CTAU zUtY|BAf*EO{RtLDBoeTpHnxwU5=9EDsdybqgQTK7)RYv0No?_`!EDba{*kORM*onu z8I)Q5P{xZR5e0&C_URKx)D7mlfOXd3FlqGM^WjzxMB+X4QJp@g~qaoE1RI)!A zXS9a{+|K+Ms29I#fnA#AHq-GAg#Ls$P2rqa)`?%^b%l=NJ0<@$vzm) zMZ`WQ+Vc%j{PeqxyzIa$aODa}|I)CPNt+L%xH^(;wRK{;R0z_T!)WAz1(QLKC#lQ< z&&$fm8AgQk5jW8HLR&z7ta{knX=5*8_W)5RfDNggBM@qV>DtwrdR`(o0Su%8~9mNYc;6P?(n+g z@VQbydOFo@$w6|2PQb>`UuO=aK#gd;SGYMlr%a7B*KTEGG<0iU=sn&_p8$$x*29B9h%{BcP6?JbLm(E!%q3kRl~j zo-s-q704ZIV&pINK&7}s3m+DSg7-_9>T>KaDGS<5%6_8XP^6*W1xjm)XUEU6^km=w zmy_llfE-f!I)F*jAb`?FT^_Of`SnNR*DSEF95Qgt)PTB9R3fd(IE97?>LpoJi|D1f z7!~GA)~?rB{~R!hVbQXT= zy%`sQjJ@~jWtFJdTZ0d4miC=z@zNMe!ig?56m2{`$3F7iVzXCZTY(lTFzaIGG}iP+ zB#L&Oy!{$DZ8^DTJbh#LWnK%Wpi}OIOKq@aWgl;(Zfvq>XwkfXQyPpt=a}YB@$(+Ea6EN^mNJL5HmE=Mll{uW4K_@BqM}KE)t2$|6W;Fm~SgG7R!ZfS* zh?|qKe~-GO6TqF-`jYy8iR!lYr|bbm!~hpw+(^>Q(9j?-}oBDzigXvo~XZ; z4Def&xF_@UOoYM23~_OB*~(H||45WQ3so0N4ScVwoyPj!h@eR=hS$p;78VwstTES* zWUVVxKJ6gGFm?J9No`2Tg;o<;6s9(6x~-gJ(d6*%uXobSL+B)*#~ScZZypBtCBRzq zEO5rW+R!<|N&$rNSoESWzzm{Vg@2;WwRP`=r`e81a&&xxC>sgeL>L`tFo|oBLctW- z5?k-<{@>1?KW_-M|GZXj*&7~1VUGUKgP?LjC5N+?Pf?fHK#-t9-&Qvw&y;cqNN6*G zy*X#nh=W0q!zqLibi7e~R`E*-Gg<;3AULr?RR$#q%)2>5$jQhsXf*<{8Zdekk9rH~ z%q?YCPWOxP#;+)i4DLIchVF>u%i%33 zco=9ueP%*ie%8eIeiL<-&fcmF6)vY|&tyS>7+v3W)4ci(S`UJkn|D|SczSwTp+k>S zG&idYX_^>Y&yfGdFem1HrFQWcLWg zA!_%nwvrL06<)S8+uQ`ubTJHiO{9qzfIv%<_EV5X(K56n4;Hc5USrL3Z6lVRBCBg+ zKT5v@TbNr|*pbE4xTf3D4AJ|VJ ze6HZ(o-K1;IXtvGn3>~o$O&begzf}6LK_C$qG-)?CSm?tO-H2B-v~Cg&rOHR8;2jZ zY3TzfRhqQO;2tp<&YetXn=#ZbueLkgpT)Z9nPWU=X@>uBoj5TUJs3zjX;7|gloo2g z`Tgm=Z+~4Uum>T*!=3B{BGV4eQ?Vcx=5) z{I>A!D`J)XMt%>e%VAgQm$|i?rdZJwktXBu+d)7?db+zqz?%ZpPXXYbM~HVInhZdV 
zfiLHkBL^wQAUueMzd%2U4=@7~2@uZXnsu9Au!3@N71X?g8L1QYsCuShIsuN^0D|TU zq3wJ3#AL#$ckLuBbl7AP;h@!2j`W>@Tu!KT`e;_u6&jD0@qook3`D@{$}FOWIOQX@ z4xxvlUtsm)?(vN!2aNoj|F(Q6F};P}-QnOi;dqM@J>H9BZ?i4Q()Yt|C-=%ZM-lx4&9aE1Pq#P(f zB4Ql|sw~)7yFQ&n_tOC9$~!x?NGuOpTb8F<1Mo?AMIvvERk}Cep!gXAy;}Yy}eP)?rA_*i^uBKt6jv{%@gRJ zi6+)#_!>VF7y>ZjS))0wG-FHSgYG8tCFl2u$54K15NOm0!D<>Iv+&4VSEdegy(j*m zhbjUn#Jtn!)HDKurnF3G|CAc+jablp2f&T-{L{)2_d?_YCjYvV_N9B=Ie8y>j;dc@ z5T|8%D)}HSG~c$CExw2K-FOoGrw7A5OZ$zHie#mg@9Gt;(3tOR=lIbN%t}AcqIA7q zEMwfMH6An*_OQL65nA-X;c z))I=|FIc?D6#fTe7XJI?`F7g`_qIHJZ3SJv}|6v9>J2 zMr1W0d#&I$Y-}j5*Z?#w1Z{yXb7&JHx_PK{lRgBiv9@Nz0-<(GGUdk0*EbRcbM(fS ztIU7^V2FMbqJRg?gUKw@yNYO(e)&Si)f%9(RQw&RXJUr%iKNGbK%X(A0Ec6M8xJ-m zY5SeDimp!vzJRn=I~p(p4vuyj7j#8QWgWRPNswan}|zOeS|Kfoe(rE_5=O(I~flhdStNLYIbp%TJZ!(Fxi zk?{U+9YlSQP}hn!-2?Dv52ioSq(yQCY9*i!2_WSjz-lmPGz!hdf}Pd~JnRR8pAz63 zkVzB}J%|U@g3p7D5|490av_*PS-6I%k1)RhFrva1Oc=~C_5>k8&~T7q*qNUv3jNg8 zPu)Z4Vu?HokR4xEQeQ)+y*6GM3|7~jeD4gI8&5ZT_fa1!;!~o5h1@#|`;Y7Z>3q z^}+1gimgP2P1NQRFxy%xA)*`J91;ZM`wx#@2CpqOrs&=@C^HyGTnE)OO`W=&GY7>v zR>V2HQS88qs$3p+$`>SU44#%q)WYz%D3FWDo+M5ESY8;KhZ zdfXH^PA||#yySUtr4qs;40Va6Bv}yYQXs!V@w@~|D*o<_%+)()Vqqb~V;bm!P&0a%SG#n97hAXf9Y-X}t%ZS5GXP%wi1we$*sZQv8EQy~X zS~4V~YSyq97wU;Lj`z+=R!6r$5|*97F$3uproQ!m?3Lhb#@*T9!|s_>H%F&KMK^t>Ry{$uGr%ghM zv4!@lB&pLTyHr3^b4h{jH?`jni9x+nL86_?axH@b^df5t$)=v1E8_HYp5dzPhQmoD;a8w^b z{ygJrpxSClMhj@>^z##A2G85=6R{jqrCva0JTfY(`0X2I!hhK+$oC*lDk1&USBNL% z>h%Kh%xTP*m43`XY`r>|BXg32s^Dl;%`#y1W3Ls+O*azQeX31PAA2BRg&z?uWlq5M zoiaDy_PIrS8E(eG#7oqA?KEyQDL8fK80Be*uI6ruQGWcEN0Jun?iovYd>YA3&Wul7 z^0>XB&~x0X8ZlW2U#2kCdlukhN!=TLAGOm697lx5pCtH#ali+!QIw(MkPbWM9SLpehcg<)eAb8L+`{%#3>Z9rM zo_1f3lTH~jJ=dRqOpWyaUktRsuPyx-8s}h_;c`V%QG3PU3sMGi!h%`uINP2dKjub> literal 0 HcmV?d00001 diff --git a/examples/aot/matmul_optimization_guide/fig/flops_step4_manual_pipeline.png b/examples/aot/matmul_optimization_guide/fig/flops_step4_manual_pipeline.png new file mode 100644 index 
0000000000000000000000000000000000000000..5e29a8859fbc9a16f52e446a58eed1a812d9d129 GIT binary patch literal 100320 zcmeFZcR1Gn`!{|`QdAO>6=~U{tPl;Okj(6alo7H=W+5X=p;S_26Uvsoib}SWtVALs zd)|*%@9$^)j^p>o{nvf}b2~ntchcp$UgP;Z&&N7n!D_0C+o+hSNF>s>GpFS=NF+*M z5{aB+^CtYHYRX3ne@HsX>pE%LU3PLcaWE&HHF3Ij)z0avr74Gtxr3vno$V0;VIctt zJ`M{fr)!Rqf`T^xeuaRY!xh0w1Lu!;mo3*$>p7B0yB`z(Cp+VOGm=C`BAt;tq2=~u ztjASXbNCD8q}^^4nutW-_+9eN#aE0NYSatz+B>Y4ThpAaZF+ubTU*;?>be?arswH1 z=rCN#v&y{wjB7i61o!oTsCk<2Z_jMlfA_9$ZHqt)2p1ueirjyHX`T6hKl#t~_&<9!9*&fd$;pidg@&HGc1`H%vFm|t zZCY=K9J;!?Dn5MJCL<$r&B39?l}_~Dg9o=LY4&jJ-@oz9nKP%)obl7*2}n#lU}$92 z|ND1%cb;{1jX!(qeldsdr=J|VzKbMe-NCn&UBczy?c!-;{Hw%eyz0qOEPZ~1{F;25FAZN+IpaGS zmi^=wyJ#{b;^X75;Sg}0IgTA;yL|a_ihdrC^s!^=<=$QoV`3~l-@kaVU)pnFM^l`v zqM;${;lqb7nwoAfFfb@ADtc~G9hhBE(5bZS=;%nx%p7v}?vDKj4|49?N8ZuVam~p| zt~zjAe}Dh;s6RVW_Z>MxCvN+-B0=%a-6v1BlCpDh{BGT%d{bJgJbR2oN{Z#qo?}(7 zuQcsoUJc-J$5rID@(OW z*n7oI<)IK6X|h?7y}x{Yjkk5`cTY*OL+Ln$6m^DOlEBz{Q{uSu!>#LEcZgfcXNkvzV85zme)>Z;bD#Dmv z@WPuQ+z0N=-fR*rrb7~9L9l>;m`hNeuFgi>| zViYi}NYhTgHan`FvP5;|^5wvSUT+qy;u^>;$Q zp685s=Kkl$ZW&LBmDN?{S!pNBy4y@$C9YZ7Z=8N=Y~3w%e0-Jtn1}xz*|pux&CONC zR_$s1oyI=$A0CP3b)UBx%)9Vvy0F91!C~^F)C30~AC`@2E!E7}XU*}|;1mZFSvAKo z%2(_u+r}tEO2bzx@X^%0I(9w90eyWJUnC__-I<^0WjFR&%{%`w3R|_J;y7_PDl04H zZLe5ah17+y46e@iD#popg&g}4&L(wjYC!&Lp*?v-M8tvo=jM&k3w<|flvYcR2ncMK zU0*peH}?5vO-&7F!JnC#np^DNL3>XHRv&7p5Bp?MFWCHHnqutbF7CXR?x#-6AiuJ32Z#5Qp2pswMRHd3Jo6 zY^1{J)0@R?zXqArho3ljpP|SI>;F@#<{_2GV%vWF_#toWy1L}dDshdZlXXGf!h-A7 z>(`;j%0ok`H*IGP>aV`_!0K9np+Nn&((5_Szs`Rxa!h_Z&d1N+oP3V`HtSK1UpILb zZtwQ6mfXP3DuG;ykB>n-}75ho-)7lfku^llUu zO629!Y}r!TlBk?xQoZSY%n`+3;`)E4r?H)HfB$|-`ug0~WYyS{$B&adW=8yq9EanT zqIhb53qEKhUZ0_#msTUlL~-db(@Ue0KML)?cBDYhMD6Sp-?RoL>eZ18}|d>oVW|~&&A1v zVLzBo_EjF(BgM(NVWM>YX?I+KLlNr(LF2li#s|%~+MZH?Pld@zGjOG2X}+)i(~@EWw&q9u;c=z=Fpzv@7tm>B5)&QFeZ`}O2HIw>A z+tVM4T1V@P=QXx;oVvBWe|o5~tIX?qj2;mW7HGCtzrA77)6;t>>*F0fEa5V?`{c=! 
zPAzL2+}+)Gi`%lUtu5O%dnhX_i&%GTotv8rx^t&~_E%>>Q}z4zD(~ZD*)>vgYd&7m z*MHRTcptWYicZ!ZRyH;hD=VHChj~5!OykxQYdOc}i&$nNYwH6k*WZ2j&&kmC%%o_i z@A|v+3JXWFOsx9yY~4CN&^9Mg!uDYiqeo-iulPMbbt`SkfP>THSPuJJ?~Cpxve^#v zg74pP12UkZfLJofM3AJidr^yqWdSV=T%f0cu$oSZ~{_m&$ck&;VGj{yes zkBn@_sa(2zd0_F+aG~qO(~;}H0;m~aO;|nU`=4xaEuN=eE6{Mv0d2- z^vs#ASNGVY-P98NHke@{@$aCoB%a;=N1OED7a3L9hadt!lQd}3Y{aRSF8j`M= z{9Aa&W^HA0VepQu+%L3*092*{REfZVfPj%4EQ=>HUd-6QZVQvov%PsevdAzTNDPaR5oP-jD(FG!U2JS&%?XNT6W>aCtcx~o+Qj3$)a$)$ zbDK?S7tSoHL9C{>wjXtxsH$s=!|cFRRx8q{V^UL7KcyBq3{h28SBo-ddh}J0_2+SU z@)bm(? zW4N}pqHTW4v)0K)N!J>7wYNXv;Zb~EQYl?0YnL;9SXh{_?N|S0MKyjI0x*vBl!!0Q{Jg0U#>{PW1&Fw-(0-ry+Ts-x5nJ5t%O9U` z7c1~SViC8!`2B6f$+LS&lhe~xKR!He^gME^(=C(bN1pA~tG5X-fqF+q$}lL{esj|{ z9_sa(->`x&{UbC?z9psdW0e7V6(cAEWK{`mD2AcVwJUP}oH3tGr3Mu=Fcy z3izG=diczY)BC5#44lq(X6VrYgDIq+&y@Re8C|IJr4e!Eu06fd^tAt1R(TytnX;#h z{23J$dWv-PX?b!_p5oiLw-PrJh=#Ev=Fub5+7O13?!5Fp2Is~0?Aa3(9Lz0t>~+%R z_Ye0{t)0)*XV6SiIRSucVs4(8>!qu^8x<|GOg6>z(j@|9R&;hUf&mbKT-*EUkD;M{ zpn`=TkL`9aGgEP_cgG!dJh-;>OHe{Wg8ur`C|;eA8|5g04@E3@$}ElSOJr&&u`t5n_c$8 z`0D)lmqROGbIcU;?0!VG158b=EIN`FjrTM@+QTIxLJzjbH#;&F(l;FEvx}dGRbq#+ z*KAaSjggVj6zXtwSJ&?4`3c63EaUPEna>`gjxvi~4Z`upt0yS9{rMo?KQ@*T#q3Em z)^hJsi=KG-P@!vm#}jS%`S^s)8|Wn@B~7OWYDv>lJlM_r`By({jJc1EjR~P-Z)KCr z?OJkoadCMlX2XnHdrPov;pFDBSZVj2>#Or|EoX0p^kF&e7PX>pX=(9y-o`9aflF+d zq^jfIpJ!Xx>46=b?YZdi_R`^+y1D@@TAnx>T5a1g%IMpx6*~Xf9>+OkKLMPuiCDFr zUI5NKW?15K*4*oR%Oi2Sr#mi@DZR|JG&Q|}qWIjX#KkH~>!|mNgo=t%nX{d>btumD zISAMS?teff)$-hU^@}t2)c}t<=Z{_MBZ0;R0FYf7>&iJb#G#84?g=!UqWAhBfazXO zSsAxUjwIDs{jq=6S519AIjPv`=Z&_uw!`!MTjlljnHBEr;a8%#3L@g`OLF@CRnKQz z>)reJ?|-7?ThK{VVsQG|BA>ebnk5@xG68p@Swt*+&psAocO-z=kG^ghUMpZ&sP=uX zJzb|BML@`^Eya}FG+_33j{&!KJL+S5cjsNh!fRx4-b<+?&S;Rrz%!jxv@1h-wq2dY znVFefA570wT0)|rUG|eVlU2LkMdNvF$&d6Y=`0H&g<$*DfPqYbJ3Yf9OH6xg$XdJf zh^X^o3+bn0k4H*=q3=l3=1pu$zSm;Ho7ByA%)=>|>Fzg=U;GP8OH_>xMTS2mMwbAy z2!>2ryeU2WK1MTbaCdxUB>l>S&pI8254K`9P&yUE$}!L%qIVL_WUwymRDC#GyAuD) zU^HX(Ovgiq4y}OISv`n>Fv27J+AnG15M+S&k3^Moe}8{>zxMYZd=4isud?pk)D-uW 
zU0`0GC>A!4zE1bN=K1rx(7g_A{kU7!TN>p0e3}`9MQ>TDP~kp7!8=~vR6%4HIS+5< zJ)f~7@deQ6iPW-%4>3n50Vf%wTU%Ntt7&8{5`vY8t)TC{vXGN4E)Q>%jMBO zqqm;v7c@38F^Lzkt>u;;{eo_bg7C?qQLE(oyntIumc)5M0)gTkD6W60YlI2`o-qGtm%+@~)o#^Wucg?{+ zbhO%+$0#*`P=$=me^YK)?45688`0Sn(E64t)Vjj@tuc9{{mZ zo%hs|E?a6?^KMFq&_gXQE*^HrENCYkov`P^HN$dmBi$?J<^jYSYfE&Uk$-&CJM@)V z{dS)*?Ms7O92ho->&wa zwmgV974`Lsx&mLH3o6;#zO<;*aZh^B^CYSL>sMA_cfr}5x;yNGOBzL&KRi-etUa4k z7JP6*sC72gtB<6`)Of&@=N)A>-MO?qm!zFXBRj0$x?jH@8f|83n(N_d zB#C;B%g39h)0S^zKy3Ex{QM5HMv-dkp2@7)f^QzmT3Kn6S0rhhe?eMlxU*W(VkZ0P z8bL|1rm7qunzTJlO|cFk*Hcbg)l+LM@J&xo*B@3NQ-1N{MM|Er$*;~VLJ1-x*}a+C zL?G|5J0?K{qk~|zS3*Kp_X=7CYI15%y?ZR=)xGAzB&}2p`N*6Takp>ZmYb(kYIvmG zJ%2wYCT8Tw3z3Ors1#ccMD5?XWf!0Bv#E^Xc>0&xv5}Gft;y%ChYGc1bjcqE&Der# z#M8^+jpgqjJgemO=Ef?uLN_4*;2$1J*qbsLq`hx%*LK<`v3Ku@8Ml6?=;Lc#$GvzB z)0^Ac=&yWy(kv0xhQ6?O^zI(C4?tG!ZUIQVvTFrP*JHJek&k>4dJ&jhSkmT>aA|8x$5sad({}+pf2)R)*b1(&2m1q+??= zzkC9XC;?>vOE!z1c`EA@j<+WVwdHq|+7@iukXCh~>>ij`NVM`112qFvQ&a!uPb9V1 zm(S*vR~YE&5sGis<~D#T>?Jy!Y`gw4z1_$2ky;KutFw&ILJwNmL6FG?4$IRx%e?c# zuRxzn{+*e5cY;#lO=Yt^{}M*?3CidPHzI2(Jm$ty zlcTvb-;Zi)?201_!Hv&{V^lJW2SdEJoIS#Y;;6RyJXysJ3Thtt(mxL$?)F@nsy}&5 zhr=gu0SC~)fBm?;8ik549rKqJqQBbqbwB`-dI3OT?+-+QQ*`|AV2vkI?3 zFR96|6EF=W(c1HNyogOVJ?L}Q#6VX0+Ri9R&gwd*3b_}qxP^55Q1!~+*K0}F;nx~H z-?yl(4BBl@ctMq1)&L3v#4ge;D=Nx>VoetfDOAg4$!B?79D*YWKjO`~&-b9w9*Rze zQAvJHBv1FJbREIxjuDZOTBb%%s<`F90NUE_TVR**3?dZ8#!QIfe}su&Eq<3{+anoP z7I}UnPX?e7*{CVG8ywqpBw!GI;Zyz4-0D_apvSqd*4OFXG}<&x}TT* zhK2y?s+tZSJQyD{v~=FBPSSVhl_z4>9V)pQzvtT^5ClbwF0x?*5!GkG6B>2-$g+bx z0~1p)t_7exXmeMwGw;pvS~b3`)0Se0x2M4eKNTsnt=a+#fW_k6_!r4D z_Aef4sN^Wf_cMo*t_?LXWLzlK7imp>gROH`#6!1rNftYoAyHJ_IpGIbgl_glzY%J` zoE%|hpfa%8cnC6+^Ye-jq-XB&ysNA{5qUDbaK;!^wdy@TY2-^mmcZkK@ot(bQ;nA_ z!P*GvuFB!+RX%tc=YI?`?+li^0bPAdBRG&|nqXSPyZeXaMGdfm_0s75yW=HTPP8@& z)qZ{!AQj)Ow%|oZ+QuT*HaeAO13LCl*er0w@n@{-w@Mn>6}7SVG^gsUkHtMV{uPpt z#UT47IOu{R0J1Eh{=n@i6ae*x3&0I|*0CEFE7NW~i;InA zBY1z_yr>oolm&G9sqgRTOk$z&5_+=3v#q5Xg&jxl8e6lyouJzzpkG#&AhW2I 
zKR9dxU;yys9`xdbR+pY4M@oP-hUf!3&jjnyK?T}JwUhm*!^WwpsRK`yr@WHSP>sP* z@?0J-sG82TY<1F*ExHybCM>M}Q;L;Ip!(stb8=ak5UHCjc;Nj&*Na>%1d{ayy;l?8 z3+WP)y8hWMiwjd`iQzujFLI4GU-IEt-5jCuh9V3$IthOu0Y_og>HldLPIh5=nK~f` zuYhyJQZoucSaICI7RdF zj?y=8E&@S7IwCd1jmybp?IYIFM(l7ZU|imxB{?s-KYH@Jv9Hg4u_HKnq?NCNqT)VD zNoJjwhLyzHy7YF3=Tt4DK9lbwNGPbYAYYeSYhrzLToXv=(N3#qXi!QX<_aF*)6>z>frQ2ms#INieO~U4xw-jXVPV?m z&z}=Y?;{b*8?~W~>8q|85^z0ED=AgsRoA}1rNwntoH=GbGt#VdvqUQf%X1UWp4$*g zUL+@*4b%ifXg(WZ84Ct}3zm(xu5DNL^M>*)`Nh%nEdRFmXmPbA6J_ahWse}5KDAhc zdt*H__v|B)N=TByfLN6PW5Y*smgXk} zm9FHuPF%=SFjA0{3u<{y(9YD?tc;9@m@DUhmrR(#-m2YG0E`Ocw_PEeRnOL0uEgo* z28h!84;FhD z&;9P`{5>liAtXJ8U`S`hTW@uQXPqTnV-zjgwtV z#lk8%B38nl6G||_GSR_2!3QOrN9lnCNHBJFb#*!R?j=JttA)%ccO>T4R;Kvo-k7Yi zB_*!)mKSH}2>xv~VI2e~n{act+4R0naJj*AkCp1xbYM}r*d4%7i zU+W=TT87v;rv~fI-aiz!Vk>|>3$^Ch z!E;YYhG(YHq>dPe+FdeQzU?na7 z9D#?b`{O#Ry!zHxug@9S6Y&lY?ZP@Twc?ZW@;Ny;+RmHBACDm<#ApBh^!${3c%6WX_E~Y8usvlCLm@3; zO_r@mOjoxV)%Hyg;@y#pyyO>tnFhLL;-(vICdvA&vBGX)5Sx{tjS$Nw3=NYp%Cai& zXC|5}W5xD^n+Oud2jhHJ_8TM;%sIaj$*t+%XNQ$D1qCx6J2v~(tCLz<^r}y#gVFJgx&#j#pa2CT z46;1^+G3S3vNCEeUa@12wst`&58d0RWIJ%AY#G-{<@ zlaPBcF~I~rZUisYe3Y1e7B-m>;H~#W_xob`;#y|lVx5p!dz-{P8 z*k)C55IhT_>^{Ewy1F#&`?#p9RZU1_j6lu`UQW@RIB87f`WiLGz+t2tRjBmxx<9qac3LWynud_L?(niFYLaMA+8BV?F zpzghWw{a+5ulAqT8kNJY>y2#7%Ia|=_4gv0cjr1m>Y#VkHAM0D4-elG|K>(cBsp4> zX@I3d2zfNOmr(gdlk)np<#BeD+kxC8_cY77xQMAh)MVYEk$#^4{gb1b36hB*qw$}k zLDb|S1d7@wJip&KB-!*`Er#o|!-fl`$xCzNp^%fI)zTAS5R0SvrI8eT7|IgY35E7F z?Eo)m6;nRz^FI1Ej}gB3AD=!|vkM)+bqnh#AN4EeRO_r~((xCPv>K}P7(p>K21IbHO z_gjITaSq!eE?Yu9m+3Yd@+bBu@`6G?3o+GAmb8shbqdQa0o+K_j&_>~VQMZ}c==6#YkHv-YOX?wM z1H@4wE>j^j@#ZFqXVtfF7Yep%oq}#nw3Rjquc-kFndPw~84%cqL}}UC??C703R9tM zSX+Gb^l5mL%(6#$$0iq#I&JN-yZY>Bvid1#lm`2+)gK z{kpAwAEyQ*(aU&DW$!}l_(s!~7{4JMm*c?7%1X#rJE*U2P}sO`R-BVv)f$rt0e@43 z%}(}q-4G6G^`GG;`=2cb2ptEYX){11ZaD4k-9cy%sZmCIv0eZC_@Ei1rx*q9ZML#7 z1-e)>P#fB=RJx2tP36S^JzV991XM+)V&E-~0|(@Hi!32OL~!b^SLVtwvdd*_OV&h( z?6yd%A`h?wOYDxW>xi zG^-gMOYU#<9dp}k!O@6KPcAp|e#vcz8%+b7al_8L5V_ 
zN=S(6$1`AS=oVhvi`EhXm97t6mM~F&G{p(Nxi#c)6y+fZxZ_-^Sgg!LfJn&n@y?(c zr{KmzIF^J6$p($A&|{Y8eAb1qJ;yvYi&oQHH%A@FGgVdH4nuB#+ONxYc6UHb-f`HC zGKfLRPh{aG1T$=ll;mBOj9IfG_L)Gaq0vW<98vb-KjxPHZCs|b} z;vQ>tM4T5Pc8}FOJlHL-+1lYT@p)dj?&enU#`2jB8gt?BFP5;22HwC`(J$iymHU-?5r)D1{CCMDU~LdY$?si~MEFn{Xg zl@E^++cRjEz7GurpbVU+&o1#(8*I03us7$5Y6xEtsKCQ^Wi|sU`}b2Nqk26{#%!8zffP@rR1~WC8jYEc^;s z9M_iTYzPwob+oR~e$fAf3=D)G*kA2}Uh%cnAL@GpXZ>Hnr9ZtNiYI}ao1dSeB=UfY;jERo{9H;>@mqjf2nh;= zQ*%jG9nT9G&m{vxS0?b1cLfVM7!7xi{-;%S5sua4XXdm}e&RKG=a#Ew?T2sip|Q z3uQWTxewK=N)m;QIA-V3Htu%e#`}S>X`b=nu-~(wn>=s>G6d54!&pY_+VU<8wP{{oaElc0>K|lb-F0>vcJ23fKPV2jj z1QsFEg(gMNH5Z_hp{j2cvHQlFpPvt~6M%S+r0eh7D5oYv4N)gKq{PJ;k6eDw^C^C0s~ahWDRKfgXywlvDyEP8GDBO8JTuevHlUo5SD@9TRur}70zJ~*1%xY4{l z(?Fb%sTQOnfUj`Q^k3}CuFt=0$rwl5F$KFNu92ak;YBMeDu@9vLN_DtSOX-dUSMlv zvW&l6L^qdzI*2S9Q9H=KQ(l5RX%KiNm8rH|F`)54M87Esg0zx-6ftrzPy8%nCAG>g8OOFLMJPZ7suxHOWnoo;ytRE~qnDDs`r$*Df`6YH< z+SlI;Q-kr356g!w38(>-tUy*FY~`pu$|@=~2!|2wE$>cK%2RKuJ77xDzI|>ld zb>aoTL6B(X+V)H!jP>YF*_FwgDd86C#mJ|ktMg>KBHzDx1$;^|m5J&5A0d zZ|!j{I17wRTr#=1max4dAF@@nw=>|JXlMI<7Ka$4d{`0^5(w9yx)Fto`*3KG0R5iZ zmoHs<(b=;1g-i$OAZMvnSN2}yi2`DTkZ6IIEBJ>!8*#-D$u#6EkZtB}#m<5&c>bkf zFq&bA?e92Qp9>=TB^fXkkiNMEXtu#E>USDqG4bP6$*YHjg-@UgC`-G*x68Kf6nG-# zx*eG{37@q$Dci63A*e}2)ChG5&2*oVfz}0rB*N4sz0F!}2axDr5*8KJX4cl-L;uT_ zD``V(63WRSGQuY=UD{9T10b#;tnpPD&$+P}vB$~xN=mLr+~MBz>Th+=&`dhb6J=Tx zMEg+2t2o;hMQ78IlSQDCZm3uG;JPcgB885_;cBk6wY3z44TIM-!;>YcjSA6=8zgYy zVCX5js4lmrNpW`GC9p1$1%(AgU?`JS9b+#Fthudwq;&_)ru93+8r(8@y#pU8h8^VO zJXPoY%nE9(ATpIYvf0up%%u2teWz+G1#yARZSI|p;?XADVj>qx+QGsS3NBpz<%@yk z7hDnpk&0(HAS)VAR-}^AV-9T;Z-QTmVx4%88{Lz^8v!_3m`_S?ROu;?aV5QbiQwvq z?Ma@9h~ABg3PARB%P(Tn0uF6Ozt&;?s(ueJGM4gAx>joZy!RP|m+gUjP4whmR|QbVo3pdWQHdXZ2j36|c@?y6j*RQm--A1vTCp9T57K;houy~=tX^b*EFD@-vy>{ne zh%%*mn^mOw(DYH}01+d|^+dk^19OrYh7sK6dt9fWt3?-P4osY|&vic|X7`Qn5d~%e zCfjw4x!YSaPZOe;_o~NV<6a@2obaMJqLf6-avH3STzS6ay>uHim^$(_XvDeN`(iUh zSJ^Sj^DmK#B+`_G$@tnTw6QGVwEdTi48F{~%;UIy@{JJPdB`h5dY>~aQ`QQLi`Nrc 
zpwId$<8g29DCpqoY}byn3N2xwPXV>2dxdXc_g;8z;SMmd>B}JmNw^?sMe`dR%vgq$ zP0Sl4>ODQ6mw#0+;xkjO`OlU_eKtcaW^`6s3g337Plm^hCk|;N&ntJ6(W)aORXr{X zo$e%Qw_1;^;EH!haPSG>?sge(Ka{_Va1X0qVUnTyGdtD1Wx95H(p1qI*@qh*1((VGJgoNPMkcs_xKs_W;XSDp}fK-~zEwIfl8n|oZx9#Kz% z@ezS%xSy3ca5>8Y5|PQ|p7&6L0y5H)7@BRkFrRTJ*VuQOOWG(|fko(|uSE@Z_Hy2% zkSuBLj?c~C>%s=k@*Cae;1>+AOVP)Ogvhcvj$29G8*3(^VE(2Uq@2I4ZdK;H!_mu4 zPx*_t=+jQ>H|fy6oUIZs=I7_1bvP9h^!8IWyROI_c_Sr+#dLPvJt*aomm7FH*aX7| zuIuIODHfw|8|GKjQd7z4dDJ1Q^TaDvqIJ?))U;-^$|I38xxXPcBfWO-?(B^VQ`6Hs z`K>j?^`@BfF|ML;Uyh!`+EyV`DWSKP^4|i~4BGz`xCwx*%uriWJOz zX0d34=yM5+XDP=cO^r{sI#q{!(P>-r7a)>|=y0@|Qc#y=zzZDMM@&a}x|dT^{C1lE ze4O6>?bEk&A+u4`y~=y5>*znhsD)mEz#=b^Dm3ZAt|SZ-A|~OK@%5`AAA;(+FOfFC zW^doDpxGEZz^-fL8ACR$=xFz~P@Rw+*ntBBkQ7l5Np$_@u9y%hIx2*0AQ+b1&s*q} zNiO3|HkQ3*$B^I+fsC>FSF!8F%h}=dV1NcH%Q0*~$PqX2F1VatOEZ<|`%`_DgG=Bc9C-Qg-_0{6@t3>_fU8=`6TF6j!tI@0eBtM=hQcRjeJG_V>O1$dN zAliMCpRdGO=6MBH;6_7GsDq#x5N-xahfq%lLG_*Q-|sA5)AqMMcKEP{h`xR+eY>1o z6y~iEX3Ac{qIO^M8>($;q5;{aixxC2e5RGI83k|1@Qs^#uB!b(PrX#hTLZU;HiX3S zJ}UFxyLWHAxpv-NLF2NFW?{0vggY`-s6&VN;w0=(PUOWY6KU1A4GmQUCC`xZnBht~ z7Z(oIn|s@g`V6AV4QaG}q|r zi>_~Ad0;ZBs|9kgYm3-I!UzcDh3gO%<>67sq->}P#N^b81T%P9G-IvjS)Y)p$ZaBq z4dR>6y8ixJ2}wg#=HtBs%5pv;*v-Q3)0b>SaC%*!Bo_{ibI zJH2Ql8;5j@91>NVQmH{0aT@nj)$9$7oH(4u!7wnh6_>rzVPMC-s;X7wGR{l{&6SjA z0hR|F1>$y?MP|c)j!KXiGmLJ4~Qc z1SF-26~*3*l;4vDI_Dfyx8g>$-+8lhV%1}_t~ z7^-KyGn94<;?A;qI zS0!UZQAJEz_yhi%c4X+qJ4?BYRUmQNHY$OPlnHbT3{jL=tb($Ju*Pcd0Rr%cL_IuiypwL;PvJ5^;XT$txfA|^Rk#Aq-rD-H(U4&V=@3&ySBNc7!(K~1ozD03lOBG zG|MrliMUGg8XEVqKi1bLcg19_t*ylmq4n$na)G69FLmU|YtFdPGWs;wg#|@}aWcTtXyxW;B499`((8T7w#tn)W z9P9P`vu$8tz#qjdLdzZLI${J3WZVRIAA=7j5&KwxgbAcX1dVlv2M6~ti_S~FhP(m! 
z#D8ddveM~~8S5oM*qz=9A8j{22(gz))?u%t2r|^)JOz>8cBkWj%MR_cf`~7UKJ#20dT`*|%597TpxDbT zi`stOO>lmkujrCt=YHQSr(q%m;tUzk-l~d`+_=qjWE0M?-K;UB-n!J(@E63DtVze$ zmp@#*awU1LOinHk4R_#gu*wpvObkXJkL-0S5kz$dz&0oHh zbUE%u>}OP#eDPWf zGrjFWbaawS-07b3AuBEtoW|z(os(EvBh^1&IbIawj<>Qlz~x zoguR_c?#n5RyfBROMF~>SfPZ83=w}{G2=yyWnkKa!F@Z2kJ>F{Mp81d{P%2Um3a1Q z?hil^5oAs*UuO53QIE(gncXWRb3yCniC71{OA*H~PXC_lxU5sN~60r-)HG z46tng9wv4qD=X_o6B9q=7yJVPxShkn@|`8A(>X_bO2m=7A?GL$dzTf*IbN7 zF}F5hZmv9~`dVmz%5o-6lE=#HAZc=PvVzDN!1|&pzk}28T>6;|FH*CpGeMIPaow^G z>`3`oqSRV@3V?andO!2%qPZlSEdm}@iZvB-IGcV5!dsPdR>}*HLZ~UaB`*A!;Pi)m zM_N452joV?l@&p%BB17jit{Gk%FoHE#5R)qIiY<9UM2bsA!A?&>=`;60HI%ICjaz5 zbEO!|>R$W! zGR6xUN^$bUpfyHVwjxTxJ-2|F4c#)&qeS9%qvzn@pfJX$5Efo8g0m-L96wu=tAV_AKF||`*XEQXp!qjmzjjQ$^BCqV zp~#ek;Xnu%>E^aqBcX9|>=_vu{w#*r2?Tw2_Wp+R?uS>^vFPqp3!A)rZHbo%vwLJA z0Td;X(277Tp}nODWITr>yP2ArH@_|tPeb?tQPh-DnVnvin)*(-=tiv6lRFCgCeC54 z63!CpX*cH)dLe^ALBO>b{M`>@9BZI23bjI zXbvm=?>OpBKL&Yu`QXb#jJ1$)aAQvXHPMI!@)WMrU(7|O4Poj)^ci9_3c-=ap^37C za9IwnU^kkTMp-{=F`=^HJbW79rHTNGd(I#&W26!%!v?2N{GqBW>CN)*lEQ2K8wm9s zepWp4Emq@KU>^~&6`U_%hIi5G6cWbAo;-OH*@1Yh@wgcRc7w=Eog(m2v0mTz@BXy= z6#B4Rt$dz)#Nq)4Avai{8xA>q!)1v_Qgg^bzzCaqJIt>DBmm;koe$Vfz~6x$J^&h| zWxYI2FbT}xzply6K{|IgL^NV_R*WM_w_dS_DRC$jTbRf*jE#>6IODt*km19OSz37@ z!_Cg(exBWx+ez57CQDa1<98B%7RM0amnl(_C(&F?tNgd5;BzCsAUa0mA0*r+pXpcj z5)Uo4*9>YHETubS7@ezwdqKyF9~p4$i|IM8|R*-VR0^hyfImFSKAd zsLjt#kQZXkweq#I=ufua? 
z{7}3uUAa=-(y|k|GNM-z^>H_oy|jk?Q#|Sb35_k>I=B>7s_xvxkSH?O7`oo&6^@T2 z#%jzT^{DNEK!Z9KL_FREEeRHskg%{HNE0z-4aFQ>{#Qv%EvDNZ-gS7_)5C^3q5Ugx z6Xb?slS1!R$!|XEUMB8@Jj`(I5NFzEWHJiTH>9B3?t;q%cuv3(Ofve(aT4!~M}aib z2fcs$mYD4&E=O0FE{I_Tsvg2lDH@4X#K0T4qviPO{`>ECAliZrvhV0on1D>k-4JCI z@6kUrWPW#|2P&5d!YftNIJcvPFK&e1zFmX3EiHy-@pzH0FJBUn5Fk8A2#89~?{BDR zo_i`y2;dmdh=`5dhJugha6E(RPE6heb9alXVRt^3jVDA)(lq5O6pFN(6P%ZH5l=*-n*`a!@wvb`xWq(d7%h^QdjxKB5@Zto z(-saNt>x8FoO2fAq%3egH{CT>oCQDYbPNsR z$$#>M)oHw&;V-dAKIQ&_4LlsxEHBZ7T$pW|9{3+ChI)@Df^Db@>`^jtCDVTlcR>V|4dezJH2?gwx2~9V^Z*E(8#r z`O3+&4Ss4+5RWfzM}LoNF9iucFfrHbVM`lvnacGoD2!&(ws<8Q6y~?@-jR`Je)q5- zb%TG$m$uv(<<2NRU~gP>G7?6mcSAG%^!JRwAz?bI>pn#;5USry7bg$@KOYu^6e zpzf1gnGGhAT#QC9Dc%m00R zobgH_M|^q7=q-QO9%$f4WR5?Gy zN9>JK`ui#1Af(CpEX9A`(R*q>>wn&ycpLm~_k3|jnYnSR(B!eq|M{2ypBE)JmL1eQ zPWSgD|Meg3ov=>+^_RbY{$Kcsdg{iF8#(sxmm42Hb@C)4PS2`|UHGpr3{K+~=D?-LvHTj2YbPef#l&_Y0)_Dk z!TRWbZZq-jr5FAdCeAP{10;a5YdEq^C&dv3B6g{T@V^cKg0Ua5UPZGzxFcTOyHT-q z?LY4kB}?o~r3(FIIeA{5&F}jJ|NUzb-UMdV+R!0+faj^{jzY*sL|zroY*5d$(guRK zRcZY1MHcU18c#eCjj8xX-}w_4L4V44t{icw+oydVwH*^w@)))wCZK#Edjy1qZS&Qc zxR0F|JV&I#AdQePl>pbN0-_i(G}HuNo|rg}{D{QY7ARz=_uq4H5C6OF|NbY6g1A;z zT-?MlYdeACAz>BD^~qq;QAsJJ!3!C_^f~9J|2)|S^e(ugq@*N$D{1~`D;6s<0OX}R7uf&xRqVTBW!(>Sj5Stc#FPgJB$ zeyG|1eDTl{cTVKl?u`OZe&^$%6olFUCKiZxUjZutbOvOS)>}+m{EE_-!yY}Ck3w*P z8cjw#j7&-CBt)6We559}0`LXk3^)V@vuey_yjOHBV=;!^ZSx8U1#q1xNqz?1Xf!D7%GK=7NIdpI1`yf5S=?KCZQeMa@%0!t_Ju_dbtD2(v#3?_OJH6zcyP5zse%>yL;t?tQ7pvmX68xj z_z7+wy1qK$Q~TT6)RdEI^F&t;@$8Uz8{)l)2^SX%Sw_Ccc=p+Cooc_W56IA#Wz0@I`^?29@b&8>0ONfF z1Ds2r-@Q9SKf@=mx(KfAUk}weqj_urh`S$y5+|P(POb;|s46RyKl3=Q%YAT9ARwRn4IuC(B{&vR|?pofgiOcOgh8aldhZwj+ahkx%W zo(d?x5z+J;<3n&i_HE22rEJ|Wj&LPRYr#1Qja048oA9u+ozV9*yDx)sg0k_DzoU@F ztb5wx*w&lRn;cY>mFZgecsDsA>v0OxyeD&4K_iKW$ApE);brtMoLpSa7nNd6tAXR? 
zojZ3L#+zcLl3fgZ@7)X@-E=_vCmva$Z-eKVAk^y{6&2ND20NPo!q-QL0K-T0f5vKc)1Uln zEeDbQuH`vo2nT>gwDo1Ld}IZ}1OFx`#ud#zcXe@P^76Q8>7tns^QTy^1hV5P(0g_1 zOt7WXhYueDE>1$qBc8<2JO~Z-?^DXc(mub%<<^h;piA%d-r+r7g-X$~L@~H16KJ$GKE=QBQkS4bnL3 z9>>w6MQ`;&0#d0s88;Rx5gB!;DtN|6%2xeb{{9!yoDS@%2LmB0$*WgYkdmV|M1mvW zfn^pEuOq1Ev>7b4Uxev}gXzg98J=J11diV8d%g=yOYXIday-wtLw`&O3J1f74sDf6 z!^td^DW!UMC}Knod>J7^{lXCuQg?zyqZ@TIzxHpM-x6hbh|yF-gv9wi;)Ne^>8@(8(y07R$o zEXQ__iT@Pv+P@p?5gz;W2~Nh}3@{BRE+h+qc_YZqXjc~B<1;Q}ZG0ej%TNF|iYAkU zz=|W7U{pB4EFc(}l;FWqJDUu;Kffj2MP?Z~4r!6>syy**A0o#zU{CC7GqamuYW)DE zIvy)v@;4EhLnufHVksdDqAk`EA!$O;0!ISS27uUr7i44DJ@NP;QZxa?GsDI&8p6P6 zm<0wnu*uiP#wMV+2>nsh;|=a`q1Un)$`p^G*J8RGc_*Vv|m&o)HCi zUhI=6Azza?HfOy>TR?K@Jk@SS?OqAY2AY!!U+AqK;9j`OjOKyI#_>sK;c8>Bj`^Ca z>n(guX5oAAdC3o#c&W)Kzpe+tgNbafvV7#p{LMqYv$g@RNxB{Nz|83dE%C$T6Y7 zCoG)H{$JYb-@3>f#}*j+eO4cO2^1=M!THqKqb1?nSo=z}R5YZw z+6b#sM`snPA>pg0I0CU0HVOzNRk7dj?d<9j&NyzNsFG?vk(&C<1_~Xu!t`}-B5H=z z^3Jh|>E3rdAJC$usrCWS`ySSl%I7GoR-CZ<#U3nc1P?K{ZlLG6ATICqOaD_oJXnYm z70U3M8E5axw_8TR9(Dzfa;r}Po@K(xSE+BC|9g)2*OLl5f@kw)1}J?Jobl+r(IGYE z`)QA9-hsO2%F|*MA>sYcpo#7u#-XO}d0I;{T&qsCzP9#CR8&;99aJJYpQQ0N$%c~} zaqjzYoQktE-qHx_B4w8Au$$co5C4MS<-_)u{C(MODH$_BUIgiObCV%x#^j^}zTjYI zu0`o01V*Kf^1F83te*y4(mxEqj^G;*!U%W6(YXwa6v>b08tUGDjmCOWMIG>tQ=zvwygpX%Mj5fNbP)v z=h5-POYviSR@OOAnA-}TJ&VfxqD+t20A+@6th8Rs=b7!ed|zjp-dr#{0cbmIp~re){xDi)jIL-afRRAVT2S zP5r8m<>qkqtPgg`r!Jp}SI*{Znvmoh;$vf(N`D|IQ(W0VlWPmWs%I)bT( zObS>zJ&60HO4Rm^xO=$WiGU*TefB~Xd4;{4#KAp&+%ob#LL2sDq(xl%T?aoCTj-)P zV*am3ul{}ecyUlJ0kpF=5?y})l55BpvviH*Q#XTzM$=^S$!_;n;U{@{z#ms$xNsq; z*B(?9iEG2EFj93y&2r82-d3&YfH>6Qkk4X)taax?&K1b3Y+pj*7AlSu^2JEnFhs-l(~__t2gsIiJNJI9DgmR z_yRdVk*_EayRWx_+gY*U-=_yikvvBA-y{iM0#o3wbKtc5KYVyFAy_=K+%b~x`n@+& z*K9`es*^N?P$FnOxS;)hbTkmiFfVdXNghZ@A6`w~S#|08SqL!Z;KlOt5 zt@aqwgC)~0|HTnmy}jmoz`4DM^Jr{b1+ih8lTaN-Hi0N&UxomLG(bd(JKf94%iAOg5r0dD9vcZUIUffC=<%6% zTv$+e_PbFL1kQtX6wbhBaQuQ5oOd=SH+K!49|>z4Z%g5pf1(1w)Z8z`rn{!1g4q%@_%yDZb9XIe9)TE(fb^pK__SJ8%3naAP24{;@ 
zC}7ZH>3WY&FU2}btkhnmwF?)JJ+x=tj6vvbUG2>5w`$fQzv_Cnre|!z1&!qRDi4YL zcaC^p18Gxgio#%xymlW*5f-xqB=@R9oy*J|780`d=R0t?N!xKZc~K#}%17JC3&|8B zje!6j^H{)U=LTmn@HG`l|KQtNqSyB?=RL;B%zOz>Z<1Ld>{S`W_`;mBjuyZ*`AJ|5 z#tY_=VFyXp?<_UglLL**{6@(GXgF%YZT!0b*)eI^ZDJy9-M>78KC#7_xHSA&*Llz7 z`;KeLcT*UDd9--Yw8sj*ZwPyGu4LlbtjXlFpQJYdjup8sUJ773j1MS^TkQRzOn`h$ zlRgxtRL#$c)`E^oav#Zdhx9ZEq{L>5(32G)$DM_o zr7p{~7RmtF<2wV+^iZwO=xB*vFD5_->|m;>XZqf6m|39UbM>&cTRmif-=F9rw9 zZT`I>TFR)T8f}CSr2!5{6%m@*_9)U3$DYcir=pqDbM;8cjWU5HUd!~H=EUXNoK55` zQy+tSyRv&=GV5Zw#e7-t-3?xo1`_-io7IOtD7^YLzX|`kT5QtN)RYF81@veU1)r91 zUmyuP!QSP*UfAta?LVjW8D{2G#yP^N7_zeZF;l2U+sR zq=9$4Y~lOVbO$JvO@||Wicj#V}IiPizxu(5io zr`?)P%LW|2zENCy(a<(%c}~j7fP)eJO6O9OV^dTs>#ED2jnyam`3P{P*B)zCV_YI4 zl&U+tIAJ89q9r0JqjO-R^tPt2xx7vC^ZT3AiC!ggAN!*WeYRi0nzsl(vZ%s)!+o-kO8Dr~olkJX;N~SiFYoN^8 zpcwqsRZ)&kPyCAUL@Pt%Av=+jZ9jILEz8c2WszugaDQE2w^z-a>4xN^>vy9=--mGC zKmGiN?Be41g5BCSfd>|R#W#I6H9d0rGNpL+O=cKhPnu3?XT@?=)rg57Ev$Go$xw=# znzSmr_r4EI0KLQmJs05G0Vxa)iajxXXxoh8j$J2Zd8~8&;lB@1lgS|N*`Ttrug=Mn zB;xXiJ$;0_8IzOQ>7frx>md3Q_QLin#^VtT3xX?vqRBSq>Ixx##KE)pH6~ z911tM>rKT)SMm=0F?f)rCA|#6@94Rb=q`BIC;#H&o|t;q`~~VV)w0**MeR%>4hmW z#b57wKFR*{?jc9qS%(@9;jz0PKD@quE0%k?IK%Q6`)l@Gux1@-KAm9n1viQRzFCAY z<&=)LJy*gF$yE_|Q`%-KR99=}++@A(t;=}OR5n%Tc4_YcCbE=u)Qt7Rs0QQe6-o#8 zx*qH~BJQ6ZbFWB|5ncM})5ENc^+6ULf@!y$@7#@^vVImKdU`0*k^l1Ls``jQs|#|m z6=|_lXKeW|r)9WbJl65c%&4HcW_M08U25l@vf3+FjaqbGfM2O7#0tWdo0nIP%AFL{ zcV!%@F$ZF8SpV2q+PNN5RDS=zN~aDl zw;2mW)nF}UbAp#j!CLS5kSVz-dE57D*+{O9c{9xd#Q@eu(f;s2?}{&0C*9PQ-^hq; zI{LFB_1c|HY~C{;ukELtUv~VSYL3a9g2^ALZQrR(Qx1%ppL_e~< zegZjaS%Vj=KziDB*w9d4rt253Mrr%Qo;5$c2Lc2NP@WNR_s<1Ejdtd3(Vt^It`FvIA={?Y`n#>s0fR2USkhcdPYI*wzL!^>hiR z=Wb(Dw70XTJ}%uXrWkTqtaW_uo}IOceHMLyTJ@{kSLc-ZPxqbNbMeyb$vtwst0kGE zZ>6|g(idNn5N;N5B(lFcoHn9cM)#oBgD7#G{LGirEe9EYu^wAajrgb&9W%(!J?3tU zmj1 zQMaV_V?vW7+qM1iv*&uCKWj4=%-$}k6q*`o{V6;;M&$mT`jw0|2J}I>nzYyQ&VIUo z`U8)EP2%U!6_gt46k)H(*Lim}NSk^Jp zoiWUOvTeuZXnwZs9|RpTP52mRbnisp)RX)Cs5UI>_PTqjS~a{`jXk?$6T&48>HV6& 
zh(71peRt7Ic@O;~r`9-6&ZyhI1uddIZjtxDI-Wn+ywF)D?EQ*MOVrrL*d)7C8s(!V zy;t*TS(HKEQYXdJYeUrA%TXj(9!2X*#a?C8K56WSj74HH!tZx5f%)QiV130 zNR%p|RQ`e*Vs!lbzuU}WMD~gd=*xJ~(&)gVBLR0!VkN8k$)wj#*|XNlDKou}UxZxr zTP~=^J`g<=8l0J=w^=MVF^@f6c(V3sDNcycFz1-;>C!dhg+G>C`I^`19thnQAH9=o z9BEd5HNk}`Im`^nQ*BY?EIfINHf@F*g0Hqm5+vo?O4F=0iJm*WJ4v!~Rjb z;iy+;7Uc_ul2B2AvQN&;RQw!Si_+K+Md(e1hkq*q^r^~VF|yC7qif#kHfaP8>^c}i zNRmVtw_+Ho#6EzT*F=hPfK6jOL-M)1z!)I;E&=>?^)1piL?NuVbcifiPA+2%pVEX~%w#>xP=q@GUQ(Lbz+><)Vw8vEAx?s)ai0Gw7 zcQZr|$Lnpl{hYqg-HHE!h0v0K;AG>w*CLai{3!VFKy?r02~UJm|I3U4x%Fp{cd*>P zXJc-nDL%1cuN(K;xzTYxpF11ChOLT=6kuc!p%2JAnT5VExr5`?kv8+K!v4{nL*I1? zs*6?{6Ha7E))K%?)LOuSxCvim_spz@mDHjz&RoHXg}fWBw)Jf{Hx{piASick=hn3s zj{cLX{C}ucZk|van=a%#sGJoFrqBq*Q|TkfSdoR%(3f?&sCrGugMQ=Ko5 zV~Oa1_qpcG{10YFdVe-|?z66|h^*OYdjbq3a$3k3D-sld#H(^x3`n>}Qr{K|c{2d8 z5CCryvzVwTMMXtdx=KJ_4UQZAEt#9vp|jFVK@~#qkl&-5P+t-1{*7sx!Ae9helO|K zWY>6feAQ2hg7qG>|NdGBg^UCg%w@T|vnG{+- zX)44D*C;6~tr+k=`O4RvxiYD`Md1Xht5SpeY$vC>UhStov2!X)(%-O^$HhxFxTCe6 zQ)5jc?^Q1pw;6k!?tn?PQ43 zbIXm8dm$i8yKJBXhT-GWtNV1 z;-EWP&N(HKeMeDmR_odhaV`2&W)Gyd-zZ8?5h?FYUi&_BKpFSoo85r6kRczhh~%1Y zXFmAZEm|uqbK5QH`g}{ZlG*JIXY9rRp^MfZ6H@p$Cb_C0a8* zW5PF8=0-_gzwDM&DX{uwYn#6N%w)~b;bATteG5JDCHV$!-YP#itn(B7(Aejkd8a_K zPN1wLVQ?6Jv#CoEcqIry21Z6|3K>BK`n1XF*?&dzV6OsisuanO1b)PT86x!rSZT9x zCzV&^GUJZ((tCtrEEdh}*R{3f>NmSbM(T_2OBLO>zl2k3^R4yaH=?;G4@Z*LFukYf z#!4MECT)#^j@xDBZOg^CZ&`dI>Gl2WFDDd&8|g++X|H?Ft6{98chvU9oMxg?lrr^& zmiz~wIaIO)&K@^yNwp4CRcWEEu717uw1!{lc>f8Zhlw{NTe(K~qG+yTTRT+!YG~6^ z-SbT6GZo50xZcYBd}jPd$4e*&5`)i-Oeqq{fz29oU1)(JN6c8YrAId*tWgF`OQOrC zuPJ>0Iw`-auBCy=8t6JJpg6h~`%BpOJ(_$NOp;rt7S7GAisXErCh_tH$|<`=h4Pu; zquZWY{FG>2DVkP0-m{?lR495tvv$j3dC}dHJDFv2&$gQjI^;|H1ZT+V+tu~!Gr!OC zWa)5zw5_=ybF+`Wa%!t{PRV%ebZe@!t`Sf+7L%8ZTH+dO_*&AfeQ#@JU_pf6@lSqV z%WK1|c(iu6OJ#ctB~@^Tey!eEa{78qIe@TYlWk`-~9 zJ${E!!ELUe^;@yELfrHkY%!TRyfi!W8e$QcN2}% zdsYmbP#RdS%>OMtrAP* z_3N~mE*~87B=?_&!{x5oht~-AWvlzV*|0jZE!z{aB-7(9%_R zz5v6C2dYu{3zQo{H9%j}fij!>{=EeDV@RR>5tr*Ob?5Io{-(?uH70y>_mBXzAdt{g 
zQ%K+qR6-469`Y=-&(njGUH3gbG0B0vtFl+h�=J+F~|zJ)nQ!I;*|EO!P=4XN>6H zLuwC>G@sPYyTK@QOJ($y-PbemJG*b+OKnY)dzPg2cEV@AQE0gnOKHQ#pOTN;d4R#a zxutpkwUd#VIrSEcF*UIOg4 zt38)o3otm?#q{gJf92=QM6(86SgV<-{a5P;nW2T2Gnu(|7aiJPcP+;{i^)@p11xW( z{=)B8W7&M(GN)B3W!1I!^=)pgW?%hhlh*$5qO7yfX~9#G)$85Wa_m}-EoEH46f!5z z^m%t3`7!aWtF!YSfEpxQZ^0w|+*9a7d;@q^1sA?cI61vaN%!>hj0pRJShJpIt}7vr zJ{E0$`|raNW_`P1QvxH9H$9^6hnnb|M=WlwS*-khA;~p-1x4vuW438>8*+0)*?sPA z*;b{ZtepC<$<|a*qV6k>Ik67}9Wa5U-+M+5#sR@ALo(+7ZW~}rbawu& zOOvTh-d_U10*aK7F(7GSvymsHdG6Vy*N@p2aQMCKtPoYC!}r$zWI%WJltF!mbIvyZ z4Bw?^3eZDDsQ>9)xdGRM&+42gO5L!?Bgj(uS$WXXVQr@WCSkjI{Qs3E8aW@n?JelK zooGYDHaA}o@0^iJf8DdJt&Nog;-OF3h#GU%Dyrkhhde6aJf>WW^ZN*>umZB5P!c={ zLl44t0eaxUY%nsN`f*c>=g-cUT1DV)#!bi@gKTpjpkD+!0xAhcmtO*{-=^VQ2}$wh zU%BT!674qZ%V_N+%;Pi*?b#^K=X&FZi~?zI^-gI`vQgeTJCEx0IEY-RY)- zYuY;xzphp4n;vE;`h}8=@3B|es$VPg`}E$?9oha=?9@FWD9uRFHAx&K>l~Z+U0D90 z@kscD$%V*gI(Zk;H?&8$B+P=khDk~s37YVjwuZ(cavd!b6u1xo`M#9$f&BCL&uwF6 z?*QSO5PFilZk-u;AR$XV=W*n};!2^_?H#sXcET1*Q2wXv4E;ZMDx1(n?e5!m5{?dj z9BD3)1NI`bI7#R2o#;4zL~208EWs|<>8pbTKmDe?COp@B_DpeT^-T;?FVbgD!b|?) 
zxTrK%6!MMnFh~|8bav$+HGI2Dz|1Cm`nj-hGmKCN(Du?3?EOvr<>Z^bFJ35;G(kLU zES@JFJP!SRS)gNZc;(HX{;TXYpb1f9R`(^5W`zC3UQQt{C9uCGZEf1a!(44`Yq}Rb zgG?m$318oT>1tP7%Smzs>_4k*E@(MmoRv2i9ldVwlSjSVvslT7&aBa*kj(w$!SjdQhS2;YME#=@qmY{;Q3nm92?1w!25{NR1KnM^WaSEbbOj%AX`H#)HFHaVbuLF*w-_fb| z7{`nd{~9OR*8iE(Gx?UK#QSV%Ke9mKM*}FRH&qbj?fr8^Y*!$)Gh##7G%L>DHD{Ef)zdGBMDVnKMXDx<57?01S z0lQ~zD}L8RqhZST8)G?HSn2`Zf?n!uHuAE=p|`*NS1{1WEr_ay=sxkjIscLyh7~C0Xqsvjb(jfP8)bMJR=jKYOym(+g(%!NH;Iz!* zRwkvAMj4k)99x~m9&NGvSk$9TU7(Ce3@fHKBVH#B~cHFb40#$Rr+x#{WT zZ<8#kFcveivBw(%!n$$nH9(BWSSHKS(Mu=g#|AH+v%a`>?#NpEef%PJ+^ZwJuO+Xg zz02TxrJG(*sB6x&1q38>cUj`!`Ccj*rOS@)XTv4*af zjKM;WIdfr<_JP3}UAS=l0QuF)wP<$8X0Hfg< z3(cGxA4l1fnSQa-deG3&u>aFqG_5ADdLN*b;+AYAOW!ISX0=y1xVmlHUVnqBT&wtO zKGBl0l)u#o!ZAf)PYIBN4L&kttDEb!1WF&AUsqedm+XGMffu7qo-!`1FDS*n@mWi> zCZn#L&Wj4Uuy>UHp*dpPnp|lD0vHb-I6y|QVgq~aq&N#%hv`Hfk{};G6^mSP;rN!4 z-##{Ho&BMWd6$5iYzpJzbMD|IuU?kOG~EL^#yZPwHxU*^q5)W8YTCQRh`Tb$xHK%S>b&fi z@>l!4^Y-2;t9!d=c$f%qn&^;mN#APsCjP0I;C!o!+XcC>-ayqiF`Kda4d&h+=h>w| zqqsP=j{E|2GS}+rhbw(N>vW~is`L5P!*^v?QTiq(vb488;UU;P|M)-s!ADcdToSzT z{Yd_$qF|}r16>8}=u2>T1kxuNiYSQ{NY^W|Asya7QmcRMfU{b--4IEos=k(djCRE0%Fb#g&w{ ztc&?5SAzHY`CIYqTq&{E@6s#Ydz?Ri_V@9GH%nazPx6K`uM+xMGP0OBSiW`im-!Qe zD4D6#V=LtG7+@&HhsY@2|ASh(h)h0H-30KqyDymq;;(CMdm zLNz}r|1lB}j-70K(4(lv+rPLE-Fh@7$Mxzy0*(q%;aDg$eSwn*^zcGCun6T(}EAK$EuzvHC(Jy2z8DpTencjZI{Lyeexnvv2O?wGhJd}!4%9ks)6 z>NdTYIdjCG>({d2ea9T6$g2lh-?AILQoX#}<+CddjDGzxl5)!R@uFcnXEVaM#lj{c z`bJQ=U`+`q4m9`1X67MJ#kXfmn6mvPG?UXio@pNk|A6B@rPU8j6bt-SjAP8aT2EwO&=#yI3HbrTsw zLW7v-39|DCv!u5ATlD;;W|GGeo`x{}-LE3>wWJ}@p`t(+sHMOK3Kte(NYs~tGa`e- z-(nCmOyOxgzfk5Mz4@1yHGzh(6?QFqZVd<7kAH zOXkcVgpCP!A+cg~Y(gT|TxV(No+0;TOD=#AYh|K5OiDr%b@0>hzAj9!R{yHXPiBSS znBo)N0RuT{|E;a9Q?8zDvbh#>$G>r>ebC3wP|IPecRR+b&l!05PSE@um@r(N;~4p8 zjedG@MJ?60bVEA%u@g!j+T!){A(esW9c89?xL!xgCfGGUa&jEe=4a|{D!N&JW3Ovt zX)og$nUhv2L$_bo*SFQUuV9(K{iCcwi+HYy<2m?+*S~-K>#h#_`q4&$qVElp+8|P% zAgta8b?SYMlo7}B&SDDLt4Xvr6kZj8kaEtAYMg`X$jUIGJsOk8tlKj~Vbm&tSa@ju 
zVE1$R8Rn>1{-re=KFS!G4C~6tZ^DyPH!~NKR&D=rWJLGf(#}Hng0E6bH%*lHzba-6 zIC4E=w8M$=!ti34tbEbqW*xFF@gBF+(!4C!DRSEekJAdwL_#{|Qp7p1Zsx!MR7@DZ z3>uFNGep=Qg`#t^n4`p?B_I&ysQ%%Ej|^!hW@hEPb7QT(dKA}V)%e2`x*muN5#?j( zR6?F)qn3so){9}8Hn zUqOe_!k-_FG6jU{j`CLI7i07t8hQ--Vh|HB(A| zywJ2F0jY3&RU&^3n4ZYdfo&G=?2J1hl3VY$?RTqrUj%}JGf%rsb)R_5xf~nR;NAq? z?Y(WHt$&RD%J-MjU(zDMML2|MQQ0m)*iB;kQo_RT+!1ME5!*G!c8U2Iq__D^7k%63 z=jLvH>`}hCMM8)MHLi+FCMSjGIz_}^3Q!rs68KJ?FC5dcL>wl}uC zL4>yW!H`W-dB~ifB!xEEprAFTLeYX&ZtNxN4hMnDxX(1poa4 zNNRQ1NRrzi$~U&LNz8$otN!-L<*0DUO#D;!H;Mom zG7=Cw#oVE0y?l$gJ=Fc=r-{K|ovULo3hfR;_8vR=Q2x6K#?Ge@r1%q@T9`VO;W8vhhp z`)ff;|GZOZ-2LLH^jlY;cJ&bW|G5a}^Gz6BRD4^&4W$LI&D7Jv!lhJ3P)(AEumYrN z5#JTbLV;WvFEVi(G-8G(f`Qhks-|wzg?5Y+xDE!ot*DPvyb7@r#HMt(&!h|{H|3L} zdV*6V^z}&jhm7RK%pDL<2vbaGJ{w!e9h}AVjwNzK`h6UOvZ6@~RRk+(@;A1V&-poU z+)`tO@*k0yYv`Zv%bR+C-3oUr9F(Ox^s%Nu2`!7)okufK<&JO3Y_R+n6V7EkFsrH042a30KZMc zH8tU;=0*%$RG`!>10+Gl57swVu)z*0cE0{=WP@;;1Y zHK-RZV_+b`A6?xLhw||7?Di+2GQ7g_PC#7_$)! ziP;Jqcn6?#5qFvY)9n9tU&J@tT97>@sf5gPb%xgYB^*q?Lxht=(heSq;S=6f4qq>4 z6FsMm|91bDE8LFcPxw{;kx_hUTOS6f@;AmZ8VXbt238XEjtz}<=b8eC$@bT;mqKJS z)S*Exbb71Y9%Wr;41*v80CZe1-VicJJf&yXU(a1=Ut!!?`-)+`f;H8q^^$)t68BYj zCE(dVFtYaYU#XIMgu>X*O-wKu5G<`hyf6jM4F&(nRlk}8j&CMAI`r5M z*3N&D2ot;?>^Q-^<|K#Q{ikw_tclS=ru+4h-le5%NLh zy#tGy!dlTk<$r!_p8Wci`k2HT8Bx(g6P!sJf$(Y=!7_}gM9N51Aam3HWby4!(zm^i zPqC(Fy%dKeCw*@f4XxGG-GKL$O2I2%{`UEucbxUflEBY#9HcKX5e?a?K>#MB^k~IT zElhv?x<^_0if?WPXNEQsvzF)R*x0@{!yUrV1a|$MNQ1_E5npPW^zh;O#~@ymc(3nf zzT%XnwRtV#BrhR}5#xb73>Ky@UW4_WhvAKNp6V2t1<~k8W+vbusAqnl>;L z07oc2FeGxBml8Z3f@Z6?Nh>i=eh!e}!!~bJmQNdE~!EB-(GRVswzj3n{01 z#hryK>wCmDZ+^4W3l^4jvQGC6?(i_6>&lwE<;?@ZYFVG_((?K5T^$_(bb2_Zk&*GT zz5Rsf2iC>!MjM}Y0NXv)uOAdKJ1zcGwk}SQn;bDHk4QC!T7?tygOBd|az!Z)Qm!|a zQdPCJuYe_FTQqP{F2NjRpj3df?OM(EG~Lvb*3IOq?q6L@g1H_5Ee0= zO|9+jn~i{rcsRpU=jAb6y>SK!j)3I2=de^LYdt+tI{D$z7$I5^jnIzo=lN5EZhwtu zMEA)=9*q@ga?!Gu;}L(D!=J)M+?&&^edl>Hq;~_$oQLKnnay+UY=7Rp))gLfk>@){ zWZl-rXVdgiKcts~(u)(v4SjnZB!>v|NS&0yltk5T_|S#5DBJVLoiI>xE7GJB4~pY* 
z@XqD$@qG65skY3@<0+O@X$iR-51xmWk~mdK$SAV4iSFp8T2hHS_EBV+b04`Ya~Th@ zx37SQ;R17P`h%xr30)t`@!q2xaRU?&${D4!R%!~g1jIfH;wSv7v%LJuEH793r}N$P zuaj0^m^Rrb<+9^q86iUOqp5eGccIa%O>SZ00waIWjm|jlRLc4WZq>MhfdM03+7ldX zY*A{V#_0~rC~$RSDVvncloUI0i+{d^={3#>&spxX2@V<}ka^fK3fXW`9Nsf$`a~7uX;6&O(mp-tk16NpXBZA@bOt3m*$(KRc zd{0x@5xW6dJYb|o+Oy(Hq(QR`Y}fZ%{JG;<3-ju)&ScVa^;8wMp<_MN0(*zpB3e z-4d?~F8)r8s4Z!zN7aIH;vqnUO5k3^dlYfe(l;O=HH;1fXf}x~CPC(dFsvA(?hxeO z_i)W*MZVXns2d{r^6g36RGn>1zUKVB-gK+pd$bq6b`SZ-ZlVmhm#g$)<|3zFzU{VK zF*dgJLJ?Vye0vZw8(!=3I4@5HxG2dmA&r}AgRIB5WF@_e7ZpfBi)1{?=G#_1Ne9^+ z{V+%?Mt1M5)YAwzC6&sx@bKgPHX$r7sPjWecxzwLfQf^nalu>zc6ca^w03%-z$~SP zJgf?yhHpW|L={-fRZ?JGr&ZTPZ zWx?;`gbRQt-qto<=FV){N=0P^;Y`;9OIJz?dO_KW*}6Ug@fcxh=3oHCpQsAHKk9#X zzCnJLlSFtFZ_m>Wg*|(OCCKx_h1#$4`*{_H>jQ0d|<;Cxoi^kV4x8 zo4x7xxGyH-V^B=nrCcj?>ug?;P)?wn%oYAf>NicTik zxZXWdn*MN4zh#Z#r`8=13xKKIgKc;#YB)e!nHCRYT`x@$SQT_Ms=o6zzscjwOTz9J|b5vryX(rPUYjMEP) zEn`Gf0NKJAxI`W1i{Bde_2ZIWKiRYLSXwIyE&0TqTT2WDKFwE9?9!+8w;hzBKg8d@TKo8jeH*=;aai$$sO|L8!+wxH` zlQYR}@e9V=kM}-%(scgr8ui|nWDmIo!3D4mtFITObvGEyf0nX0zbU#@GUo)dpNr+` z|298s@4b~rD3fVlrxjxkl^)%6(EMd*brOSv>+Urxm(UjMe7o38Ma9Kr`QE%CBjV#y z;O|%zJ~Imo7Z!#U7fVnMk`O4LK`cYOQ#h?EAPwS)s>yL4)Akv<{CXF1upB1)l#1m& zFdB$HPuK3-0x(jNv|C=Tk|od2Pm5Sml4yhH_-OYqM{_2w&t!1s?HG9E;4d*rLC>4@!Y2Kt1QgTc_k&+AcAzO zOxrWpGa#*+<2|ABf-G%Ka5-X%U}{|HdG*2nYB<{SzgaA~q+*od5rD1g(Np(lTL!CI z&u#n6J|6faF1;h%_U}8F*LbiY>}7-DhHWRk)IyoQxSn0*wWGc7mNNBRVHX?Al?gWA zJ8m-ZfzSA;cX9SvpD}&3E7{!oZILWN7GsY1<57{B@`wi}!Co+}mwd;~!-y&=OGQiy zBI^j5f*fnPQ!c<#KxhJJff^A{P;3h5v;B^<-YVnsQ`p|FQqlmbeK9_Na{C+c$55oan1c_Ao~FuWEL_n(7jPv={AN3Sv$& zgnsU?*}shX+TD|v-q%ud9^NWCQ`prVAn6yptSofm#f)>#U{jNGfTpktN<$K0hgMHr zl!05;Nx5gA3|bU2ATLqXPbAk9GZrz9WhWvLx!m8c0y?=7*(v}SB;}?aJ#QR6hcq}= zi!YJCw!orBA@dweFvQ!VJrACRo{PnQI9Q}sYn;)Z@CorWZ&lxA^Ejb~iH+@YM$UM4 zaPYzCl>!8FN6#QH-#a2YFrbJPNfUF#;xXj{@R8zyEyo@onuU+ETHob74tx3+dc~M> zf?YoGgeYcbd5z4cjfjo~)prRdJiUr{OX3=YW*EpjfR+o{Ce$S9{hn8VX4~lc*{v29 zuO&o|Zo#rWFtNg`({#+Q_=FVTu;65 
ztLbkS^c$+mLccA{XH7Zs$!C?852aH}*9(~S>uW@oZ1dF*s<5BI-nhA*$KJ_{-jQ#| zl8#OXtI@7>R9z6&G1F3Ec@cC9#I(KT24#**5rMp4}(UBZ&PBfXJOkT$~A0FqI(7D;!1^ z4Etl`OqAicMH^{yc?b3YoV&y>PYA>SOf@t%vC)#u zC&j!M4$hkDYHu_KI#pQ=roQiq+ZKeUhaJdH`dt5D!yVQw`$fkU(-~?9wycV%z>YYI zQWEoG=`rYd1%((>x}Kc53MBsR{lWXE*-XBEIDX8^nGzJ4_+z1ShGE8W`~PvukTF73{9W<-9K2B6D^`t$temN;N|+ljD~G6@GsHfEty37)ub&ptgp##B$rR z&ARSK(pphImi6h3dLCKwJZJ4wm%T>gJGUK8DvvbXk*)IP-kT?DiX2j_=kGFlUqY}A z40dD;%bD*VP&O2wMTN6+$Bv(I4^cxU(itI#r}$)1d9KGDDJv7HtB-$R|%NH*KK5hi^w*TDEAI?S7ZF(l-{r#5#g*DwjI)<;=xpPb3c-^5K zyWM(*MZa{;hKK%Tz%-8&2{Y#Ztp;RQo5nsKcZ(@zFU=P>z8&Y6+qN?sav$2w9xg() zwr(LtK3gBeD4-8s2L-z?*8C)3tA>`aTd{4n{mbh0ia!R$fS!(qOM$By6I z=ENvnaP4mBve%vTmA0o>OSzn7p5+kN)$Lk!co*lJjykFt#<4A8r`|p)n~p$`T(BYL z=3VAt0A=1H!Vo2X+t1gR{TqoRF$JiEp>qO%(MGhoAa??D*OQ0S<+4}!+O@SVy@s!D zi60k{@7{)82aN*B_QIih;+_o&wnFDtoR4fTlA-8va0k%+mly_BUk_SQwG(<)Bu3Tr zteHn6CO$;p;*jWto-hI%sDvDXTwnj*Az5x#f$0 zj^{hZk*_C<(SA3Jv&Kl@FJ%XWW5nRR_(DotIl) z%G&4Z_b^<{&(-!y-M%c-%29rwBzu2n+bin5cQh?OWC>N>UoGV%(lt0+@->Esp{n)- zo0yoVS62TGPyZ<_itIh6$$8YPo4#JVW{y(hj79gnukk*qbCHSaG&`@%r8U0lrTQGk znEQ)sqqxAs9y9-au4k-s8WJ9a@8>5kn~@O@kz^<_lbwZNFwg?4T72h{b)uy>V8o98 z^9$@q8lj^=LPQOXHiPXunx{c)y$+r{FR_8I^epqh;Bug(h%k0!J%fjYq1Ts5j@+ri z@$vL0kL}%Yrr+Lf7gV&XY@(ga+(H8yQT-%;pux`yb_tAs!lPRPyQ-8&7)luQ%ot+2 zBFwXJV)b3s+9mvcmHdq7H|r`g-O;|PdSa;123daK}_9(}|V8~Vf$0caqP${yZq2sh0 zVwiDBRiCZH0cRh^SRgC>=D8z{&!Dfh*p1IDbsXZIt_7CSqjzQ*4c$Aq3L8lP+%+;K>5 z{WJ0F`}r-K?jMM*;T$_BmqS^I2Q!9D=F&`m`k=O}C#?0AD5_qMnYuBu};2pP~dAln-%#i?co~|6ary5_ZHq%(D{k&sUXzzyqPKxV! z0s;cYPzP64)%u-^J8xx%cP<=c{ z{rNN-hp5o0qD2R7rSh3qwOZB=qYy9lZr9h<6_0rNuFgP|X@{e0i=O$@B9k9a_b#1# z|D?u@Yl|QQb3@uuUvVYo_%=q=3&*U~ejT@de9ZbB{ipIAjl|+b3-#_>N1ya}P2VYq zy&3Yp+x2?5A*Jg5wf%OKLh7#-pHgqWv#T^ewj=OTXxwV2P{uc@JeDn&w0)Zk#pgEu zptrTV4+X1{1)onw=-P%uu}jbEt-nGmtrGk}iD9`V?Y^DsJF3lk5}&U9W!tjcQApM! 
z+(x+c`Fyb3Y{Wg);MY(HoFAFSt}|f!6T*;0x;~sPN*UT?bzDSRk8Vjg!p5#LsWIh2 zzbm)8d=y|%Xb*yqs#c$t!0-%u=9u>JFJtft-?q9>LgaHPA zE4jnEM>`$X$t?ToC-%yeF?wyBLNk+3_xjJHI^6q{R?kUi`y0}nKX=BwK*~Y;`P1Ze z`ugjh8*&~wR8t;fD^T6Zd-8tc(t+{8y5SGe(9-Vt zG8rzS%p0N9*yGS(OsS&XYE$XXx9@X9P*ECHSUFX|nyVdE{N(Q1@cG1qu5~H*db@?E zyK}#>$QEJ#?#`;uPom@|F5da{b4o3Yea4hL1ASe-sQ-ttHxH+}ZQn;1r9lHFb7&Sq z2xTfNLK7JZA*3>sN*Pm1#w1gUC?!Kkgp6e@A|+$S$PkjTO#8go^1Q#j_p$%j>-Zku z<9VL9SnIRy;kwT29Ga5VK7Vf6%RTOo*-YOyfmysKmox8 z#il#l^Ul84;LvH8VpHA#Mi==U78yzVHzv0vBNK0|YLdx{wcf0oH-VNrewR$k3H6d# zyU&#!_c)6`Fp3C23Qatq_j2v?m)s)4r%!oOT*eC2ImW!BPudBee(IXK&(bMigg*Jz z*MSdg&gG6@_2?{;m+VpVFDcb3k4p{SdwhFlP?0iyL(`#-^Us-mOO$=frNr|Fth;Y9 zT#)5{aq_E9Ij^B(tK+V;KL=Il8eGO?6nJZ&aT)pF_Seu_SJ%HdK11gGrGsh8Li?V4 z`EDzHIrppOqty>OFFMk{o;0nR?b1s=>hj3w_%i>a9*1o z>%@jQYAAfGE}Y&ReSX1sG9du)&%?Kgo?UmDHlzukIwjQ5yqvrAa);ERy2p3bKPO#h zd2p3eL?GeI@rQjHXV{3#vsK zQ+n2`=GI$sDN;nXoYNk$2#lTQ%2XNSX;0kaydg}@IAy!w2BD`O^Fl^Sr^A+SH!yoHuycTgY5wnaL#|mQ2Tx6W7h@}b>1HC<%8agJJRFe$L|5tZ zyJ#tiwM>JMt=vpJczk}^OLoe|z2cgBzR2a1vzq)3AC5>bS=c^DHm(|Ccz%7=hQ~i* zenu5Vo&0N}lGV_7QY<|O?NDJ^$!Z0W%TBv$>~3AU9HaMe&rJKT4*dv*x~S8-!KkVl zLbdHe!iqB;BIZVxeiHb8`?2=^jlu64D-`9PI~p+soD~YblOpss&?ZXrhRV;u`p9f; z4WJa6$F*(1iM1!PjjI3&lN z4>NnvXRt9?2z&Sp7M<4MdgSrwoK3{*V+owQWa##Ias61gH-5tvhKB}p{Es)yKcAj) z+-Uc2YEg24-PN-gVAsUk={_kwow2qoJ)}@z>&>pP4d>?=cBwT~#QnCrR9y3Xslv|4 zY_nVGH~d$wX590OwKn#0WbEY=J2LVCEV{gQ5q_w=n0{^Fd(o`n=p9#b3mjP$u8kJd zaYe7Rk9Oz+q@D1SD82ihM-xm1xcSv72G(^K`MGUprcyYBerP6f)-7)pxx9`025r<5 zzpbW!@s5^@oR& z&p5vPFe_xH*&t-L#Os9qW(H0D>`EsMor!(+nGfBptPktjC40BD#9HU=5F7aPStlTd zXEv?7!C%ctHp%n}x2r52ua>L*wYznZy$5&#pjLu z19%zRjL$xG{TQQ29i6WBHc5>T!3~x84nL6Asz8P2gIg(#9}FkznsZhf@`M zq+Z2FD`qQB&t4q=_bt%+>12wlaZBfvva$l(Eboc#NvYm7dv`Z|(0E+C!AM(ctXlVi z#)kZF-f0=L^5R}Y) zskJe4rgLEb%Mi7VuI2ApLS2~Oh0Db)F`8pF>TVoaaVsOaX3^SxpKpjQ@j8=cY_OI+ z-lyq(@Y%q-Gqbo33_o1 z%rD>kGt||;_0qMUp|8Up?XMW=yd0vrJYt=lW4VNbO`pNX0rp$jVHWPGZMR>%n7&`X z7@Uea34j_vEF_v&#VaSiA83m%t@5K!vvzUXaO;wl+5ad8NoT8`SvR=VuNb+ob+VA` 
zX{VS)yelNX-}q)`xKZH~@RtOGlglSE=iKgJI(Yc-%4hzrOT6+Q8Q0I2FdFg8zgdyv zTt$gKE@7_g3@8m%DJk_Vz1X@XU*@Pg*z&YxygR5O7`@fJj9ti3{;MeUKWoG z&tzvA8$AAYUvRsnI#2xui8quhni6+|f^WCbvzld2*x6lpe8nbTZBa(%t(R}yqABgy zuH|QZiPs)uEHg>Via)f2?rzZY1kD23qx@&Sort;gZ#X_@`jdCLhvti79IPE(T{qCP zzI&SP=3CXqbK|{kZAsR-RDN^$5_cCuJ4e;={nc0`BEnxYYL?yWwB4^Oo5c5pA$i6i zDV$R~bn6Nst$eUjH^HUBaJQG(>1QT`6=M$PE~_5bn#sHJ>)fyJ&1^9-Gd%YNM$SDk z=lq^0kjY!1@uD$ct!mW+9?_sfJ`;o5U9R<))-}YRHSb&$*Dn8Gv<6-*=N-GH|;iljaAX z-%IWIEh%Qx2}nC?xG*nZ;RHK+D(fth&gr{~{k0skYcBcZ6QQA9=k-qQ zG#STH_94#{Ri4-;xuubThGhX?RIjBU=V29JInc1|NsjdOAwvl~4V~Q3!@hgDB~`ow zCCr)5-#O-%IOX;{ed+6@(dmb57wzqfeyFmHH#y4i0J9`v91wO_wO5}yR&^Ubv8|G$ zJ+s#(g4F$)Hl<&$e8aGQY5wVMx#;0q4xG6)%at_jPcT3Jvb?6>_~{XkquUenvQ}NQ zKASOcT69@W2-h(=H}~A7)ycose#|&%cqxoNrGuR4{3OjBXpyELNQT}hWbrU0q=!1b zKl)kHpt13N)8|K@3l-1268mp>uFAYP_w8R>mn#}4b_w<7U$)V+ifmal!5G1s{-odd zOkB_Q)8(s1>`Tk`HyLBuc^5?bV#fz$IxKdU4XG3vU*275B+*%|TqT}=j+tVN7Q4GT zAZ(rM>XM)Jy2z2la39f7D6lY2TcMoq4H*^YIPWoDTq*lCA?C=P+m8i34xBOSb6L+S zUitS}qMo#@A+rzT`^G(c@Vz(4?(S!O;=AGl6I;A5!LvARzOP zOy1#hN6Mo8ADnfm-hdwGTJJBXIh%=&Eo_}qfB=0>=DZT5391^B`s;tX9&70#~LGf`*aa(Xc zFmf3ofiS;e)fEdBUyoY$r%r|Lvt4Z;4qR%$6o+t*#E8WK7=#xbFZd>aPPwM}*w#Dt z@hK9aa-++_^8Oocb(~a^*=8f-|Lfho2%DzL=3W7V!hh8Y5{Yt^O=E$R%7 z*{5q>^E`bcd6Q1@cD_x-`I^s5mtI^%+zOsN&#J_JaKgvB-acA9dIVx=k~a`9!cLd+97n|n)?eWyVN3V$iJ@_vMuVC?|qbe=gQ@W zx0Q9p;|>J7EI+L^Z_?FcJLL;GN$fSjZQGV$ieH-dRm-Wxzl0^F`hDB-tJl<@G=JQB zyp%_2;)Rj@ftN}U?}rNO-y&h7h;StakgIv+yx~w73+0G%n8zIQY3A(vlK?X1@ zQHoY@t-*+LMSk;x!0G$kWooN5crsl6IzWte*iXfu%L`292TzB&`~Er&zWq|uTChA+ zb?a2WSHSsSL&o>!7K|8@4uYrC*LpJP-_HcYw}L%H7nT?!)2nykH#^!4zQRvkD(*Z6;vBK*W)7 zxqtu(ZZqs@0$V?-?fvp%P3bMk9@>X(vOO>pYBMjYE{1FwO=V_$iN%W zdRBJ!DY5-n?5{EbLeo?5Ph&pWX9Yy)hXJSOW^N(_R}4+xntumrFU9$oCJWS+>X8FYClK^mLi{j+^E@ev!EH{K!*P-XsVwVw+OL zy$l}5FEa=-8M%S{WI_i78ncvg7k;mVkuB%M45QwCAeuP6eDY+o%;jI_Ysz4iPXsk% z2qq?r7qi8U!4-m!R@5nwQt%A!7G>B39jUG7V;?qdoVzz4GnLg~QlC(oSW)mB5Gm0K?(()>>K3d40bGgbN;g@OK$&mEWO?sli?rj{?_w8X-Xm`4>TTi 
z44RzFgA)+-m7vJH*lPDf?_+LnZ`SJns<1ICRH!2esC0{&e zjf*xW+lT+ot+X-w;cMOXZjo3nOEf)xGG4Jw0*U+lb)}*Tfpr&4DLy|~bp6tC-g!FQ zA)t{XtU#xd$TZ`xVW=a9-)Gg9&jX&$)Ofv-7*2U&N#T$^!r&=;so$*TGPFP>@&wqa z^2^5$d>D*lxvUU(jjUyX$%g>ViR~kW5~4=%>5csSWm~#LTUc)FAGu(%zO{vASWf$I zRsX^{X5op0jw8R{yERWl>u-5m9!GyeY~Bb4{l4r*E}zvniBu*cN-1U^nDd0EOR{t@ zkSc+v#BcD?V=?5VP>{Ry+U9#xdlUYR5YUL}8c|x2TyG@vf|3;K#;vuyWl?ziExj}o z2J++KLP2sXVq$K2>Ef+|;Tn^gdfe(`MMV+G8Nfc)v7CuMw)M>EuaaV`?XDDti8^{1 zl>StvEvfj4JrUv8??fKkxcS)DOw~WNcSi?IdSBTrqCg0$H#kjh;@AhD;Omcpy9#K5 zbQEw@h|%g(&G&8deAsQ2$2y55FdAn0JM>;J!=2yTn)f!#-3~j--|*PCqGO)%w|u<4 zJ!m8+a$G-=xXun)7cQ5+3<%ypbBNV5op+XC8U7_FT8bv?n}46*lGQ#I;|m&mvsG_D zZRzsLC>!@FkcyDLn>Q<;HZ=_w!cJHir|S5SvrVT9Q9Fc4^BOULo#3J>r&K_;gGjv* zNYkI-tS7FOhHih)Lg}~H{8Ol)C;7?+ac_X=eU|O|lh{~75X!T}i1_0}VFQgT=_^n< zsmAZxI=#$&!pa#mOWo{a)TOuyzJZJBq6dB7k6F(v#R+FuUpo5s=GY+LzT-XglqIXz zd!PLnis}7HA#L&mxyrzOAi4&)TPsYpFOtJaV0T%{Q0UOHV=B0P*Rrsz3Rv!G+;g~3 zk!%d(v#i$<+BDKOKc_n^dITjPEOJEa1m^@L^$s^VFbXuPrCI;$Dsqa=u+3;)N_w?5 zYl7lf6Eed8y)QU9axT}<`Tf`6L#=6)nhO*PW^si7@WWx~J_6nkjKNFP6ESDR!>1wj z%pwFK`RtvWT0~gGpb5kr?@d|Xgc9xsytf;!<6@`;jTmVrikB~E#?_|0nG)5H<$9U5 zOb%aV+OigSeZue2_a~eYI%TFvYX;1V@&1pe97(L}L;*;&)e1nJktG*HA{xlaj5iLN5I#zX?2r$u&`xmv;bHzt zN#C>-Ng9%DluP-`Dz`qqV{r!zWhiLt9?yc)q742;V%=nr(1od~Hy=$}3X7IlVp<$5 z9K8m+i!8yWuIPrh##J_2~?ag2SN}F*A zTw=w@=g-e&{L7693e+u-U6uUu{N6JWTAm@Ijt4w$@6Y&0ei2@;o*q$Q8Nnkh*S09m z%ooo6{pH8d`5({R1E*1TZo;^+R)8Fzb+TvrMeCz?R zeD2pEf~_D5)qD4Rbi{=X_l%!8*uK=afHgwcfSr8d>c}H$e-fx5e%`$ySh&05gvHJ? 
zUsTO{5=%0{G>KIi?Pm>HIMjC>d{TF2NR)lV?Z?3u_e@U8PN^h;sLOBt-&RN`g{>zG z;tmJ|Xcljwb>PIsb^yeixxYX8T&6m1gX+$=e!UNG(?DbT^P&utfFbvaV}1$u^xzA+E&4;Tf?Uvg{FQKca*S~y|1cT1>Y1Em_sFn3y?o8e@HMS zb|G5&+2II!+!HXrdS72(-`-L?z03#!hvY6o%IYE<4DId4KYDryr~~@xJF28=JacM= z-IARe{<_~-1i@ZSuKBa%?B72rLJb!?0yy5-wosy1Uclcq33#)>Eiuzno%|zaQm2vE znF5^@<{gTuKUS9p@T`Dj3dEU}T)UP5fYYfACHD}oa?J6^a1l`sgb=Oz{9DL&Ku1?p zq@xgZ5=~5y_@>0jgB;j{XFjrMIr>j89n5oDsyq3OvfT zXWFmBt7xTGk)Y$>@@r%?@yu_nwvvn0^0(n$Fwy`9Wc$#NjlUN8y2}?|9%{J*=h`Hr z0$z|&6pf9IyHBR^+ zl&e2h>YgDz%83glna6HrDq?qkDWF?;ODrhM0J;X&7sxJ(Q|8ZPx>n58cBFTSwc$4C z&u>9!H_`xWm*3y?ZBK1JP5zR2Iyi7$abbqO*Wz7ORq8j5#zBbzMpUX6@hB}~ z!Rt10fhG1XxOr5L`uY~)N-x2|wkp7qR>i%1{?pf5bn?7U&?D~1UA7DF<^(3_PNUD$ zPri~qn&PTgV7$ZqrTqQB*-kwX=Riu>&daUfvKzv03YJs{^%yO`hcliC5s1yFp`js_ z^}c?OW*^?p+H6#?o4SbZ48$%mH|#(N0Dd~8EMK)s_@eOi!Gf~(<1P<4ZcC09{-P5; z61;5+B4f!F<4S{aoF8TW7+Am0J4 zu#ncHU!c&mt!~3$Z=?YVfq^==!WrG28mpduKEKoQ+YYhI4i4`?rZeiEy9^G0)BgJ6 zthBfCI{7tg960A0A(+7{_QAl1?*=Pp`%?`^>fdWzALhSY(mpYVB9BAbW<41Fg1n%e zO5{V@hTg&QDU~0@rfE3BzOzLkU7>2R`S|VlB~4lOn^CiGDYeZTXG7>>!q@u7$cXTa zu3Qe;?Pe!sb0k{UU){XGEt0GUioh{ZcP1~{&ol24S(UwCi+T2a?mq>Wh__C1D22e; zyYU2CwNdKn{PF6^kvAt@h3V6AbiRiclcar`UD7SEYR(4PKa&x3_Y}`Knb)neyADo>Iy4l}k!FQ=Ci8WHI8?2pv@D)u#gqi*DW9!KJmVw!F|s`tKdT(dlu<;gzz5O7psJwki#gs7qp25~Mv(!AWtqX!CA*8umPgAs^J-pb zl>SKjfCs)g$R<*{q@s%Bnj1|bZz$aYX6@l7>{^^D?>w<#=g!pDnNBK#9WO}z`3rJ~ zJ0s){Dl9Bio4hz?A*1kb{2s^Kre@>8-cNR~x&vjY`(jO<+|S@wJKer4&lC3J%4s?? 
z8!mFb`{$-i9yoI0J`i8e!!rVv&|}=b-wsiAi}9rrExWzElVWkrf%3@?DEu>&VyKAn zYF%3@%gYk=k0~Y!Jvg`{gd$|r>kYlewolZfAKltNw*tGRo9oxt3C(3%{_rWUzt+p@ z4yB?$yg3!)G&`^gIHasN0BD-}-={7v3%R4qwI|fgi`3&gp}qlQIV49JmF-mD4wE5(bMIyTnHo5ChuvK;gb?QsQNF$B z?WeA?VAX4b5USqdTq6vIT5;ynr(1~p5f`3GGfqb_ObmRxsjZC`wF>?CLqO77y+Scc zbA^gNr;^J>deP3CGcfs%BOEqwQ8ry~9e&RqF;Vf~Rq?9HH8R`NBm zRXf@wU9o|PoN#qOMok+mVK*v9kezXw3+iZWZ9VGS6ST` zS|%`1g<<#WhHo%aO*;LS&0&vw2>hGT-R#t~* zfKzl%;gHa>-50x}1it)7(Fn!#XQ#!ieO{;SS5A z)FcO4JztydJ%12A*uv44|7Sbb?(HjI-hqr}er|HFn9a8&PPc!3`~ZheZaGJN3OT5H zHVF~QZ4iuUN}+$>mE%|U?8zP02Y>Y+Y@lG?lww2efbrqAbspDtjt?K}N_2neW4(6y z?@uRUPVUYXUgywNaIe2LPgcsRf#Rif9hZLRLd73fOI^PSELrx9L0$mq7*Rhz;$Y$w zUyQzS$xj?<4OeBd)l7fwesuHHF|T)@L-%%4zuyC9r$~#8`uX9J4{Mu$S({DGWKWqxNYfz};#}F*ltBp7(KmA9ab= ztuUZ}Q10nA)v**p&^+QNhJb*ZC{5qPZH$ia*Rf_D$6ms>u_m)f^zm+8GN|>IVH@E zQ5r!fxEWdIn|R+s@B_ccWiUv9OoI#(C?1}k^fQwM^CkF;M5&*2GEFI^8vl{t0t^fc zglOOD)U|4)f$+@n=8gZ2^-Y1}4r9A>`O-((KBFvwVtT21=nk-M>8SN@@sz*ag(q*N zOYp52|H!@}uDYc#e&|R-kT7|_Sf_t&p8o6#0b-6~GOeodXh5ClC8fZgsI!J)Fb;xR zZ}Zlz2d&VXu1B=N*>AMFpr7RvB|ims^~p37FU1jd%952F9>A*UDQHjQu2^A`fR+9J z)+!d?0*bM%Z6IncMW{vs_s$g}b9?+{GxD;B5MqT0aua-tYh`eV3&6h+h0~HQjlrkK zx0A8>0TmUJvy7=ta?K@>CBo$&pN%Vh05JjXxWs$gXrJNp{zr!^3-6|^9$qSBJq86mU)!9s>(cv|_17L}zc~?6Pqfos?1qJKU8GjE zB+zR~0Xr3OiS6%LTf=%E6l)6HeZE-=ZQo9zAZeqt_mXwf>mqP)?EygNO>}1E4yZ+| zjMr!OMjpSQ{>>xvO;F6fWL8rZua)m*q~77%-aWct4pFJZym&`5Um&Y{=|Be4Kzks= z`{s8#B0{5j{>nG(yGlEdFlwR?Ic@5NFWb;Hq<7P9E2A`u0Uay}8?rC*6TQBY)iE_S zdNP41tf-he@p2Wn)K^({3d*0Lh=}DBjN64KbD&uM@$)B~S!1r*dhQs)H;W{c{h+!g zw%i6|M*6mTXz8_GH9T#8gPv1wv&&Cdw4WDJNqxhz-umy8g^oiz?D^LXB3qYURTExs zeRkQ1J=?J4iaNc@7Pr;&vd-$m4>;FsbV{}zbqKTl*5?>5WzFRVo_<;~5buUeYi<;p zRiozAZ=rvWjz6!`lh4LIl@ahASA;cxlxLvrbw9q_(U+;<^eTj1VP1&tT=2zWQ^>u0R#{x<*S|V2 z8}L3Hcxxw|2iBP)aKOIDwy!PZtJ#cm-M=Z$lk5d^oW$x&{3^w3+zqVhDC5Vq!{-`{1P_?PY`zux+HNx9VdpTn9lVr0}u%BuOB|lNd$aut8T-DFy+=Jq<9^ZG^7p|Lx%hUACV`)G#4F_c`yM(NVg6wiD<2wg!C2 z$(+hrn&Qdni7aL>oK?feKST}2jx|u@3WJB0A>`py=73}BR^vr?e!H*ek8{uLk3?pRd-1nG! 
zdG4nNt#<;r3;sc7gLsYW8>p@B8O%568(>>$so&RWn+C}82@q@T99l0Vc$>j^Gn=TP zAMoxa#0&ztA5EFomSed{$+Tx`t7Ryt^=XERJ-Ob{;P}g5m$cH)nwxi`U0l-z=8h>m zsi6>6zNRpj<*4Fk)XPO#%E13?fy*^+E%RVmQA18OQ(BkfpZ(Q)a0f$uq3xAK-C*Tf zCn;`)7?h^M#CNx>tSOHFZjmE6$!jpbr4*t7vaplevxgJJbvgGLMc9 zhaY|Z9v_B|f+IKOU*tw#{TxRD{0IJ0a$wYoUuavOy{*;nW87N5zSge?sPiz~UasIU z8=qrf%q}IxrCi*z$}S>3g)yl&>;>Ba@4qm)MSXZ$r*}}oUI8E@ixQJQCgC}@$lY)T zUb~zODNsuJ43bM$-dXCko^X^?=ki^a-wqGwoOyq%2>A(s{!bzdyEP4SYNF)F*Tf-i z=B?Vw9npsWly-I0T#U#W6SNAeQ8ARUNS93YsUQZ~;@rLUv=Tnv4b{D3l$w5=<*1fk zahqg5gAVXsGH}HN7-WMEWH=r)Ffh18L-k?>J0HB7oQ1h1*_IUOW+()h&YvEVy)g!ET#$JvR7Uz?uC8e6rgx9o0i?gG0uH< zoE_y8dagh~mUtP%SvfV`v?E0d_cq3_*LO$i#~e3?p@Me5W%C{GYtmg9r5?GrnRer9 zM7^>oX}dSD>Kuh1S{lrNFo$AqCY2g2?kxPNebfQ%_w>JA;k4N~fS%)iwscBCL4hG1 z2L&juFSrOK8H&*8;DrLSdg2^S?75J&syyBQ^#u!Zy+`VFjfhVVvC72nXm2m=dM9LbV4<)VR0l(EJ?kq$;YEY-cQ-b? z{jqTpU$P$%N81-ap?7V^2=VrTO=7C~wAFc)3}qkS>?P3$Vo|bJY$DS0KW<(|kFJxs z>lyFrUiNM=WRVe%XJzeNAV0xbH}Kgs5#*4&?R!^8A=@nKMLj2pUup`RTHV z`mrocz@>N~CnrZX<_y|Er=jehz=pjA$1nc*C;w1g@RW~>fzyVknhIhIacj>cbDU3T zUG{u=fZngabuk6L9AuQpK0h-W=#&cpzai(65XcipX!C(J+GJ+4trk+r9fr9`KQ;)7 znjNsPz0vWAD?(6``YtwY(!e8Wy|(GKW_=}Z&017*SZWvp6=URDY>I;QzqwcEVd{Q~ z7tGqmq_N5lbUSC!IPVeP(~H_4w+9keT)ZM~Q(}~Us%|-0(l#Q0gF+z`5)d&=Ah_k+gEe>dmb+9n9F&><)(`%w>Q!6N>MQ7SOL&83?a&k<-0Kqa$Jvl`E`o`=Y znjF$%z+QZ>Yvk&MGM57f%F92mswsE1AD0I5+xm=r8JubFGCAk-AD*<*KgX>f7L+#^G)&aSRrT_4kU!RoPF_{_Fy~-BKH)eCUA_H34GEWqWc@;T~m0G zXb%nF?&U7my-+e0@_XNjr>fNVcfhROTv4s+G0cM(T^VXA0!Aggu=liCurxx56*^{(+I}#F_Jiq}PIkffpSE3K z>2L%NS2V(SioDEt+3%$@{~Z<<8)|FeEK>M%Hp(3}E_TgYT$Lu=zO;NH$>^_5%bDxg zWFD=C=+AS*I(Y$W+}42#|KdN)F8A~p*QhVvUH)Z_tG<)2?0@5SQ?{WyYm8QKZJjd| zTO=kXrd)mG@Zn;}&C%IizwUBAf;@wGDP!yvPh<_KJe0et>G%9dB~-AEeH*zbV3y8eacp`7E<(LztnbocM; zt$ugi?&|37_QE%xx5*C4$>dj1C+9>U4-?kw2j1NdrLGZ+MF8srjj8ECh|7@Mp}{>4qbz6 zhl6sHC<;rk#Xx*Y3X-Aj8f`j!w@l;~K~o2d%GSKZykB+Fw^KUibl#-~sqV~i{$eq{ zP`#KPj?wdtbMB&!G7S6KC=|4HbwVUbnw|JDRa@63ux8q}t@ctnGm=YN>HS#gf-pFA zn*4qxMkkSug19|Q9#g)((mnOpV#Y~a)i2tumg6RZv7i!=q;9S)8kG}2Y-hSM&#L2+P 
zIO89UuDeNmn~;&e=)B0OPua-@LQyt9&)KzN9j|OcMA2&4@}&3BK4uRfON~7(-@n`U z)@*cvK^}Mp#X#m0>Y{Nsy-`_PvO6hasor$9j)eNcxwzkmC(9GQY8=(q1lMW4nDfzr zGv#~K^;UL+7^xv^z;uFELX?l;d0blt_^sa1c@@k8ZYNrf`((VWZYzECOG%8kEjXQq zs(6==*_>r-k0e*2Q1J?QQuaS$VPUxeA5e-%R^4YPX^Z8rV=q>2+DX1i4dG(EZH9{O zKNjG6(|_Z|owGqyWo-G#OB=m~mA*uwnSCh~l?a0IICbn+KTjao5CBKvAjc>wDoUtQ zWSUOiF(+rme?J$D)F7_|Mjs2keprdQ#i=|ls?}fkfpx49{_KrDy0$B3aQ5?u>bM_M zi%Q5$jD*&aF#%=6u3h&J8k5HqtLKL=YbnT~!~tt%XIIyers14;%gzCqU~sgi1*uLi z`1{g(csZMFSavW4gVk2=WorUUii?e*1HV?%2EM}$g_E=hn1npCv<`yx2qV8KT*f3* z3+HLgmRUm})pH*OOlCCex25WqAFloHib&h6`{nmBs|1d~!O!dbThNkkz zOG%pvY+cbBT7GMxF5dE8A@N6hw2~J zcGdrrQdzXri^CmKI2bXpb)kh(5_Fwv@u;b(aX80FE&MgA-r)kBBpnW!KgdA<0Xr3) zw{63*wvYFn{&QeF>vy5Wuf73emV zmGe0Y)_kVFEG16aXJxemsL?jwix`;VAKEdv7?Gnr{+W}MXV;EFYcTprwliM^Du8fl z+bGq4t*&!Yny*4ke(iB13g@Q{h| zqQw{$7f%g7SK1h?hAAE1>n*6QM;eG9lVk63%Y}=Y4Q_sQmS7YToqI8{rb8$9O!c3F zoA4ssjpR&XeyTZ?ys(9y1>ZaEopD=@S7C_v$908CJ_SRVfun8R$~(}MAw(`1_2-AE zf0zwBvCa}!?)x+~?{QRv?uh80E&2YPMq{kKGXT=Vy?hJ=WtKw`r> z(DKQFgn6`Rrt{USXJ9Z6qmeXa+OX6Ne6tgIA<@x+nC0KGNB==kxA3RKH@_Opa0YKG z6}@Ch4`zkxzlGA6-b3ihMQ9I>h`Ie+v-h>-SBhRT8#>?{%1`JhaA_a}3fTU9Q==W| zJ7BJr@CFlO!h{Z}yUI`!XA^4wrNN8BOIgv#!hJ>Y)$6CepTuD&>}UQVFfy(ql==2tzQ zeEECZOFOFMKfj9c9bZe`Iq=E)8R#oOX2^d0ldC<`l}z$(QjddpWiY4vLl%lda3jt8 z+^b8phDTDHJO_-EaGC-TrCp{wy%39oSY(HpzD znX8qN(oBR30n5ka@Qu{1c8VL@$ChL+QVHrU;9G${xovR1czlOJey|Yws6`t?9F~jF zKB>4r#6Qu)f$Ux8h5_5Nrr1ewmL?2Z4@?!m9io*<7Rv~Q;}tPWBkl#vizyy3F;n>a zgC#g1tRAF{$US+(_m0T~VPg7&nd+h70xOoq% zsrhh#fmmi|XSegz{Nx|42Db#WzdwrxXWIv@&i$YF`4*a(;ea-Dm46ODY&h|1c$m0d zgS_?Se?Jq|K-~rSZxtbUrZ@}L8Ke@as%29m|CTV;$%ygROVm7_U4VsVGdqV5oOcBA zK#)$v>JdLLv4RArZ9~W(T{-I0*1CCRsn&C1+iQ`*McEKsL1ws6Ed-CJd_A8wawwML z3X|jh_!QrYf3!oND^X5$o1;8^VP+Z}TeovP7Wx2@e z`>9;AHmmK9qutB#bG48<|C3ktlYg`>eC=mOY>y%`>496MzL+d^HzZX2_;%Ez++e}P z=?p#q@u{wuHgbebr%<;|@)ZRTyZ-{(#OOtjLBIhpfIlallCXTPyqMB(++x_s>#$E)jwIdVOhee7 zK7+0ySHPn@aSt6Oq1*n#0=>kRXEWPjk0)~Td*!4@mhSv|A`SqTETJw&a% z+kvpXyIadkOG-L`#Syq_vEt6{?PxUN)WHII00Bb)PgRD<)M)}+Edg7X3??h#0FI2E 
zoga6%@e~9oWgPV@^BP@3tJ(wuC#%dV!`zJ&kH#qbg_ODP%iqF8)gLT;H!SJNe2$1B zS;xjEth){Y%1lgw;?;G1j#CBm(|Z9H(ouk1!gmX<=;$^|MzK(V`jD`Li*ZLXw9p+Y zfNG0?Dda4pw%z~yi~Ns-58%yBP4r%(dQ`IIf7-@(oBGuI60O+!x94&6oPmydA8Ekk+g0q~KS{}GNa z;Ik!SxX78(4af_&{@PWuWY!;>j4VChjjU;qkrGpChz)^ROTOOV!2j?yV_W_)#vYai+V7CWGO2}O zMTh&(A@4MqA?91-0|9u3`eH(~UFeUekq@VO$_5Bj7`mm zel@?Y^P<_-RcY>~ADcuwpvYLz4#j^H*Qd_HDDA&~84O*8D2K(!*~w|c_V^ijZQnpR zsOpF+CX9n)PXf%u05-tjL84OpTL>%?gnC|b1Oi8d^|QlETD>61HypW~BA)(g=aq96 zR@SG_tY#}{X|rOIru7`(&t=l59@f!Xf8vE!chnU)=u$k8FbIH+ugP}A-yQ{`q{P#v zAprf%VTTuaFeyw?uzk!qE#G(CYb;>QUWc;NiGgp11J=3?nnzWm5lFL<|EkszV^gx7~WkE z&L;WEvAQW}|8bRjowpZ6{wi@c+D*cT<)f6r7WSj6xIsC@-?ok0wuE-J#Eb02Q{1_e z87f-@h8-6SQLes&7+wpD1P;Jve`98!J%7H40`bw2;O%v4Boh*daETN8^}A|Sql3BQ z4VMVPirl|Dx~R@R8nf@F&6^y4$=s7rAF}?r^4j8n%Zx&_&2r$<^+v`**tb)TkNp_I zPO=?)2or5p7lM`nI9GVxhPr`RL}K*&O!dA6k0Nso#2B!~rKcFWjFd3C4r_U7J}G&l zlHzMmwF$ppu$P_sDK!q_40rqZLAT4d;H9@cw>!IEe_gE)?K0IkeI}FsH?V<{`e@e) zPO=2^y1>UMCVec-A+UOpP2bQIT99O;q(l$e28DtLEbUsWC4~KRI8us$8)ZaU0Ia>* z%5?V6u<+CFP`NvM6&@}?9Ob*5+6?x-sI$Nc*_pUV7dR_h^Evs(wJ;~}iJ-m5(;KCA z%W$@%dfAC_0uUsk;sQj?f%S7pO-%`Ox5bJS3gmT%!*_PXxQ6C)O4a)vOOVGvf=GGN z#|avua+;-!nGcMXiCxW3v?HQ`eENUAPXd1*8$0#XF=|KVIOB-Y?qO+K?|Gn7Y7qzZ z!pCMDxiyYK7BS_7gp{(b>c?UaFrsjTzg<8I#6+3=p51GApYXcsTGB2=o*ljb89t^ zVC!qPwy%tyl%-3T673Bc!tyoMLrI5ukuYUnq?C30;GlJW@e5@^!_rF4ZwI$}r=aZM z_}W?>+qJL-%cgt&ZB|g=J@M@D^{>fmtd!oBOwf>;lK(wm)6A`peY3N&jEg)Oh{(z8Cj2=1?M(#~7pFShR@6bssM%ls8OK;u@{E*LB+f#Jiny_xNmv_tW3MMFfl4vQ!>udFHSTm;!r?Y7 zhaud02la&AEe&NR#lYRapMu2s-h)+4iTkkA|^1sD}L z!IUr}V=6K#u^7$`CUUt?_R2l9s*+lW{HZ$)fTOH>@BGSr^lx8Q`wog+cF6tqCVPA> ztyYxz*vZfiqR-Q3&tg+kYijqO5WbK80EgN>Kcv*}gmHa(K=%05DzfX-6R#{r){h-|`Mn;(?@XN~Vn;NYj?BAWv<^ft9p?HzxoNB03$#4ce zepSTkI`NA=mDkm@wfsWV;!bASZQQcZ%H`rn+`)`uklB9&?DZZY5^XqshhM6U7rNkmq+(5%z5JpoJv%CH68n#Q z)%BRv-@#{UpU19g|2lhh-G^VrrnY-o86)mbF5oc==0X(A>8c?F0O|Ml_;?ALIA$&` z3p+z7-AL*#PU3yZIwVuD6hUfLA#8Q@f+37h@4n=Co1NW(yb-bKtO2#ExpBu=kQBI? 
z^U(nFe$iIxdTWBeaxKX$%zC8ik$L{s6-zvre0x9w3<_e$R>CqUd8uF%@@*XP~ z8DZH3AX6)5lxIC;-Vo09sNJy$LZiDGg|y^!IPH1m+P>tT zo1HYug*fQAU%9}o*Os)QT8RwY z>SG_xW_W&R&}zb-cJIVbK=#c`G*@w&%PCmBn0OY>d=D$R_|U35{LjgYmT@#}aU8~K zTAP=78i&}s|D9y3b_nSI$j8A$iNSZww9CFQ^#UN-IpnZHE};5< zEydkMSSnVnx!LcWjkU{Srgjy_{QzY4dmjpt^gtSHro}ZJX&H?mixog1*L8s&dVTFueoiTMFO`lVit?&GvJ_i5R$D;v=s9y)Z$Y3h#0Ntb3U0IyF!{jCPsF^fUB z=96}K8xhMKEHz}mbi`$_&&*eGaCV2d2v$+FDUm zl%)k+ByYJ@xA_0#L$SkF7f!k;lJ;Ba<3xn(fj3Bknt4ljg=0Ca(kcZ0Yxp# zXcXC9Xp&84S}sUyGy^UrNIy=@=LxTtzf4Ea+ixqyl;UOIAV9|*1H z)ltHwn4yI4x%lj0=~4Wpp9K}TY)@>?k`>$`E=7B4VmPEkNLT*qtS<+mcep@@y$(w5 z0+3eL%E|nJlh)~(NgDWiCz&9x07)jdI0{f~+S%LVQ|Jerz75BoF5PR3^~@Hs+!FFY zF-b))NuuddYLSHOj`nsB%L`YhhMX#92U(I`YhK)W%PK)tKE?j~l;lid;S9ci|0eFh zJNCfPi5gbl>=$*r8M1ekDdl}`X|C90x`3n&eoHe|A!q{;FCIS*Z=(vnw3=R@2?to8 zIHB6Jd`8=E#his^SBXgi+O1-zj(1*n)r3crIsV#;*N$qL$cg+{BYVep0vjL>)|A2m zX^D~Mzs;R#KXY~|ONc6%(HNHB&em}j0)U6616*b$1gF9AwY|Te5!VySV#{aB<_<8v z0Dby|AJ|k^9fw*PH<_;R^tg2UCDq{O-Gi$q&8(%dwazI6dO;T z6}x=AeCObZv3eY>Qq|h#%T8KYUvwhN5dcDlV`y?#eM(DD-w(-Y4K|4%hZsG+=g_0U zm1fvrotFIk{JC%oVdI`QcHx2@mk&B-Mci)xoe1%pa7{}PM`@7X-}6Viki=7tmc4U4 z&g^9=XGw#t2pB)zk#D{NmXusOm9*yn$%McF-9;0|Ns~N)4}US9BTd zw(@<>cj0_m8tpLVeJ0f+UC&NTL9kfwV%{R%>a>WE-EfbKuwBoPp<6Lo?EMq;m7Qt2 zcQb0Kt8i1TX%IargNenZ*_rF zXDloTWT|&4Oqg~(1T_vHC0@@c{r1Ia_GMi8)vzW|?$0vN*g85p=|Rf@D7FzEaLCx9 z1|U4#vpFgn6n4z@2^F#T>x1PIP-)Y2MGj@q#rG(r2k)p zd)>**sWsJ6~fmslNVYRf5dzS5rqsr!9mQ_NaR(XDaub2m^0Vm z2S?nCNRmS1pLizCi}xEY^@MUT4L2-`$af2buLR^kj*nrY^^vpF zRY8_n!KYt|j#lz3o!KnWHq1n=kGusxCSBQ1CJ+D}@z!zV5Otq1ocCFC?l5?eYZ6|? zQ&?IZLqn@kv|&YI*!1<`1^r?Sed#E`Zg*|pr5>jV<8%ro6>z@qPjfYZB+%Kv1GOYM z^nH(MeN1hCF(EZ2b}zul$YN1tHYN>z`BOpjnP$USb zg{Ye#NK_;!2$Df0ND@q?5(GD(0!ok!vQct|Qpp06C5aLxOBN7#eQxxgdtZC+$M=4| zcByviShB-fYtA`BAAR)U!N$ghjmw#1_I#)-=2c%XZMON;7U6u?)W=$17BSVYmk4Z! 
zV@u|u1BF0t41d3ww4*EOKzQse&N5iPpgoh3jB%{Yo*CCic4J)fE%=4Cs298;Q3={j z9^20&{-96*6A3tEr{{dnZZd{-Cj?6}(G(!ZUnsn>vgKj8X1!iRLAp+~aDct&`9muf zx3XSev-36i$H?#iD)zS~@$2Mn15o_o$Z%O+!p!V4wY~IqIXvq!s7Cw?g594L zWUL27>L|V$?CLlUODHf%#g&!9>P-fIko3k|cMu_#5gY`!whfOH`{m$FWG1V&H2fl! zL>oj~OqC1$OpZapC`g4`3?Y9y^t7TyvpZ~N?Iyd`qp!{dEe514`iN&CibA2-+S)F6 z>{qJ;my`6U16PNwzJ!UqoV{scTZGL$h%JM#(t7s{QguR_M&POr*xr9I5ElXs?@VNl zo;8IKf3gmiZN$dhoUW}I1Tto{O6@)i>9hc5Gy3ej1?@E3!zw&-^}-_LGPIlhj>w4h@iyVE+;dIL>{i}|!)Sn;m|L|$A-sNy!EZm%r5abr1OAHDx z<+W0KoRcGg;OckGKmZcsGk#tRJiC9xsXiZ&Sb$iFnN%!jp?ia)CtzI5MIl4g`UX6$ z&O_m41QI3gq{2S#_W7};$d~Qwt7QxnyoQ?Y%U)ZIeLT zlDp6q2BcRC|64+N_>e>Rfxd*mS%=0ed!yWj&-Z`*3M$gs0uT8p>#7SFs0%NG>1L^h zYc42gGwV1WX>RvFbvwj6aL|9nS$)aSMZa}Z(lt+=r z=G*9^(>QeR>5)be=3q6N)H!8$=AXQ(Xj>y_WWi-1SYCrc>@}aiBcwi|#hsDo#3BCI zYAwDCm|jDEV*gB1eFq_+C0{p6%7>;OMLgv*X|hHmUae(q^!Q61iXrH36aoXEOyD!e zw2+j|2MG#79DJKHH_Wl*w}51147;AlT~3I_^)1Y1t8$$Ly?8uIiLoi&t14X zWH{(6Iyw9iXMYXlJ5F6lP4D=_UTd)dNPkfXH=vB&WH(+dL{2?9V=3SFvki4K?;dTV z`Pc0c6S@_s*LorLX9Z38lkkO8BW@R1*4ZE4{KWi<`-pGRtuMwF$`ve&hM^uggyr?? 
zB9vn>Dx`ts5H%R`L3`l&C}gxRXK#`i265g66-*$in(PH21r!QlJE9`k%eDg)ZAk7t zdiwM}N;>^)7r@WK@%Yl;?+()cQ$81ESt|8k#;jTRcnnZ{Supm|YFtdEiS}ZKM`9^o z&@#qxsd`&`Z?-#tbJwmVP^k( zk=vn=#u=hf`5P0HYDOMPExOR#pdhrmbz^SEpS`l0UpxCg{D7i|MN2x0*Jwo|vM#UZ zfbrbHblgPrrlFpBiD+zM8=&2UXn5jtb_)osqmU37)y;;ByeMN4Oo>EZY6;q&B=bZU z`F(UkWBLhxUO_Ms+CTn3oCC^`b~i8u@DcNh+C(tEOs@ zx;dYq>1cub?f5Z#0O6d&@yj>ox@en3@V;+SDZtLb^rUJERI2O9}kagT2y+`UG=#qdl#>~ zTLn0e2@ht8>J|Q(lVE`_Mhm9_OY}%roBF$S&%9NH&^ftC9J?>~@5`ElZBAqI+r+GY z2fZ2$zqd)1lI~CwxJ1y>kFUN+CFD8J9zKj&WoT>qn@j|L_WU`S_w|&c4c>P5;lrtK zdVr-}!XvKRk_0z-F4^#G%1ku(79rz1D3X8nQ`bHY_su7nduH!N=UV6n5BA(LKR(1- zxu|>|{lXYQg4qNPV>@c5e|4Zj4G4`kpH8_ zSI2pR1)Sv-elkobE&wrT$HtZ>*=O+{FH*3i)9H)DNS9z6{*ozENP4w{}t} zreSaA$>MfW%K4wz9zN@_6DRroCob_v#M$#yr+=iSAt9+T6&x@MTn!@zpA;zU4MLp>T8n z!wj$T?8V7LKoo){($^kq%oT|~lsVh8EGcMZ%4z!I9*ERMFd_>Ocq^>8fZ>Oo$Zjcr z1)t%+p*8cMtGY;Ea5FJ55CSC{6pv3o+4luu)rlK7Ac>vd)!cYLlg3%7_a@-)U1`N> zi;U2-DI>X!<46M0QGFj04X+1DvY2;ct~8so`^@1Rl3G^t-jnl&gFVLezxG!7z(H5{ z3_Pll_JZq}Mm+*W+b@vA6mgjsM;go+1_xpjAbGY-+d;Q=dro4+c!QFW5(h?w5&pqF zOz9+*eFz{G8_V)P`83-A4bF{O%g=GFWo(2OvF*-DM$VY+{2k1rcO)2uPdiweV^9^e ziMin;;l+O4p{uBQ5UpgEZ2Wp1% z_wgEq0wk}6B<%{XPE>aja}Z*4DT;I(dHRZz=(_%TOz;1VwR4AL1}+Oj0%E=iAxJt7 zt|zh!A~0Dv5G{_v_YpG2K_KYBrCX{HU3)eC(Iz@Y+RIRuJ-hS&C^KTWn(&yr+>t?Ylj@{gF{ThDo4N05g0r zI~1`&!AA&Vp=My3BPxT8xbKRHHa z5({Z#X;%-E_JP6XXFRft!CJg3&#L^(lAvVfLxh!xp9;F?lhX@dR&WlTs;H@v{_4Al zynvP04|T;ATz9HG=H5eOgzzKePB?B0Kd$`%617RlSS6!}FlT2@z1$>ho$~hMt4M;x*Z1itm#=;+@lHdpONSx%66GJ(fsUFe3r328-a9^QbsU)HE&_!j-<5Nr9( z#T3t&j#?2S2Vchu!oh`5{HDCBqbTSLUW;19b%-FRPY~d6oS?_&e?tco|LB;kxrfk} z)tLWFtT=w>-tDlHpXHAjdx8&t4HceTyc3lv3;{kKR;4nN`Rs2YwI4Rp9#YQF1<;4D z9QpIpuj12hCc5PlG?%11c61`N_lkcXkP^teE-~AxA^tPHN~FJh$xHOX_MG%MW^8O6 zL7$tsxKwadMfV39RlKRw7X=s#P-l+MJX`3N4?A3QqzGgC(95>;zu)m*X@tqhc^>Dh50{v8p8u54-<~51W@m94euvGY~D zAJ2y_rBEox{6lsBS*oS=o1|6eu+eKLXXg~yZe_PTnkhQ427Bo%EM8j3po)N>F#@5@ zpx@2I*3dAzuP(`AWzXjRwLC%+HHKb!VWM6v-hY2Rb(>6j#w3^V3)4gTxzQsFCbEgj>sFmL~OYPd}Y 
zv#j;ED;R4;=C({ujXg6r(N^}1oqcS-c=y-BBO;8Ti7MtYpZ$6O8s{o-iGxj&yT#6! zOa%)yNr?$5-Zt*^BNwB2?_QFv6Izy{281!fm3vlNTeP4JNCRe9ZdE?s!`t7Mmxqah zW!C$?avV|EC)qU|eDjof_mB>GH_Hb5Yb8Xa0+Hi^CPebpd%VPT8&cKcR4w8qFDB3J zv;SHW)F3K;lDztWPtD4FXfF8{=w7WK`f_=i#pT+{90;dGGT%%}d>N27y+5oq(>%o7 zRHd$Wcolg|#j#JCPC&J*V3a&<(E@b{VNI4-hu3U{anH#0usOr2!q{&0lB!GONx8;6 zlVnP+Gq(A9R7B@efesaJM)~^nbO2|byDiVlt;iYqEXpPkY?{2IxBmWqS@I1k*E`E6 zMJ+$O5|)ZvNki?l(`e5zUGV-}s_E@C0Au#u&(R7o6UB&=%Y08;PTesKx z(B!bYTW|)Silp|Lz)h>ktK~Y9<$9umZ&=mSznp-C4qxK;z&%mFKT|%Son;$e6M5^C z-3%@DhyA~XYlo{9gNDxJ-DD&0d-hLu#&x383N-~;A5Qt|=(Az9NMf?m)W+VqQ=G+4ff3UZL4)uI@b<(7eCfHKXNW-+OuESDUK z)``!a6VE5L$$J!8iv!&;C*v#Wl}CbHmu)|$Yj&ikA&)umdVyiyb;qaT>Cv0;N>_JX z`kv0SEq|MzJNL9^=!ai;(jQM!M!jlI&Mt`6Du;MQ)qGmZ_{bY6aHQwvD)(wtMIBu* zN-#N&1&B8{%+e zc0In3j97jB@ZK4xMENs~QTp;J7K(N2b}`OeMprU-2$(thseM3;XD0f4UZ&l2zmPcQ z+pb*gIk~5G%g>b)Sz~|gU!qlF)|el<2-LZfHpRaKaWz8IF*5fo>IohG#5m_JzFUjq5g>FEdPw*%&WGcRHQr>COs^DtyyVr-mKjJhOU{Rgx1nSP3TFeTP@t8I55@hyjweNvT)Pj zzid*q>>m?80(ZonyA_UehmaPIUQo$CKx_GuEf?37!zZ5C==At_PQDcuZWom&UqTn9 zdo#Yo0hmf1b`xw5%g13LT3Zepc$6nfI?r(J+!=Ci_)3a~kp9Gyut8YgE9Pbz*&*bM zLT(A>Q2u1TT+tm?*xz*1uP)8TYy(0B{P?P%9Ky0~+slt1KQ48Lphn3SNvCww{XFym z{W^RNz2`P}W9!d*c+Bl9 z8$2hK(+>OTaClX_kk?Y8cfyghh+gYWRQqwY$#6mE-d{`(jsc5HmcMS4XX*a)i6{^e#Thv*HbQ{63Peb*#w(b$4!4-!q2rFpJnM8O7XC|@^8JN{^j!=R95#x zi6&ClmXX_}h`g_p{_HZ>uTxrFe4w7x@W?)UFo~ecus}1@G`VQ~Wmy4)cGV;PrUe+e2x`)UwToVtWp?$r@5tEK}9NP1X?qDyA z>}DL^Xhrfgic1rXhr07TY?~c!_I(I(d$5kN1D;{Fy9~yXM z?9V~9N;d&gSSEYoCxB7M43j8FOP3=gFZ;;!UV-uufE3!N9$pP3+ae`$_{ zyvOh4MbCycN&`u}b9qF#2-^ zXyIV>L?fxJaM+sexz;>g&c07unlg^Zv@@QM`WFLRN_r|^qrK7} zPY!y(il@(RB;TT#zJI|^E~;JGb8;*vFot$7YjL|Cn?Al|Q)J2eMB`74t?a!ME~4Pw zq95vNji|LQZ@T`@Fzw}rkgVqVUl*gpMR|MZD4R&!*HzpnXCDP_#o732zeBQTuh#^- zDNu!*I))ATg@vuCYYD{QfP{%Zld$CtiBL^ydNWa;$qLm!#+XO~FySNzV9aRpAD<##Wn2%mI$mVG(- z)Ix`t>d1uP&rQ_^lK@hcXQtio&_LR8CU1WzYqZS^HZ|P)EkVyc?Ss^e@Tl6}B5-1f z-!Frq7*u}F49fM?(tQn?-ONcExgtD`B#^(W|$a7 zbGBb8)bqs@%6uLwi9<@q?1*`?g@vl@E>&l_MZ2UJlrtygdv)FGdx6q= 
zj=~2;cav|4KKD$6lByA3(IT`(u!BEG$%X++`Z96J@Q4cF=~ggh%F2i~IUWD~%=T0PO~pZsH=p3Qv>k-D-0 z_oVzNir=W{wR>UasVreNn>-oLlMJKGLXxBWmhw}c4K}y4>ky6pvH=_KRd5<39^h-X zmnkvwx35>97^!{Ry2bym&E(cqoUkU$Sqks^r@rPTv}|vLZ)tfxQ;b;#*kVivdK$t4 zvS(aZI{p|b@U?bM8BE{aTPUYQeygZS-g-@VK|gyE8vx;u~ru zUr((g!9B(8-UY!gmavj34p{x%JiYrrc>XKOF(p?y+V{deFGF72`5697jGM7Emh$)# zOC8e(Y6fbPn%d^9MkgBn>0ZvTBL`X$;@>?p`)o&?sz&<6(`!q~$)RkNUb1P8pEcIs z0_#6)sC8i?_WMr@nr~X7E|_fE6SLkd%esfkp;KaMd4jy@b0*AgXs)|R=sV2^(}-pT ze@1flmS0U@&}sx_lg+F&BG&}VTK%^-ToO!r88FqXb23Y7Zf8oI&VL#ACT=gJbC2a0=+OM6(wcm*+y$QRE%HVxzalt!2%1 zH6JQgr;jn5vCRFY2W0fGzN2zzmmFx)e9lOaxnvCGU2+6M{0dvwMZWE!`NBHn`ia*j z{}b%QGonf|bBgcs^DHvF2g_oo^XBxEn}wra^h@1(ziQFegnm~W-RZ^Hbt|qNJVH^? zG?{u{RpK7Q@X-E?X*l^53I%z_YtcXw<%Gv#0blAbKHtS(BDOtYyFD+Ot%!fi;3KUq zC9UPUKf`Lq3&`)gyEA7S@@V(w6ZGyss9Jm<`L{QI`5TFL%Rg-jn>!XYGaA=<>A%03 z9bbKOf9eUIvVgg!*76r$w6+9nVI<6q4{>IKa@P2X6A&FQl6vZY`TlrN<*| z;5^Cj=ZqKbT)}jax$i^c3BHe8;`NUxaEtOCtenvZvBmnBhPrLs}mA|T=GH-x|rhSqj9PykGkdm`WT99M$7v( zl&(r#Q$Tj+*LDkYd>*;NGj5DWzxWvv>B~f(pdFzP9SSudf}^G-Nd&)2-@mU#VIUrf z%*M@FSrj%9!!hy-v zhkg}Rprkfsd}&c4Tt`tQyQc^??GfW0jWFp&Yj1=;!rwv%oW!h2^{Af$K4|d% z16U*p7~`VC&CTrqij+0xa_@DjzQraM7~Oejc4eEo?3uFKFW0C)tq$XITywZ6Jywuf z>xBZg3d%XP&{fUIU~#@qT6xdMt3jn&QNm&Tz+;qWO(2;E*~=O%Ks4GxYYehL4eA4) zk6BuwLH6C718qQ<_ds7>G={?Q>EyPo+fi;_qAsKMW6sfWi1MH9`Tp-E>CEU58V3PX zmNt~xPZ#e;xhf7EJt_^Nf)+{qQO-Tj?Yg-Sp|Ti*f(k7h@P?@P(3|gSR{Ogq#nl)| zI^!M=tYZ4Cgw7N-p#AJIsDe*VmYmN8$04vwbgt6w#7_p5{_SpoImt}R;s z#pM`~#+5){n47xt_usvPb>{C@f|1ia?iY?CS3yycEDcMd{M8ns=Q334Rh44t(fH0# z$EG^Is#I95jx`q-KL`KLKcUhuvrAO;ywc>@ja6U|?BUM(>NsJ0*DNqet0Fa{S-QaY z@nZw^wp%UDsU(R>e_8OcaJP|S^TohQ`}mTKt=qaU{rSFV(^K~j3(3NKCy$%co0g3U zEPwbYD9GIQ_b#_RR`R}o_9QR(d)%4mZH;KbeuxlD=BFs^Z^GPx1X5i`xF{Lc8_;l)0Cs zWhT0>;Y7*RGe0kn|6$(GE7l`-=X>USS3t`W&6PI&21j{>ckK$zkMYve)is@wY215C zQPG>d7iHGF=IRkLELl-5w#o1r^Og@7q- z{M2f*v{nOb*&Og+_4V}|UzU}YR)f{VL4}2|y57R^>TH}(|5d=i&KaK@ zvh4*$QwEX=G(dmV5wa6tO>8PtR#q0e^yW7hrdV{>7*~fsD7OW1P{nhL5HlNFJifin 
zpcNt|3FulS!0)ow4JyCIKd^2W7t}^{eCdfa)2~BU{23O$=+Z?FzSq( zl4HOReu*rvq^Rf=F;n~0=>xTui~xmkWBE3UFmy1WWKdL47Xa`o3=8nE)zAnipFydY z3pNm3ry8_+03m+?9c;tT`8Pos@I^o3sB<1#o(Nz=B+pY#8(#o@h?Vi%x8=Y^SMUtN zfX5!0T*Sl}-)3pV?0T&iv{%gHJof^DVebq01!l)Am~O+SWFrm-1;*nHv9mHEH^j0kf#*Z;opKab;oAI|^ooc}!?|2Yr;zcC%+ z@7LtBc)&GaJ_%1UG)^22%H5wkJQi(IgA^^ZxoVu=wTnw450dX{y&fXvoW(P+gC^h z*P!ooun)5r&w~ke2AppP6yk&+Qb|=ZFo*vZ8$&FJKtIkqvUGXCMrw5LM$EJpt*>sh%pcrTbeIG2^ z6Oba~0nZ{$J3|YH9-ZC=3yWc=sa_KM&YFK|KaF~dhK9x;N2vuP2pztn?WY>i$2gKN z`2XY&^I(csgA$p59z`BLK6x*R$)|7i5WLxZmy{)9-}cC@QsuA>%hDE5%UhMb^UG*g znMRI73K3dh6SX?Akzdmr%Rw?h^v6bnGlRhE2{_R1EQ~Y?8fRRX>HyAd7!r#F)Zn&u zjF<8xe1-N=Mbg^1x>#_1RUrmg57v(j8pk;}Ib)R~WMa>T{R842`g_D;APO})HB~t^ z)``9`!HOg>K#U^fV*wgT1cC2$}!;h^Qk$lK; ze*2ZvAa5w*iqrI2`f3^aJW6lM4r9TWC4?H z8bDDT%x!i??JGdcCKfUeGz)RG$+#Y@Zj^2tNd4w8kC#{lZBudjqT7-ZRqiVbUy^7Ee|`yB1rLg0jY?l`K=>K^e@ zlBDIOqE_XJQgkZ*`2%sTIJ4{XV(>j;qJpXcjE7@ZIHX0Rk)ERwOlSsMs@t*75F%{h z1G-jMF&U)WzWWsqjLdVjm*+xBt@ZF>wYB2kTtOTfldEE3VTo9={rWL1j5-wCa-Fxu zr|Wg|LWV6Vd|*EZISW*rx05$@v{m9YgcJq{;Rk&pzW)A3ym3kqZb%El!Ii>!f0rHr zUUHZNwBIJM4P!x-u`SpHJ_uPAI3=1&g0=+NxOie9c;=B~5%?6swR9*)G->YY`0^zb zo*>S+E|xq{>{oK;sf|=R=C1}-s^aAxDS?5382uE2cgWDt*1)o53#LdO#S{37t#Pt# zfA06cckSA>Ma(){D>#8GBY{3SK3)qw#ANW%sEsv6!loY;(PSzHQW4l0yQwp1HaLXF zWnEywjrl zI$Ar&Ar^ZXdDcrn6KoO=wcp>gqou+Dn*m;Vz^Nz`XJ1B6t`<%h#1?Hm`-;lWI(pWS zSfD_rgA=N=Vc0)1rSohJ$wtcXFTK#PDtxjI7W4xykPcGuS8PHD3~}`6orE@gH0&S9 zFS26SzlFS1iwhqJ9>z5pgF7=_=yz~~)R67jp0u21O^MmtVJ;TFPuip!1pgas5pj*L zWl&>VXFz9wXt7X<5Qmlg1I-QH7a0%-1G!KX)Aut`qSz*4MYbkXJ`)XtQ;CCH;&&WuIlc+ds`paW2&|pZ3z2OPwPBKXIBZ(Ck`s8*azvOFk^9#TA20m z5(k)Gk2~3D8i5uvt$h3f=5Wuz;2#3u77-rKB_yN@L5`Qun2EvGVxHbEs(!r9x<)<# z{Mz!3%YmRBRzD2};V?O(9@im;IR0*f0!dH{rI|%_{sKSQ2q#|yME6Gmn%URSV(omR zqCltW<6ga4@Ll(APnhhl&;k+2{fK`1e=y|Qu%>7oSoG15LPeXOLPE}D09en)ntac^ zy+gp7??r6m&5k}hDk6Yth*kJKejIskyTfVBvAR>9tOCR>iGXb6`VSsdUYM?QNw)m* zW@%d+25chOr~x6Hj-%8w?F2qEW?hDTx-n~p%a9|H&`80?6g*h*Kf1yAn1JrN2_`fF zseAfbyyr@y2zof80eor!I&r|H0Bb0+ZqX4pFfcGi+lB*>ICd&2#iBur03K-Ck`ZpT 
z_9478A*1SdAKL)lPGA7>7WE0jP0ED6#`iD+$uGAmfAh$M#`tF`O3Ibx1CV@ja>^co zsHY|l6)e#)rAd_|(YC4IaV=VC%KZ-(aqfDOxWCjNf&n9A0f1BhnTSms z#3F>W?`p8#ZdGpD8PHfPgt+n9?IW(wO)+eWmgt@fOiA!@n{Bfh9vGbrU<2*D*dP$m z9B4OZX|?FoNnDlci=e{>BBxoi>ai*%-D`Tc;jV-tcP5^~$C%M8hCq^gO$R~~fZYyb zu?D#D*Bdxomk+Xet7KRD;)3v70?*plbFj06KbB9Q^IZYMRfT@tOUV$=Y7{S#H%FjZ zPn7C`N1raeQCJ148u+_$8)qaTtP~(_pZI7e%Vnq-h+&>#6_iS4LPt7Z*|}vp4rN4r z=&?yLYbJ4O4GIP9RP^=MlANH)?kchwdj9@(u;#?~<;)Ic(Gv5b4S}p7hSmB!4^-b` z>}3qTeIGkJdt0u$!d_Kxh{^PUqAVOKfCkB&?bkmq$x7vm!Fv_(I)2MJsD61KS&O~Z z1+M}<+h#e(#}{CX;D7dFn_AEk6SD0JuOlQC$9_joCx&BZ*n=dmQAd-d+j?6BD$I0fnf5m8Yl z2u?Er2(D_4#2$!+(|I(D?u22_2H_UV+erM>Iu93IU6`BN*hpv~CzXN@M)>SOt`r9> z^X_EEOoj~Z+c)sSzZ*o+p8Lk_Bj1r`SlLP-I00C;l-gd1S)sp}7G(5W$uC5!TQq#0a{Rm%QfO7=4aa$VqqXjY<}jMNGP^kKg5Av z`vCuh2gB4 z+T+)fQ(7Bf&oV)M3p%o8TLBC_f^4ot1*}GUdpo=6a5`?E!o_A)o=x@@Inp#OoP)78~g zHz%RdYVY|1s-@Ll#YuMKyAsOLA#=>*1uJCCu@;B_u!j$}M%^w3dk>}0v)CDk3{5>6 zV1{h^a}IJDRjNZ{9F(-)=tM+m`3@VEvfM zBTOtp7{EnET%78d)#A)HA$)f1kA3Jz8o_nCw z4qv45XJY^drjX&N>o< z8q!(1(|tYvH2PTXfEhXe1y>OAr?*Lym*$4MyBc^mk%!B*3dDz95)-w`gfu-EWWcjm zzy*K8Unuw;zteROawj7*055FKt|4vPlCbc-W3BY5QyVkNQR&r{>*AmyR!crwyt~C| zth}(_k4Ifl=*hCiYY|t?5zlF$IZDepCnv|OX>$ID-$sZ!9d;ZFKM( z#xqJbtlF)WZu#$}s~^coAmjR%{v(w^9cGazD3O%uBxdKfsbQi)u5%uSiIqPUcst~b zkgleCv(743UPWZrCAWo+110ci$eWuCME9JZMIHtkkpS+cWia7haB-7k{l?pb^&#;x zB7yfBmryK8tQ`jx?o3RxL%@T}kA#eN402@Rvu1M8B769cT9Q~{7K=0hIX`yjeku@r z1?99jXrz!Zp5P5IQD*eXA=qd{A;aG)1#bMfP6!Zy2z=OK_)jA`RYz^!`t53AE-tRq z8~1Ai$cRctiQDYstK%5gMgoYpdRK*DT0R6?JDl0i?~LSt77oFWJc4M|Avp}>&LRL(t9{S&HS;PDuY$1Nk2|Rc z*2hjIfWnR0L16-EqD30T!a9V)vA|ZjBqX%pPrY!m46JJLg3lTDYBqEWR8?PboD|;{cg8~j-d8qtpSVItFd`S3J=&OXDMv}dZ4mJao z0&jjH@d&pIM%N`HO6iE5Lz@#~^`>#7Pucd|FYNe{kRTM2C)1VSKYSMhGDB!R*v|lT z)zK^T*u{;SaK6w@%?;AOPQK;gNY~}`C21{I?tnXltU6C)*s2V|kI`>jGc#V8Mu>SL zio}Bj^-OivW0)EV{z=Ug<>YOfGgsqV5x<{8%BzmhYSrKcKp2Q%I4hHGdOracfo&B* zWbnIF2^EzhKCcH&9lk}4wS}Yzs4fgG`L5eOZuNyO1lKTEbEGgF!#JbR! 
zj3DD|$!UjuQ$cXKPc=5&)s<2ge?cAf95oaH>LA#B_n&_P*sSheMCXyycT2dCa)arA zZ?9mRkwoHE14b7+6?st|%<(|e5VDFbm>B>?COuvxh&^D3aNR2;td7r3C+NMvrtn8D zZt!k}&lB$H*E5%{L7(;8ivf6MiHz#c5 zJr{x+foRV%-)qo^sPOTAvJv{JAC4|eD-;wIP-(oZ?y&XYM6H8SUvYJNs9BT2$io$t zm|d4_d;**YGH#^slX!k;a4;SsQiNRw%qsKx$P02}Fz_g>5~w^NA+^GF>bwLmhSGtV z?WSo)8OC!<1o86p#^k-ZZ|6$a-H zSbFK(x0C8<9gZb=2sGUKNr`QGtfx$*$*l)7DOE8-2e$ww4X0UaX5D}{JC(RFJXz2- zc2h@ze4^tXD8YW8J*0B>c_kw>!|GBZ(9~dg3FC7#UR}Id1=J;Q0@-v~Ss72;Jb)%r zrVz6mdWj}^f_*6>sW02tvhoW4pF|Msy+rtxdD}Kke%dH(#aBojAErl7VLmhv8Ee4b zfs6Gj^<(s2&Bc}E6HZ!2*+xh_wrqLP<@YL08b;h@a%q=rPe(Q(wG|Ycms}34j`zT> zyZ|^F&`9>~*sHB&<7xly%TqJj2n^P zZUU&lZc;>++ELzeYy-!R=|WhFX%9C?)ZCorE}i{$tu=J|KQ`vkAQ*9;G<*m>ld|ht zD5D4&*GAXQViHtLq1TS!E7Pd1j1IaWBs7Ma3DeMil)*@%1k?Qz${)BaZtP41q{`^1 z#1McOL{1y0_1_>}H`RrdNC<;2rICI8HFD!{@a{-83ZYo7yl`tcfhwTdk=mozLliqm zGDHI?#aYtU_9Hj}!#4V%%|TL5_80)-1V;d*d1*?LUo$-f!BrornV>utfx)TJao`aV zIY+P+0yH9%WX=dic#eqcJk8W)IV!X%q*bdqRUSy8G9bP_5c;rgrxWLCkTs2sTVU;TXQy$ALKt8l_!}QK2nr*k-@bi-5+Ny#QX5Meh$Y0k z%ji6}IC>G)*Q4=lFWPxX`O~aP1%Z+nRGVPt_^GAf&7rs0_t~?TK%-mfbDSuL!U*}X z!{YX0uf8@pa;4OnDVhE5ojZn*h(YT6+GG~b3P)BAQz2iU|AhFeF8Zt#slQlt-iN*f z2NjgIEg0$p#Y$N~@0IkarzqK>TFg$3)yzmF6_V-i-?=zB3)9`ikCK`w%?!m@c$(76 zN(F?Fg(5Q57~%w^fNwRMSn>xEjBcbOG1(d{O@jIOenU;?wj zK2C@true)C3E3&s){NGY4NIY1+`ckD`qi4}^0l3H(tA4@4HKPU$994%Ub93?niQz0ZAE`IP^jKVDx(c9(_$DxqWofx{lNXTIQSCn2UwJf( z4Hk<8XD`)Zvhj+$jeg=mqCAK)y>L^5mfdq8VCpZ^BNc+p**hUILP}xVZ+yQL=zJL^ z0~jj;)DoC{!-g=dxHigJ@J)`Pvu$Y>!FYK5FuDYJBQPQ9%b&93IeH!DO5tjm0 zB?9lGIlAFo9*hCuI~QID&07jPGP&2M8Tk(>$pFwNL`6XR1Z6)Q`~~T%15&;TWGgZ> zq8&qG(3zs6^Jl;I{6v^|w*WMZlvo7LTI{K1Y>hO!4a!)TN-$UX@)ZG>S=BJW!0XA+tW_)GylqY;>?|pY6B4EU`dSz;BUt*v8xaF>|I}1huWao>0zc8VFt1J{8Q#~ME4IJl zVZ8#X0hES)OxKShJSD!NK&BW<_=~{!nc^$&pN2lscHpj*M6hi`5;Vm3z5p56jR2I| z(>p5^76b7K_{~1#^NG}Aj+h&BGt=fzfW>2TIHOVnfLsszif_1D(%hpoD)oM}Sx>(G;RVu+2*|$&jE-$cWepbELa{ z%|($rb;fmQu8@(h3J;-iNrWIH~6Hfe^IBgs#heLI?=}FzYL*+ z+(9+lvorK1FkvH5HfJzLHU=k~sssstvfB}y$kKky&DDX#+iLlCb>GL=Vdh8-QOOkL 
zi^N0bk(AVCG||HNY;0<4Xi!I9nmLrWaDz%{Gelz)W`reE3nXKduDX^s%)+XGnQ?G^ zKQh61ojh$ki1IWS1fO^UT-=Th(GY74J6CCCl}s4z+Swls?Y@3E4r6chsHn>r()t3l z@)bLr6p>dshRz{^i$$U)N95(%jWEds#P%#Vr(ndV{cyj$YNj-TNk-#&GSMxd#fji8 z3aHO%V!4fcz5m=k^$wOBCoNQ&m?k~Fyc*z#h?oNjdPA=wT%&@kB-lg{ zS%pxf!nwO=H~J;_G+2NDHa*!0G8Azy)W*uu&6td)HEYX>v_5dZJS=4tf^KG$9RSY` z(2MChdAWe22Ab$F1ce*71J)!>@X0`5^U4oq51?}3KWuHLUM6w(T*bjBbPU?Ll#SRk zsa1e^s%H&W{h<`yXa~8eQ@aUc2C!Omar&}N5o&csvJ!8Ny`OFjMIXKD#MMvM>e5&3 z?9?GgRR^&-5*UJveQ$E8H4awP5J+7iOOYtdG(u7?f>FS9vQ7(VS!Rjc3=eWA#EKzNHAbtwGO+0Vbo!Bw&N^BL)Xhwp;*ek*6YPW;IAq| z_j!vH4IAijhvi_P76hu^&^J{)pPiiwkT^;pXG`GBSYn!q*!qm_Hj+;rlRmj*Q$%;S zK~>8aw$dLNa#NM_IuAZoHrHsid1RK@SL&BuaoBC4V0TxA&#CD}BH@M*j|X2T_!v;E z34k7rbj#33ABlbrJP`q1rGk$8`iOUbM5*k-r1A&9w2a0h5g+U3)~{brNKs*PV4?IO zSodXbbsWy&a7gt|I%M%l6P~gG55XEEy{Xl}ve*H^fgTWBJb2Bk2sTwld^@0(yY&4- z)|W)LO>~{o)qMepHxJw6>TS(QJ8*x#sP3R*D~&D8%eYe#;(DOB@QvAa4DPeVj-;4pjfml1XjeH|N(>fhd(5^M!wR@9PZ%c0?Si3kCO5AejeAj1y)EjQdM zH=QfKMcsB|_^X&At1Ph*S=m{~K4Rkt5JgkTaj+qwL&5z-Qvj@g6P`XoOh0e$o!qnT zzaemw7h_AIYhRmyb8sQLr}6;UC%!s~wjzU679pfTRyZNNX1OqwES zghfc|kaJd;T`H_ygffr%SkO@yZJ178wLde0rL9r%=mQmCU;X4H!Vf=k=m_)bq%H=z z$zcn!%={VC5}X2?G=r`Be0^}EmCG3_V)7fWgZ@~K?AT;gM&vcJn=-dB>}Zg`W6U+S(2YdcXkC%$jTj$o%K)wth*yb^X~ z6MF~&`*0Y1mZrrJI$D#z-lf#IBB7p#BxBxrluJ`LM0NaY&QFb+ ztg>I`cv+3wpD!jb8_qG=P1Nc#$zRnkvH3jcpBRk-EIBhxKuh;sx+6V{&}X184YJN1 zHe=nXI-}qh@NjVz54j+tOKp^#%1nGpj|a?2(!Fv=AmUO;M=H`57`0bP7<7~vpRv6# zvT~^h`!A4{z$N>ZTgmtAYyR03YdfGx9GKL+ZQQ zR9!*pi126Q0bLP?@o4!hAR5z?W4Oqm6lOcZtTwn?e-frt@mHiitWGt%Q2zXgIx!}oKeAJY zv&SAHfSI32aA_PjL)Si16!3L9P_)Juuq*6PC&K|q_m9$KW#PxydSOyjCO&y$_x)Xjn|1M$~=0rJIAr#C-1=$n)K zy?s^SB(iN^i#%Wl9TSK?2AigEUOXG}evG<9%v5XO`@n1GSz9tk`R?R2qyatOsnUQ) zZI61VR8P;FcOb_uH#fI15ri~?G#s8IOa%-}4g-E>3fuHV^{IXzFsOr(yjcK+NV(sK z$TS?2KDL+!motb~C`>uMp#1ry{eLtZc{}lFH4A4f023$l86ngcz~N>VwE;OdfGGbP zqI6Q2NMKN4XVa4wjA|LIA=qTRf`kIw3xdkQCJ^6VONsS=Ltb^y36vD<8X0uDm?>Xj^0(%%e)0xfWIF z@4s)6>ItSd<`mBrfn|qCxdx|+ZELy@q&fRhi8<5+u|Pj<`!1Fa`mI}+!9zuQPUuUm 
zl{2K!)W54CS(-s#^!H>Si|P%wg-bP;ft!(H-K#{1t++E)r0#W*@>}8WWs!D5L`MkUyjuix z?}QIro**t4q@`#;md_;|)d<6bM8>UFWqhQ03^}SFDryj}{fa^hK{G*u-4j=*5lX*| zie@^&ZZH9kS`7+P7%6g4oseJLF~dV+N!vGI6+34`(Mo<(8%o z{yE49L%6(1dFMSwAw)X=#3-Pqzu&4ERWc)VZ5d``Zca%yBK<+>GY^ZqT>&wo4WHWC9jO;jad1PIBZtm{N$dORK>g?~2 z10KP}%PWV6afl-oNFm{+69^E;T3lS5`LDl1KsLc|<2Z6 zK!yNWNXsT*BEp@*`7AREj(UNJ`6)IXgO$(0QPSLe0W6KPXe+5hHNoD|u_sGQT4b$N zmeX`1%B~T}F8Y9QCThe($3q!Wopq~C1K1<)2L5u7c){kP{*KYHy4W!*P@E*Pyh~bI zB#Q8|28r=_KwE6TWpGnJZ_@X>f|S8>H7eA~AWMQEbsuZ1I0J=HK$_}}>Otd||11j6K@U+psB)U_K+~cu_q^=AW7b?08vI&YaUI(wq|Id^6FIzrmNk*2ikWfm# zq^R9|_c-7Ev@|Fwf{nF(`*!%$HE!$4$I<20;_A|urM~hH)Z9$sLVTPWI@g(9<&_vw zi=3R?O=nVBtV!Dmi!06gfbnPwuJJ+6hGA3j4(;=u0dh;sXaHJH(~xP~Aggqmg>0}4 z4Kskw8(jU*t&6DOCk_`~uS)5Q%E zlOOeH;eh!~kmDDQ0zmC!G#W{ae~?1Qk}!wlua`rr)xE$tMWcL8yT&AZC;=1+(2BT~ z9exlfhbp{0>EP(D3{LpJx;yu{p6|DhC%lrcqjIp$FIZKRshH<5|b z$vP;DRAy$5@kNc2W;7`|CfU$!R9LlzLLw?jTB{Tt-V_#*Vg*> z{e0es>wR6X*Xw${X#E{qts+=|>Lp z3sQxBi1ks87U@Nf9z8lo_t1D8{$oisXDr%HZo8jWtB|NkwzgiRB&ZP`sDo)@6Zefp zG_YFxHce_x@ccQ!U1wx5k*p%^mlc=qn##4jljiHA0Tiz;GB9u1QcWs#zsba zGpafLhLMRsLyNs`?bW*hVlF)sVjzt?_lBpdYXQk8oTO+rOkdjbbzE2>SF;%T@JNr{e#LSbj+7p`)- z5?Z$Ne&51jZ>GC9S*Li83CO8ii(taG!%6KJ;Cb##dFkmKBQ4Wh&=8yO$vST1O&+w z>9J^{0J*Lr|J*ey#7V?YYu2o}y}V{LYVpehcVC-qHTKX}H%%Bs&O(xymzF1~96)%PI#>VtO>WY&-P9X95M-oK zD&_F;l6sP0YK&gPTtL_r%FLV-u{!%^=9lJKmCTYxC&@pcQuec6nc94qzf`Wz6UdF! 
z;Ah%&rqh2HQ*Ei!`#WXYTtu4=FNdyXVXAZ2*6f~|Ay6y-t{t3Mf8S?ZTxe(6ll?vL7OH=uV?!6T%{ zj9UBVf-0H=ptkSkt?*scV|1?^FgYv)>8d>heTv_QRkJ9+ymmhcXk4F)n zt;$E3<&(of0%o1Txx$+R^dSC@1)!JP%3l9E#mEf`!*G_oDLt*@j@({9G~1g=6K*xX znFv~;PNTk3J}2LX#^*+I=_tPq8iVlM4p}!*UpJ62hD2puxS%#92Ums`CMGRs1pnXU zqQ0MjVNj=G5M>M`2$r>V~{<{HwT^n>J6GB`uA5; zswi{Q24G-O1=q4bh9?;CdnXcMkD^%D10n!x(@C5cuxa@xo$tbzEtFre|N4V>^G=Zn zJEJdFmIPPcT~BV|?|c{{_W+YZ-~msG6qy zkA{@DWZBs)%+ufYxckt|_@@D5im0x9qX(r~rMXDTjzYxazl*&Lm6G+n(1xX=s>EXL^4XvFMmO`pU0-@=Rbw z#iLKr66kJB>wD>0eas#Il-X%1RI&pqYihH%TscF-xOn;yD_hg#Rq-Z9Kba z<2cU`I<01N#R&S3?#3FkB;&(|?XS!(-W7Q#sps2n4liP{p|Z6v|115y0Ld@ceflE9 ziq|&WTTHa1$8ZrpYKr?ITN<=DS}8TUh_DRz31C+sr*e6e=9`~;CQbzv>{k7cV6;6l;cMl z>4JT9eJDd6)EKH4Etk8Z=FHX0i#%OBv=(cXih!M_M?R%dKaR{y( z*_Q6FYbhiswQYY@=9@)5Dy-Eei@F7{zhY?~I{i5RBRLolb?zJA5?~NN2%1WVqv%}C zwYt@3hr~v@TybeXpbXR9Z}cskxL7a3G+tqdgwmO}8`R*&W1C;--Hh{4)3znbCQz8- zS;2q{xbcn%swdWJVMLXoH4Hosl5vUD=C zQc~rs2%Yw4WMus1h^$veWFZ67zFOKxgg>a#q@^ugxgsDT^it*e&$%SoliNr{J|${U zB(%nK6%gdFl7)*KgTSH;gl56=$z6M`igjHC(_nF#L3-d-b?fq>+l|g{d3hmWS?QLg zTaWqLf4{%>qM9YOBxl3a4?p}WZbhS7hWTmtXJDAkkB;RX_b;`T`a`_Gz^Z(Ge$>~S z3MtEN$w}1j=p?*=z%oa$^-Ui={V29$Za`V~8>Qf zR(|Vy`vZo7% zgQ#wR55$28qabM)4ywL69g$M%;?oV%o&*(e+ic|I3_VPdeinZ;&LYzfK5&qJpgU(y z3nzt$W140dX$Ns-M{myy4fp{pB96$(4V$~!!^GNhreoIbzzi$f4f@}pmNqdnlXUIM zN&nKvq#6%%N7h_r;ui;hR`|ue4IP+Z8uz`J6-gZ7ju!=6YQ+NP8|(l1O&}Tij+K=j zYnl>x-y?5o?sh$^6^@};;jv;|QF&T*YG&N- z`qoW>{+Zv!`*!WM`}5oN&J745vWZIMtv8Q(L>`G9diV}1@I+B`cj?ycDrql@n0_%0 zu{ycauhtIfQej;gvTWG__*+p&%H1e)pygPy*PwX&`|EC7y(cr?I9BIEJzf%(D7Qph zL@Mr8XwiQ1Np|VdI`{qOPh0jpZckNYk~jh3UCY*J`@NzdyJnj6t>|Vl6NcC&pz_PC zYTZ-`z+GQF*uy7fOqw#&B^UaGV zI+ku-{;+fI`{1eBPgnJ=Xx*&earqPkDPUsLs0KtZPKi}k`C~}QsK(yU_xR~r&LDmU z2>|rod$#y}X)JXvIfuD9=;f}ur%qeEiyHkgjE6T@*IPVS5XXG_a9m|+$hh_QH;DJ) zefscPzb#K~Ev;Ff|89FO4gJc?R?ue=W~?>S+A7@gd{$N!c`&|nO*SNum(A8+?pZ$S zCzs}yhkCq=07RNDt=*I2OLv(~Z z8!mG(EU7iDyEgXFhH-aaK<2JE^528J8J_oy3LFn z=V;k|-4J@w$RYCTYnXe@UhFwMN$#Z$-ROV67yU{~jX=KH{$A&=$$L@Fo3%yAO{Vq9 
zkrZaMVgvYLO%i!jH*X&o+dDf2_l;YADk^r({)(loTeogaB~GZEXr=HeC_hdQ{N|cr zI#w-JTvX6;&!Ou9&s*;=jh(t{#6BQXl~dfBn-#|qCdnNrF+l{-%#(9RkFhEOJv6_Q z6diEyB;GP*m6hs8d1YRr$-{o7UCn*Ty^ov2;TJ>+Rj$I+OS3$ujOASaP2rAY*`}Bs z02gZ(v~J0>ayU`<_NqO826|H`PW%FC)Ymm5=0zns8Mc5L<2dPAb1*t z9f}HXUEye^98ZJJDyd2(F*&>YD$_fOKJ5LKl2)8r5Y}Zs z`K*miw~Wq1>24>I&IR%{FW{iGS7k=j$A^}*9|UOd>o(p=x__^xYnHM+UnGobe}99M z7VEEhrS9zW54-zf;z@0VW+GA8M6S@fc`7IgYM!HTBrKCFPiisprvP92J28 z09jMO$3`G@@PFkJ>CZlB9NZXwbT1OX56R@mOo$ZtP%;9T3HyN!Q3`h!U;6rtX&Rfr zi!xVma869&dC6P6NJmBbhzuaQ;*$PMNvBG?_ERYu8T#uP>_n2K%sL9BME1UIi!epl zMlgGSVZMj#vAkto3wXf;BnDz`2@(3=l`rM!-&MYJ1xi99ihB4@mcuva!aSXB4YdTj zp}!Y?!cdkLZ+n%H-ymp-WOtBm16E+I(PZB*OaD5vv|I8Cl~dA;gIBpWGqh!TqLYoV zfH`KltMW}R)>icPOMU_9lX42eDImX`!Yq}WXT#5;4-rdI_{{-%$2$LxS~vv5^?ye> zCejd#FxT?TKY)uhzo$P0C8Z9Yl%hVf#e44e1SLOFMhm7y1!zo+1B{i%sA2QZzlsWM zN1wUrOYfcUEK(LCzStcBxlvwEN6&!!fipT{lOS+=j8BJal-% z{EHjwMZ99q;V)3k^we~N2GC$P=m%}#1%5ble(+=*BJUq2{R+w&Bh~4-$CsX?suG50vZz#7m$Xc=1ePIz`I{4@F%=MsQj`6`VnoddE5)ZqumuhEC36YM%p?a zJa{m^o(x-lm`pa^RkVrJm{Y4RKq>qM6l zvBy!kD0_c#Mkb-w90vqaA1nxM5sVLrQ?r+>Q=G6&kLfG8cF(Q?oZEaWJwvhQX!{er zo)I>kA2)Lf)mNv{FwpOl8XAyyTu`Wr{hul0(n}eaVU3e>6k1w5mPzgl%)V0i5B4@M z>R{PE2a8YM?KB*}9nE91n}_t*T2yiOaiWyRnqNcR*lCQm?s1ikx#P-J?W#NgBK zKy2a(hVgjVvt=4&bm+YMG;^|bN?`pOebLl_->Vg8l{F<;OjD~>Oky>urCuITi9I#S0wcmin8+K9QnDF2z8e%zOY4>P0U11?G8gY_u-*PDwEP+* zV&k1e;HPD*XgTY+M~-38*X)KUYPsKfvQiMI6HspOQOyn%V3HQQ0fZ}V6gzxc^z_Geh3CMt$Wv-s?@;H`Z{ z6r(k5iHublKK!tkYw=PwwzRzNenb8)#oS;0Qs<>hmlBWj+~4{*K$(5JRkCP)z)X<8 z=Q0w`+rOc@sBlr*qF>InU%sK0qO^g{md#?lqw}k;3b34`tMHj6<7H$IBQui6Y5(W% z{a4S;Pjx=%uJZ?@qGde#Wz%V`|2ysaZy2k7ACo^b#Q$F&^?CQMr&?}k?^>;FOO=iN K#FX*#KL0Ne=fZjb literal 0 HcmV?d00001 diff --git a/examples/aot/matmul_optimization_guide/fig/pipeline_N1024_baseline.png b/examples/aot/matmul_optimization_guide/fig/pipeline_N1024_baseline.png new file mode 100644 index 0000000000000000000000000000000000000000..98fc524673cde623e15955d3fb9963fe862ae8da GIT binary patch literal 71181 
zcmeFZXIN9;_AUwtf+$6$2uKq}svuQ5ib(IhHxUE^{)Es%izuiFD7^>)=@2>rX#qq* z={11_h)7QeJ@j&xd;ibb&vVb#`}Kafd(nIZn5Gk5?0HMP2m13Z-xudwFSw3o_9P%H+R4FuB}p&8H@+ z{ycU5&JVV@m6grHqE;$)x^*SPndZK+gXxDXv}RW4i~Ha`rR$t8-1CFhrw1YHi;%q* zU!TV=-Md~luT(#ja6Qa?K}JCXmw{rM&guAcqlaKIO; zt``{7O5rEpP&fYniT!_DVt(jv&UecyD(W?F8s|zo)?WVas`gv`9Ls<2>X6b5vbrC+ zn=EE+W@h#Z7196bzbfay6c&o$XLRsnzVjs1Q_A2Pe{U<4=TCf7H84ASIp(mfJA|XT z8RGRl^09THa;>3~Y3jopPIO4oenLj5=X31nD3$}Cv1hxqJ`NeuBqSx*8`$&l<#q6_ zL2KgT^pGmfPOUC;E=bYV(^Jg}jaZ}@I|uvo?ua%V8oi!ShQ@Y89*HEwGbB6RogQHr1oBW;gvzy7a54SI(e2B16k{C7uHD*!$K##ckDYDtXjZJX?I>x({J}U z2r}2-aB@;7Ej1R+GxDDL71Jf-M*ov2&Mi1BPA{`1n+2s%E8w0-);G)-?R8WyS{VO$ z#Oi1^KHP&<%ua?X!an3wt~qS(NtI!{PDkaE$^m@31?+}L6jS%kZ;TY#i&K%VYzETo zhCDmY+hCIu_zdhposuYWAd-6b=>1FTP|uo-M1r>O{wEM4wi<6uY+a=X=Hb)dD`$-BreSc1|e zNziZkXBfkw7lK392jg%$-q0xoJNhbF-+%7D(0b62x_U>!MOLv%v9JSAhVFTZdL};% zkvz>#As}i!{4AOKReSH%?zoo`qbdEFN@U<^$##z~5>hW6kODapE53Y!_ws@XxY0S1naT6 zIGn{+PRvX~PHc2^U?-KH`|3wV(9(Qrv@zGUyg#3b2-Ss)Ns8hH5f2@F!fD8e zd%SB&-%N_9wj6s9o@h1kn<``Wxr2gffa*($;E2ru4^rO0;2Djn$=SgiTx3-v(_A=y zbsCY;lwztDs`j06qzbAXIkEv^#~85b2w5>O zD%pn6w3e{$I53P9O6A)fb;UYio0^)4Ypv9i_a#|seyT8UWcXpB9Z<#`_N^uXtR@Mx zr}}ef3GuUmV6E2NQzC_Gv(1gvQgvrxYmxxdoTp*S&nfgvH7DV=Z(4MNABJj_3}S#@=dp~ z?wXqIqe|cZThs&RfhJHFGR-gVHoi>)jV`MlA`l=NE45J_hwyNk8(9kB!;wsq=ad*c z1tTAI#9d~Hdk1+k-j|c4Y&X~0PZw0B#U^|3>1<%XhVZ3Df18zVxTrlbD%a$Z3C+!e z5N+C~sclE&@8_FTuIEU+-BO3%IWfv2rKgk2P&r5yB}X-xJiD#(cv{V74riA0$m-#C z^kc>DUX@G-4P`a0JG5lqlxn^1k_>lEzDiC$YEho!%Fnd=beg>gW9MR{_q8feKl(Q29@E4d6cHRf(`%emg^51gtv6#pkiu*VUpXT0Ih>K139qP-p>$;u zHv$~Pqy6=1g@Bj5^E+;-XAL#sJ2MXodLspBqpmB32v9mog}_eTIS)SlOst8x^5!#m ze+WveO`JhUKBlzW;;}_a1&ixaF7rz;kl?@PzcuDiQ;hvUXKYTD^=n=JLpqnk~lN|R2LMq<0BI1Qf zbMNAxiHq+%Hl4^D4#i*Bm}b#mpJ06vuotr5-uSlOVmoX_k~W06*`9d4t~Dc1XVORL zxzSH!f^8deJR{^l4;j8tqKkmU&{z&2=vw}kUI=UOn4^bw2ALV(91Qs!HYcf1N-DL6 zW;>ZOQ%I_1Pi}W|pY?f9wMZu?*9{#quU;elj71(D`@%MY^60UEZMV#HLr*ng*(em) z`R}Cb2J$>^P&)fzFp+4$b&;S*AWuo;hS5VAlW)WBu}2*>|75cnjgfenM2-MqS&rs! 
zXIHx*k2|UNAR8r49?G{u?Xis8ec6M^9YI`FC#t$+x&rc((0P^nRfKLBCnQuMNv7!#4_oS&uCic@v!G^l&h6g;|EDSG} zAsMnB!5<M>fM%K}%^Mt%TraCqfPs}35 zxMbH=SXeV%?Eqxd{&eGXBf`4kU1?dVbWgUJ&Qs$_#toyN+E8iN>p)=7P#Jx*^yob8 zPO-lAnlmAn_gLzseXX0`tAc|wL1Df74*}89>XuaWtoBe-Cx0mD6JZ8z4{%}?1^m0u zIWsV{7VRaGk1rqbr5+?-I8t1Ve+o0psvF#CK;^my92&kWWuSr<(2Sv^_p3AOX0jV4 zk5g(wkTHLfW7R!q4Rv9i6|Jej=lqBSiJ0$ER#OANOlWvE0)Y&gYcSS__YbM@H+3oA zA9#Gt_`gdj|1(dn`w_XvT;wtFHzHIoQ@v%=6k4o#YpKri>QDQ;QANgPa{V$Om=Fre zq?EyI=|2gkmq)d0Od_XN`@a-&{*RO#MQIt+tYaWq@_%~FzvK%%%h7{v$m-T#w~7VW z_VRD7`dNQTmdT*5j6C9qOUv*p|7Q~aC7rj70k$3W%f4vfkFf*rMY{wIF#Gd<=5PKa zuHQ-nR|mG;^Pfr+x4%k)jq;bZ8&DO)V1i>FzA+K)RCY`Tl0RqY*kdd?E;aNa8{vV&9Cfug6L(p zo{9&`Eq`<)zxP?w(VyuFU6a?^a#h>jQFu3Ut3b@s(~}?qZG{A2W5g8QqUkx)NZIzq zWo5NzzhK9FScIoW$6nCozb!*GqCIIGV)zvBsVTa*KrreYRJkWTo9U>|jmt1u7ojWt zZo13^#lKY5{)-y_r?$l-?g|2Nc%2-wC}{DVHlA^mL)y?+n?(=Z6%=%-=DI;oYGGMR|nTqhtot$8e=g)-fY zyPNXzZ-mIf=ZH9?zZ)RoFLk+5auo1RgJ9 z=csqS@Y8THqQ2g;M>211rj?b#Fx`va-5(6b)`M&mEyumhN|It@#?679=EcSKi%j%7^U&3(0)$kk0u`Y0*Vl-;+HIR-b@3{OgRA+ zs9J@9b>+HsY4jKo8vw7UM@bgRV1YUvP@5b9N_z0l+TwceX$jInjI!+bo2$3W8yW_I zJOfCV)#`oLvp0|uRZ-CztKBLGjdTu!R1GNn{YR~Tv%2IL9?JN84GQT9e9%olwZEuN z0@8)8#{3Z{?`ewSNYSN4Tj%+r$b&5tkp~><$s+_4j2<##sG_8FMmflW3lDYJU(`2dXh$}jR7v2SB zS(L6~-ekN|M&7_VB(j;%UK7lGTJgy-`E)#srCw(orvfO0s8fwPIu~Ig*q3_+xcUh;PROHGnXsIdn#a?wXa@Oxp zdOLrDFgsV`Z;PxZrqPYEk)I|JDC=|*8s zmq~U?#^PsC+|MahgBz?+(&OdT8oUY)Bs_|vppFX|R(3f^P>aV4#Eb@~U2FG7_U zs>?r@NApB9QJ*#VC>I?!{*VaseiscdGp>8x#{PXmxbzb0rJ>Z zt?xWgT}>Rgpdgun$h~r188WrChx!JZz$W*Bbu5+Eh_HkcV#;UEia3ne(dh29s~6fhj6Q6{;I)D5L*jJqyH?v@ zpy4p=qVP#764xXBVhy+F+uF~O$#C-X*ysY5czpR+(Mu&e*6Kmouf$MUQS^FBTp5lu zhj&bm6HZrl;6~tfcJ;^qHkO6^=CVuV_QdLt!g4ZcKpNBd$}h5 zKL5i*Cb*ax#IncbK{rgk%r7sjU!gg*_G^fZ-{=NhR=%lG1Uxx=L@aJtSx_+UE6>Z_ zh8O8(fy3A_kJg!oebf!ubhnRmmvB&am5Pv7i{59pg7@l1IX;UCyq!}wGov=Fj^Q$t zn?KsfsjQOEsJ^?tV76qi_qgjyYk^#XA2FBuVfK~_35$&5L$2wv-6)Y~WjIt00ja8~ zq@IR|>>{n*gSc?DtjGIf>s=0{{UQyP!$&JK7H+3TxJud(#r17mPEUz)VkY@aS>+nncAjD355|G{AYTio1*A3A+S%?z`EEtud-#Kr%JhP 
z;b8NM>Xo<6|CI1XrN#OI5~6B_>AxE3{{$x5qJTFX`y#^q->v!oT*w3DRKHaJ zTP*sY&jezC+`UG;_R8N!q%gUD?hiKqe_}@0%RJ&{EKkh;9)&Vin!eDy={)J>uaybY zeOzwfS1g7ozCpz`c7|H5WWiUzIS z+$?Cc$IL}{57PiN}v?FBR*bJG3mdKc?yK04JKUIngTa# zND!W7qqh>N7#Vm2$fm*4BA}u5V7Gae5FG0GtUW3^S`S**gt0!tr?y?A&|7cWMz+ab zQ!ZL9smSJHU|D#fatN15t7{<z09_iUiG@ z#FE?b)|g1g!{dr9>~DaS`|xv3399j%!zl?j0ifzWkTGjic`Xs!5G;dE%Tj{O93CDm zSyNU1wRYjkTKu`)w46Fd2ISEY3JHJ=6{6S`^H!z~V*HoZT1#g4Sd1^-FKV(b+g_h( zgrM)(wlGB@O`?)c1%$Ql3%Y8JA1yJ>%LSZ1_^xUE213@$XJJAxQPpA7wO)(mXizki zdK}F_+{>{*n>u>rf_mC2b*l;sA9;eFR$pYjqeoSkl$ET%6vaB`=7P*R%zIagM4naw zcT&R;v);G78}3Gl&hYB*y@pSQB@L9EV8OGA^&QbmHtuPP z<@Oy#nX*3E-*})Zej670BX0lLkR#-pGTn!w9DSkWnv|5q_iv=-w>FOTN6Q#vAHkc1 zm??~Cb?r8CsUX3@qUpL5hbVIg{jCokQ)3hEN4v!8QMbXMF0=UQ)&Q|fW}m~)X2dH| zJ6ATr!tV=KD}RF}6ZIW30bVsW$}fL@SHvD`a(s>cTjB)0alv%TFzgk++;PO4+uy)S zA9uRei<+EdmdIEX?#Yl3+Dfu(^1DTuFD3PJ{cIWsz~I5MEfv4GFQ^@_g!?Ar@$A3` z89F#Pq(y2I%u+^IB$q&E$mv6kAn#)kf*mt&FwhO>OGZv z-Yzb4CYi=|cA#)V19BFdwmEy2@udkDw0;h*$y| zHkJ%9%%e`AhtwxWVIV6egVDKE$+rp|6BgeZM`&AF065g?h^-l&a?8)uwtYE(XG6p0 zC@M;T_t$pFz#V+9o?<2f@7$A?k$KW~NFq3Ce#ug(kM|tCZ!v{>MBeYCgbo<*V2-h)w%W1aRl&7%%X7S*LIm;>OB%`*NI#b*``)EO z^h{tclWI>Y9$zYcW}@S%^WCp$*2)5j5syx;sZnfURxSmN%Lu#Zq#I%CVP{+8RU%Z4 zrtC`l2;5SZy#LN?yQb&TkDvF8WZ6yoPvWpO1)cPqQ$BoapCT|1SR_+B+y1&J;|r#? 
zjzWD0erJO(8NLM&59uK5Ck(_S>Oz(b3!(U8-J%q5|L%^7L6qZr3HZoLonGMi5e}c$ zIr0+WS={ur(rze>u+nItc=gm2Vkk)5mQ&|5g6y3zgnVz~<(9AP(TB+^2=jS3J4-x% zrt}*Vd(jj&_lQR7R>=1bx9qTyxa-Q-WHhkwh!HA=7f%~$->nSpe99Rw5Vlw?Flf7m zDk8$tK30WsM~M?U?lPwf=nxQST#e7^qR+&unAA>Nn^NlErP}|5Hvf-zmKP|PS<{Sp z+`POL`)YSisk@14g9*l}Ppa<=<`#vrEh#}#Y}PiW8J5fjV&Ey|B~91xkZ-jn$+?71E`&!3&Lp5f0v z(!!PYz2rkqOBs4C+`~kuQsotziMV5j@a0ZY>?_C7R~J)P9nP>Sm(*AnV^Fzd6WpiG zD%E!f({2bzt9OR~s<~Fp;wXc3xjDH_DRwOjm3r3WsA0KK7SdBjKY=W_fRh$ zHmtv!RN^l5Myl$kb6Ro!#lZnZZwd0we${8v{T)22QB^-*h}xphTl-sp zsagQQ8>M4P-Tr?K5t#Xb!1#;*wah=^K>u2-X(6-N)xi zS5V8c2R?K%Px&+oIu5hz*7lNbePHbaJ3_fTqpRK% zNk2@w@l};N-Ja>@Z2kq9b3rLRHA%N4=o`auyAWh&O0!*WaC6}<96NaO+)HP|XC^zf zb_X&Y9Fo=PZ6EjPL_x?zMb6JVsm$H%#C}lU!AEYl;9hh~d86+cwHoNYAgXKuAF{G_ zw!9r$g1e-$z1|=<%dIX}Gp20Hkw@}uU6@FfXEn1Z_myTjxwEpLL=0pbjD3(I?*$IE zBE)}Lj4OEn6FT#hNBv^C@Mu9me(Z5BZo}9Ct6v#kTI6p<5<$;a5U9q@+FBPZE5jg7 z{Cp|9%!GhS(`M6N&fR*;rIgAVFLjVJtg`ka+9`5&PtiC5m+`#)>6yFAihO7zHfS35 z60{jBL~>2Xu-I|GOJ3RUn|658>krCQ@vjR9mm9AsCm;VKZ5TDD4RVV=vwzsKaA(vN zET^xBGP*&I5N)!s)IwXuTaW*SwI^Tiweu$kuD=%bU7yPH7KLe$LC5(FSA|!DFqQwI zT<19SWOneY(`lRf4Ht7{E+7rF?Uf1!WP-EIh)!*IA@w4(v)nBQ4oRKE)9c_OV}Iik zr1bFXlG9e#mR?ip*td7M*7l_Dq%2h{-yVH=)|P|@e;I6k4*pu4LE<8r{Pg*{=PIb+ z(&90*$z1K>P_ouNW|TY7XzXYB-Iz2|!~9sJTY0QHV@Kt#ikP3TO(A4pC_U{NdPs1~ zQUkXYs@3g~wSK#CW#4Aazr07b7$e%+<&%{c7o zM~{PhLv2e=OaV>NQ-_!m^{eCE5O=M+ccS_0x{f<<6x>UAToP)zK;Ybve|mBVNPClS z@MS}RFwt5h{GBMt2N5LC4Yq>UL*x{Mb|59brrp(H%_+dlntL>gk}z?`_L=kO5?`&Q zl(2Jt?^0c|tQ!XIPLD2ITD-4eLO+y@M~B#DPFZq(p1q%&&Mow-4Iu{Z^W-dr#T^)r znWP+YqBr5xH?*tc9*9WqVTBPN7OdT}aM2~P*Tp(X+PU&#Ba*G2IoUs>+W8(Q3PMX2ao_-!Y%9_gwckN0`> zG^WUiFf8FjHte1x69!?NYFgK`^hIm`{X;(GF}ujhfU27Wvu5)-B8H4;T@oPJtKm3) z@gZN>2`gyFslI1aKjmDVkTo&>9)AlRm!&kicHx-|Ck^~O)bi$)75*dssaGU*!?#EgCOf zuj<{e=IdimHx90p5>-iw{&kCTo-qL;P9-7C&>j}ehga$qz~8&g9oBt?ZwfU_>^0bD zjl@=i>nrgSr6r@K6L(E1pb{5ueiBr$JgFO;9kor7uKy~FL5+EPzOJZMyJcM&ug#4? 
zoEtX?YEBW+X$7JP8n{06HFZI$;`i1{{2Ruv4x+A+xKW}e5fLc0B3k$aoeQ`?>&wM& zP_I0z!84nc)Ir*Ier+YE1;m*}Lq z2_M*jbanPYm~_&iWg@Xc-fIvQI*^e4L-jrYSn)B&er?G^2u>Y?w7o8>QKmN4-IBpN zg%2$l5^t~S#I|v8y%_%a^_xgf4+f9ED>fcJt&mg})99W8P!~LPbL5R(u!#_2Iwln^ zh-&iizhX0kt)BoZ9?*h#a2gwWkc`N-Jn6-L1;c6m-$0uD-I6%1g z%{jQWN>xt|EXiDz^I>LUGeG2dVFD?Oi~4DrSu`qCP%nrwt_HhwUJkS+S6V5UUGq2P ztU3w1+@?Hp`Z>nh2QZWtO%Bm{r_1}|PO*0bft&F7I_--%9)10rnvH7ZAS-rBn~a7(CWpn%qi zBK3(u)>%MFOa-YP!qIfMQEunf3NqtGy9dip8i#&JY7=jSsumSflE<5dSNZ`|dAsJ^ zq|9m;<%-})+-tK}Xlx=@yNOQZIy6}zhgX#JIJ}l&u7M9Rh=N3yE?zI*8Od;G=oE5a z6LG!fuFnD#eBgA@V3?#jbS24+(rc62n5p=VtI<&jfrjoP{1PRijP|W(V4E{bxl3ll zsM#>PPvxEs2#!pPakBzUN)+YT)3Vt(4#kKHf-2or>lJkds%3d|bOc1rv2-gEl#Dc( z2&3$|l>l}?~gFW~xQk4_^hCpqL*dS>DRJmQ3DDax!I^|)uT_1@KZyin%H zj~_AJD}G(U!4Obj;8%7I4iWZPYagKbAvU&Op5D0{AX|)0G=cS408s17aA2nu2PbE3 zcD6kSJ6FAzj3*e3Ft@SsT=`K4-rKR{-*&7~4Ap@B_}wx<0)w5Jo0=dYR(Vbw2|r#U z>g!&jNj{;svh=a7kSvxM_42M%2fJiitf85i>)hPc*^?hT`E(4imTnfQ;dd6X7%x40 zv2jU){Y@~8;Ud?FJ?-cggk})N44qS>-ZL5&JT{9Q|SxPxhb%6#T z8jM-AZBbz-P*1Y57}%4I7*t!2W~c!qT@6-u9PAM=JBrLl(7$}}3l{?@(HXlyaq{Cz1y_*;eKOiu^ z#~?$?5I$4aX0Sx89|6Tl;J#Lso6FlALds~}J+v`5u>%b%9o2CvJ^vmFI^p^{V*Z#| z7`{8UBI&=ldQinVsJv}TlN}oPD_>y>{xL6IR6VDUg1yVrhjh6tUglL}zLc%H#O_bQ zCYE(WOu_`S_W*w_ZA5C}c4^=E6EKa4!24d2d)MPm(t^5+7@#9-tfZ3_2A1PB?URV8 zb<9svaV*M4K65v>l0InZYdo! z6~x$!vAhOC4U3!~SbBt!bXZpY6^i{Cd-c_mx2^gk=g1E(S-Fai1uep(o}nYn&oLiY zjpLPQOV}vBF%PM|2_CXkM?2e`kmvT!73o|_8#x9yp-rPwD4d#DMkBME7CEYO--0w^ znl&sj9!!>_P(k7lLwxvSmD82MqTTWrYCSR@9$p^wX zlij3N5QAZhqrA1hYqK)T(4xx>hM*ls6RC_$5y`dt@Ds@1giq+U>}vek{NU;$5$n`i zQkq}xW4FYnKh3M7(@_Dkk;PiH9jgQNu|1+&Tj2Ql=<}WLH8%rAD4UZu!T7$$_i_d8 zDDLB>v!Spqdw zaW0pH+kGc!9czo?(+sI&-pUtBv+A67-8ZN+0%(#X+PW^xgWF{osgqNJPk;Yd_LN7e zl5b7gEfJMcIGj`23l|m5QckMaqrXViRq+mHv*ppizxT2z9HD<=04=4PsUPni2$FH% zOGq8003JxBS!RE#BY}pNSy%4mG}xOI4jVqn`NhMOG}5W*;j^uF#Uo6RZYcnCtOzZ6 zi_P;Ar-jP>J;QuIFa%_IV{wEwytVuNA4{O*1kw6b%=yz6eH*4P;?E4a0@ZN`yQ? 
zplV1~R^e*79+3moyV&oKm8>7gYGbtLzIN@>q@j}X*PIju85ypcBEar-7dag=(U_>} zl8hpRFR@;e$nPV_%9-KrFhmGmDsSKXVtR3j${y*E7{eJs?n)=K3Wqr5EmBj2tqJ?HH1H)*4e zk^Pkj@DH^p2+kZ2p4oK{FU8a>B#-AioxHUhL%K_4%X7zSsG~z@^#wh(8R(4|Ez2o9 zm1(QNy-y&Uua*i=ffh!|OT!I@D-&Z}5}zs@+AdS59t3@oVzO^=^Y>wKdAh*Ei1&8FGvA z)7G}r(@=JAgYd#ZBgByZV2A@iM~}M|LZeEx0&o}sKC<;-0>%t1-j|!WUM#eu*RNHL z?mn--eQ+OPmM( za|#vHOLPCEu49kZ=p<>}ng3gmaWz=Uzx&Z2FU4{EzSSi}ZYE-$0j@nkY}%D^ey(p~pVA%}h)%zAzyGc5G2? z^XawrsKZ0X{L)>yoYD|v@oR?BBF9vT%+ixu+UsD_&P$##}~f*G;X!oB#qovx{M^zj4QmiTsEa;ksE z?&bzr7P7X^FnZQ!l|(8JIk>v&y;`NJjE`CvyM`N$kq3`M6(q}?3NM~LB7^oY53M)U zBxO9Ie74gRwc|B3x-&|#sZ;VaP##sAzX*}9pTUk)**l!gRmk?a{OmRCTH2v!1U09W z-+obcV_c_%7N<@ImA;uXyEdjj(4ue5G5TSGts;?@3S%MkA!<`6XlY|$3@ZN70a;8R zEMR2i*mNf!$ay@NHyV{Y&%JJRn&KYY3R`Syv*`O2eNXzOuiDP+aG)P|Q~yyyhN3&d zu+8LdL$oWep+2++&nX1$>n<9Y*bCptv(mB09d3*&ORD6M#dTeeorXzz93IVrVH`qp zR(c0sg-(MRP2eN_YXd%0(K9B`w8L?Y5ZO!TCWZ`sh{|3L$<##vp-?uNEqUeQD6345A^~*8_AeBTa(>oa8ES47K2NXF zfVo!QaOfo{q!D3+yp}lCZx@%J*s*T(yvwkXqJ(xWoH_2JWm34Z4tC$3>_s}+JYmSq z$)p7TlnK`OXK6l**@jvMtIN*$zqtSai;l}7hp(FNwIkj?MGjY903s8EnOeA6ms*tvU#8Mg+4MQ@o;z7q^@aY}#_Fs7 zkuuNu>CPTGGSl=ap^Jo42W>6Lu-(-{+M%SO+O@>-U0tzYvJ%0WD;RWB^?8C`0JCeT z2qi$4uUeEWnrdo(_W-u}RE`JnqsBeaMG~p!6%)18$UQ7oshU%)qe75n@djD-DkltQ z!>2fvLzAQB`4aKeWc&jlZ~b;*`In=~bDO}Pz`F)#-%bSY+qw5$KkxOA=bQMGXFH;mq|L^F!;Mt@&b#0z&kB+l(`e^H2kCW7=-GeibO!xCW?r99f~Z;t^xrr+DW;zpEGl{{j_x!QD#vY z-6znmG5&Mgnn*e*&j=}Qi%Gq6L})B}1Hf6JJ$VJa*;oMdHa|v4&-_^HEdY)M;4JAgzfl&{-BdcL8C*&q z*-!v1X059C_elTBU#qw)cJl;^uowqR3k_N~3b+)9YXssT5sCNGdgO*nbBFkD-(IzU znmMowJnIV84oBr|k!-64+!nVQzL$L16Pd|cGN+U+r#(23s zJWhk|Tok$DgkD?9 zGCNXwU$;YjoXiW$Z16Bi>#i_#Ivso<)KYIvnG(iQ&hmzWWLTvIy-8PuM#DJ5go)&N zWn7MU0#TBx$bW6ZJ6g%T4uT%okP9v9iPY#Rmj7qVh0u1UL=`R{TCC{G?-z6%9B6ECT2O?T|58rJrQ!ZSMYMlml; zdwzM4`D$Ed3ov0Sf}rDpPPj5 zo#+pJ8zJ0FN|Zvzs%o|#US42*CZWbg6XW`B1rwh*hx**U9&S{POiHi)5M8IOZ=?IY zHoZ6HP7~!H`rb3FycabMwO1p9(q0|%X;{>Is#O<6=DusFUmCY)Vw71Flpb|*%x(8& zy*ayS{G3R08NKK|R)>*u7EZ03{WEKqR7QfgeI4*CQ;myA(+J&Olu5_QQ`l0yrrNma 
z1aJEyP^6@w_(vyjX%#wT{xCFf^g%&avY=G zs9n||>)|WKn!F|}NCW5pO?_{AzK{X-XW`0|q)a$&MS%&5nI$tG9?xa-f#L!1;}9-N z*%!dGp90`P5i3I-Cn!`3#F{^A;H5|jgNouTd112&Ie)*F^fH^a+dwu$=h5^Cgv-*{ zC}R2emCk9yK~*T9Q~JTDz7i}z^!BW5bL|GOe<0rC-7km}qdOPaoq9DZWr0beYQUi? zjKO_jsi8+q2Rw3Y_G2uvNSuUO=(SLLUp8EeNV;2~YLQI+9cWMv;VPwu5!cr2^tzS7 z_kUJXJ~ycI%5eGo_DhUubW7K59|LbhuZ!EU^DmEChAklqW}xNnHGf+%YRVYDYMS1s zKaur@;6y_Se(ph|LT6CbjIU~4-`1VZ`ds}h@*a42N>3xqlB%CqGHq@9;Md`o1GpFM zxDMx6d^=J;OnL10FUll~z4xr7KX^FS&nsU3v(NI$S8oH`i)^MnWM)f`&GIq+h4h%# zH#YH1JQ;%dvrrqgs*<_f+fTs#m1lYkal)?)2((}tTxLiesZ>(kYU<4qXT{Z-Cna`- z?VyX8&@^W%tm)n(?t07Zu8&*??MwZ3)rabSPky9+8-t)81d>CgZ{!TPO~aR8s&kJ zhe_Q9J(MhPgaQ2{4g3I+y=D#pM0}$K>;po3U%jL#-TcVU@{ngt^3NE|RX|viqX)Ow z%)>p*?~B;&i;*tW8ow>M`nh9C-LSi|-b|J{Yi9F!_`8C?|NQ6B+5_&1uKC3wmiaC! z6=&Jo6^v;@3;IrzG)YFP&?Tk4rRXuA=4``O8UGn1?#OM|c6#YSbD`Yraphu{-MJ3| z<-S|HESp#>PQD^54A>QaR!~#c zWxsQ~vlD0&I)X*ab^2g@WX?jvJpydLGxiQvKM&iPf9qYw&GAtTFdGS!QfGT%luX83GOwY85*%`l7$x9fb9T*r` zxhlfC=uwJS@%hmSGPeUVUic=m!7M`Xi#}TL5H@nk6eHCR0XjX0b+mSq&v+}Iv>CID z`6#yyRN1SHB#d;L|$>kyKd^TJF)E<^t#DWNd>@KJXnTCMVbN)jaHu?>dGgE z@#emx1$OyDv#!il306{4Ob=>4d+K51si?QT`Le`0+9;Jp#TIHrbH=Fr50eL!874zC*o-k{z+Y>)pE zq&%YpH13>2cuj7oeC(_|DSY*>Q=a2=JSLYrTa+YNJ^+>EyJzO%j0ou-Ijw`G<#{WSZXq_vU{d&Hs>T+I@p%^j%x(}5D+a?wmI zsO8BV1}7s*c}5?o`x;AkqYadx*oFIo2%9#p{rlJ6KKl$ALQd{5JSUf1PnxtKy*#W_HUj5AlBv$faj`eH?P^Wlo!BgT z@{bHk0@6uqdprZ5Np8Vk!iDp+S!eL`;|IQIg#Zf=qs1>D@JlVe&H>9WPoJJOsO9K& zjxJB3V!z~|wV=y%$~!5GA#y97jm5qJcR9ZbBW8e-B~99RaCBbwPI%#N!A9mOG(hv% z`De4!XMH6l1$-x5K+^F`&7;}u-nQXZtJ#>&bif%z@Sohu`!;sj0m-W0L8(g4!6B34?A|s4K{p)Z^=SKk+=Y4c ztqZo%nC`;hV%dV8E_#AMhYT{bFWMPE0@X0z5z-kU-@I4IpJ37NUUAKpT}Ank%jEx3 zx$q4X0K;^Z1SyiwT*$TfIXV)73+Npt-2>k z{w(5r7NE3HXGHs_OXrKP2}a{pevR`M=xCcYoG%|a7_vXBG5^=u)jMLXT>MX|QBDeU z080JxrNPs?5r)FAqeTLTcVB)sBL-}}e;r>yX5P+TwFnx`w!aW9gdVC&erQ)_xP5`Z zN{vmBdiIW`#xdlteW3rf^vDt_;LMWvrMPU>`3J>g%*UG^5o%lZ?Xw%4sjIhVUw|v* zkLC`=L4FZu&KOq9y>0aRdQa>qd)UO4TnC`OciS`S4fIR&Wv_1u^z$_U=9W1#+#;;> 
zF!!(0F4Irq;0n^^Mx{`%ZX4sZN>zTZ3*+m1<)7A~bCaVs#@JyoUV6!QK422ZtxvM1 zJu7dxdsEP6j8CI8s&V3CySMFf4Y{20;T37ySc&;D2AuRR{HN5>> zmF>7OqC?$VSj}Xgr|SOa*-LwoZOU7*?-i5LdM7`5tyv3h_-j*j_)i0Ux8MK^_@)lr z$P!dodvR7ENX{{tly`^=UY5_1Kx3$Se^!pBbmYxv8HkoeJ%UW%q?C<|KkV_8b!Lmjr}b$X?@%%{8T7p_MZB7y&FIveSw@eL zRoow4*6EB`9M02tAogUvaXnu+vt5xE5#?&Qd{Y9I}vyIBz%Yid= z8XK`6k1}RYG>^vAp1WtAI`q9EI@GI*Yi}54y_^N^$~$chR(D-w*L__n5UZUtm0RK> z&f+;U*p$-1NhRMihT=kgbxpUO3tgZT?rr6N+ZtS#*xqEW1&JCV_uCs@R~T?v(Mt7e z++P_Xl>@}QqK1w$7#mSx%RKQ&7jwG;w$aYLZOQ2jv%mLLEy<` z_9E2I1T(D*IWRE54!-EIbKgnDb&`M3CAR|JeYE|SZ$jM!%Q2yp?WDEB&Lj2jsxFJ* zVBf`_Jiy1PJLRWMxgv{(Opq_#Bet$!v>^ArW9Q#($Zr&UZy@tAi0d5jhynh5een1& z;+A`w!Y(8m5y{hP<|4;}`L~Sjcg-y+t#0qg33qPJ4ci(m=IA{dk$6$L+{U-ues1*= zSijt}teY2N_5cJpROoSMMhz{uQ#Xve$KW?4$IY63vvsni+`+^v{<==N%+0N7_-7^@ zbt~sTV#C)ZQ_qA#rHj>@&rZ|9T#n)!&oRGZh1qnFsfsm#d! zx|^~Tr2eR9dfl@uLN%?Ar^+^g+d;`p`7e*_fh_reDe}Hi#e{4wD(XrTK54g>eH`Vs zIM$;2G0-ai`ZS058{}w=QcS``>lvjm;ugp0svtrY2m47LWaLR~xy@h=<-2kOv)V9E zzGavi$i;eI&foKKhR4R=eo%Py&8>Yv&q>ei4JMtVmU6#VwVeK`o7V|lE~ANy=l@~M zttj!gn%T&i(LOsIrL>l)A2jUQj=>-=+t@+^ku!()Dt(x=7^3iW8WxSs;I{a%1cuBBCkBwXiMK+wrdJ)zS$ zE#Bc5&zdiqoiTkR_$x~2>yXtA!OZ~Cs_4y|KZwA$KZQNarNFf}zg2W>JY0Y)0q+^x zhhBa$Sl%5XxhZt4^=(HC&$C<&G+j8Z{bz0)t4-;SkWj;JFt5)Xqo>R^`S&j z>0s4H6D7Ga7iGeIVt^;P&$|x)jBBZg12KMQa;>ZEZt|wqRDmbQpX)36`w#%XP~iNL z9$zU3Lay-ovIz2JtW-19Ycu+a%sa?sb;3p1hZI{^Mf;0^Xz8&>*_j?%`!=Oy zsrt({RzTJ~+~%2@tO1mt@IT*cobH;&if;d>D{9?NiMca>X_tmLzBw-@s3a+>;2E_= zsdBC$E;QqhfNx^bnr?5l>+A2+k~pH z-SK><3xC;8w;tU^S+X?ljon(Lr#6Zjr>w#|@aLcIf?;f=X6~TmcmtpqylwP|$|dg5 zs>1yE=j`-})0mVS%E7_czuO-jR2+B~S_gi`9;S6}m{Ea8YP2a-QZ{YFf(AL7C-dzK zC;+5fbGzjVqgcqgrLZRuiS%|01t_X27;yjm#0}(HD^<2{!zZ zbLA$pZE|lQx8(Tx=*G`(oudpq!({#=ru&boU`g2h<2p{VmzI~w&NtC>1nvqJfYZA9 z>|NfTR%4)q-N@Pg&mNNbSLH(9eELnTO?l(wpxM9EYAvK&|At%fLbjwqah8nQ`EpbR zca3><<0H?}=!{MKj2Mf%cZBtNmtAu@)yj8%Ume+cgR}a%$|vz|x!6D4^=RcFk7OLkgFqP4C_@zh>^u=NG!zC~kh2H)~PG9l(j0udy9xc{G;a$tI3%r@(L{ zTf+?Bl^Q6hhRa`ttN&kpy!7yZ}#uag0JMgF0!R!JXgk-ko}-w 
zQdj1H^wj8H=qsxvy?Z;%7HAH}=B0scV71HyV)tdaqD9PJRv}kv9^yzvZV~#U5ExUF zFQj0h#_EkTCD=B%6meFOFHk7+;ey8J=#jUDZK3Wh60NSbWizHgEpV%3=2z-Vbu=(f zaNEJW=fl3mYNGNn=RNsf3b*#u7nLf~aq8|>d-zXKBNv}|l|Ee0d;c0gn zUOzYT(rA3U>^ImP(Po%so$29$6jt))6e`E3(WHjgjo227g!e-cOF6mLRAElgi^Vtwu1wpF89`pYUl`5F76hiKhME!UP@c0nDcF7O;qo?m=qqO*EbgGSnpIAdGNgv3{yauaJk`h_RAz_vI(XDM60y7iG3 zY|sXjGDK~+Sd*EH3>!o{bxC0?lqOU>?(}}r#=am|Nlrk^Topc^bbeQ_LIyLqvP!2m zhLv#HbOowDwnR$3p9jUlb+$&JQ667Xq^31(r{XNB-2@N2 zsTR;4Z{F#}YN9HWC^=8Iy5o&U>IuS<;Mf-{-hZuQ7;D}7Y`_-|3xNg?;OosZ*aiyK zvo#b~p|9R1Sh^HM$dqP4>t?O`IF;7cgqXq=yQ9KZDQi83D_wirOY?!l&;pvmelf>+ z4qp&_U%lL`Tl##c^|lwiMc$xg(0kT}D1;^MwNFgbkvF2M_&^g$a#n>!c{O zFO#?EPg>*+-#d>(j~wgt?%1}V#KeIubQc7^zmjn_ct)Dd+aFq`YRDLlVN_1uGtorUxK#c=YdgG#a|E}5*O)!nSG$-_2-%01y-10=j|<(O zIE(*jskY`Pl$?L6p?$Guv7`$;KX6XC34{!Q3WYAjOq`HeUZ+7q1-Ba@*~$#sw*1n|%~4iZo1yRp|a(HQy+WGgT7boocVN8&nYp^26p#U9vmFVYc!b_ zRHmzs3Mz5PTc|$i{rh?=Nl-^ge$HZX=lH_NeOh}GCe6L7{ITBb|+JFTy6Cl z-#Q>WHWHHnmx(U9TV%a=sQ;S$3ueON#jTR{VwguzqJ@Hgq^#$>?eysEAV3PH#(|X4 zHuKRp=X1yTEDvO7b=yEiSFtCMgkAs2arBL{XqVkrxuws5b*d2X*Jlzz>!*(HL;9SK zT!4I+<=wHfb-tb>2QQ&$kCn0a``vZ1xOtoB?vImz$hM>5zXk5#ysJ5iN8bjsM>uMm zJMEO^j1CIQa)(~+JWrn6sk9`0k@-Qts%(Vwvw;~3%EV5-`~Bg4)0t3>Ovb`!gG>BM zW4CmbIgAs{#g2Gezc=}c$oO09W!05>wRy{uz^H()=92#F#7}Ty2OU>u0adcbX?Lr_ zHnVb+s~ADu!tH%7d`pL`UWD#$Uub##vmYAl2T1MLl4dH^4}PbLLbDYM&aoT-xoX%R zFp4G`SO*bv&jw$+z(gS&7L-g=q(1IT-HO|qCLkLFR*BF>et;vHHQ%&5uqGnbUN-vP ze`^5>wfB?@IBMsoy@lSN&d`0*u5dCVms-`ebx8MXy~GGuD^$Piw55ya*7<>mdX?D9h zy(NL_F`F-34!5A5`jEaaL_+Y%P{eHPneJN8dkBp$K`BW9Az^XLKHzfw*6#WnK!3)Z@n>ZBRg0qz9E_MhW`EcGj|hYvII^Lvku8S`8$CL3 zF#fMBvn%bi@~8hbV)N(UIDV-t-sFmnr<(iaL)#NlnQa4wMaPCRW_EI%o=>D+Za1w3 ztBXdnPHVxc0AdCBv82aH@8Q^D4VSitalW$P*}BPBW~D^d0kwwu{8g&XGicr_f#tf7 zPkfYC!JMZ`WxEEsD)#2}cLQ;ZB8}QhvAZ?K(4Q?fu&e|$Id;$4i$w#{YjOD*cVZJ} zkWc-ee;|g|CvUdFm_oR@_^#r$OOeEEKW-xz7K$6)7ScKh-p)r-ncJP zcU~#hdl%hQ5+ivthDUpy8MHymNIWuqjrcq@-PYGQ0KI?{oQ)s2d3KZdengMh1C%9N zZlKFsfz|!oGU^~t@%XeG-YX80S5&cXNU9ANMSNntc5*Og7i_x;U^Zc^q|#jX9ifQ1K%j#PUh)FQyl 
zRu(`V7#dm`T4}3`onhy-zRY=s<3!Bsrw@5gd^j~)E2Z(*W94xFo1M&l|H;(Ls-4cS zA4eCym$JAg9)9TX*(-?d-W%IF0Ox>JI%#Q%UP@YRX3*PKR+mHt*WD57@S+;rdm*w~pvz#vl#*hA|(!00=;Xl72srHt^*_n6rq7SR&STgz41F z*&-QYBn8{nh|oJ>mWEHDKFnMIH^AhsTcF!KR=CJ&?@}vgvlHsMPPlyOZ}(V?C<>kI zyK8|~8vWc34xzRBT0;iyoN5@N#W*)?k9)Tx*q&~O`4&*G7ljHrUs*L>06{;jx5cGnYY5M5mb40qp`Ee#>JVgXx^ z$jj{oAe~*ZXA26+eDc0`h>s~}U`oi!xMBu;39MP-94+X9Y7)FWtXd!X;^idFkRHbZ zWF8YtmpwSL&l$Nq@d_z=&S+YGCE@!wy$l4E1Y1x|Mm(YdQezwfMT&=x0#&dLhYx%2 z=M~(*Y>M#5#xivOa8rq|BYQ3?qi#~I&V5_iDD6aEN0;Z z8_m8r`n{2moE$Ixm6b%*8A$`dgMrP~#LU)uLzC z3UJLpV4eSGcP-e;%(TBJ#nh^WcZ94d?Sb=b$sGQI_=~p! zI95woNAx_9O^UlBiEpnR_mb%!6pRRxeyJZKA9{-BVRLWdL&^5%iaGlQ32J@Fvq0lB zM#P7Y2}qu~asT_UTu@Yv@-ee_hzT+CnK$)z8k;n-WdrXd?a*Ma1F{&fWPQ@A#ToQ2 zv6(i8YVM_ze2fBA2RwrLe*g_BU{%b2(yGoMvQNQ@Y$46EAB zbC*BCfRc4bxZzmze*6HJ%0lsajiX@i=U;TMzH3?h4t0sZ--Be@n}N%F;lC228K9BV zwmw@tf}xB=y)d5Xai`rKp}~bTk+!w5)hKhHUoJ{gLtXRqQlgG~EODhi62fE9iqRGf&KX_H-5G z`Zku1Mo2mgFW1sE1$Umb4U21q(B7@CDYbp&z0s1FvIKotyhfMn-B$UA>hyNdcN3te zg`sMA?`0@%NvxVzXT1k1O=zy=>cOX8yOC3Ui?<=_&eRZVLOQ)K8PwOClb2?#ylUH# zCRMx29WvH4v4zWqtLb8?g&tdAB*-RrGDHKm<1GT4Z1F1)uNeEZsz_JUel=7qbVB-& ztHLS6pi_E8YWEFc{Ue<6a%o8A2gROiFE2rVtED~B4Xic>buy&*g2YfoCZJ&Ma z4%|qJGRusbJ#^B)Y|7FLi_u!6Co^{kMkCnHW@GDJ7B!RrXaNgi{yEm-jCvAd9iIHEl@>DTYyUZd*rOq}3rO(@s* zu#ssr*(UB~=6KsosAMY0RRg{{L#$dveIQXM$g-}4>6PW$WE+9O)zDAb3!av(0O39E zb0hFmR$$Wn9M}^S(!Tuuohr>%8pyh#+W!5uV-7l3d>)zgEh|OxrW|cixN*6vJGYqV zGVGz2&9Cw`YaMUMe32(BLcR}V+x}%RI(Wm2Zz*+! 
zZE<$&DY#N3JBtm`rUh3<;CQ?N2Q~Ja0tt13)qvo`eCYPyNFTb%( zKkny3)1xCsGnrMYZ58^Vatax4&*7o5BO^swCc6D*%OmV1Th3n%tBt%To3lm-Z&nH1 z$fy7AWVWTgX#uGRKN5*JAz;rbELW}-=GqHlr3?ANuuk~y<_4}KLlErFI5^Xfx!FpX-E%ZX1v=EH!M-GIRnNlgoQc3aJq^WPNehCG3uZp>P9JPC3VV^ za^Aze*=CZM9X$|kOG$^2zX8UBNhmGU#i zMGgdx6jm!#`{_xxgWy^2brx_3KbA9kdQdC7WflkpK?JMW55kT2HCT)S>5Y5w`-56_ zU|HxGPgAnv{GXWl|Kuvd;#nAU;_QUZ)dOK~l39`U(+4ZuFhX>+u zt3Q0g^77wO|BmecoboKxZmeM}8+jm6fU;p>do1p-)qi9g{u@>M{h<@$OPx<1=!Eat z#%fhrb>v_^XYmiuA!smo;XsG}$skrMD_aiF!v`&$|0@gZnicxQe|4+F^yI7V3ctAQ zs9a$+uguwe99KpYy53Mje|;~pw|?$&^brDP*!{9aD;&AU5h?s&1Yl}6=O zRmP{J6HGj6!O_Tenim8ZllR&-!7%aJY@f35t9H~7;%!nh(|D@6GUw-*CDRbG(h?VP zCR-r6J*?~Im9WiF&1 z=UD&-^Z8|wG#{KxT5}_SF@r-UZ{c-3sEV3|m5<}W%Q8h5G-H(gi!pMjPNh)6-uNcp z7dL-WWm%{#GxRjd;o6VQO32MZ zcqzz6Qa%#p))5{FT0ukgj^qcP^FNBc`!{?D@{x7t@e8`ms^)c=l6-nL#UXgroao;Z zbL&lURTHZDQg7cxfDzpu-_XE}0X?cq%Jy#lg7mIzM-kviem8)vyh|ItlBj_Z=KdHrtfB@A{F^GVY#lQ9l2eE zdUUtsh4aHIHRkgrS{F*$c))dL2J0`@0`I=7 z%{PM?;5lE5b@&@Xh26#D%dEWpMIf zXOdID@8;GCK84PkZ$?y`L;FiE#8lmUKlgoU%zQ8?9C>lc-)Wppdd@@tccV(yh6pS} zoiXx)M~5=W7F1x!IW|DH|`5k>yX&06@Xq#LhVts2kRB0sV zo=oIVCpzhQlP7weI&}z!FLx~~Lba}3ocblU7E#ePXqzS&VHyq8*0+= zvG{v};24NMKVmbYUVTq&e?x_^b6z>YRk+_fp-=elnKFk7b*+?6EVtFR=@D}DF4;s@ z-ME4j-GnmGaBtdlJx`UTG>n{2Vc}*RJHBK#e`Zn;Q;q?6S^4Rdd!YL?M!g-4W)!Vm z4*jODQL%OlYq=Yx^}q?tVO0ILFL*vOx615RCJAg->RP~=^-{IUctd`70Dr&D`L~94 zrTdX1tJf`*J;>y~pKXe`hq1_?LqC;9gI+#f`LQ5xISL#*_Oh)#$jtmMFjRt|HVaC2 z9q#EOttk8?#^OTCfl_N%=uc6$c*y%&k7LWxQ*VZy$6={?hAr6X;H#Dw0Q%;N5*f2C zeCKpvL9*K){QDz1j9Vsd&ixU0plbdG^UwQ39y^~7ec;pE<;3zkpdE*sr`6Mh6;ZOoQ6RBBctvvP^{ zAh)P*3vlZvH=`@^8yx!KQU*@kCd$?yJoyfO$rr%=G-$143nSLw_d}oJqeb)DFM53B zV{wlHgGY|h%@c@M#rrN4>1L41E2;@wM_3S)tSD+k-H&n%PDn|)rbugERVMlxh2P7R zyszv_{Db$b%1LC<64v2UF~D8D*1?;F5=4kds2fF|?3AF_j|c4G!d%>nGnhS2w02uF z0*7e@v?m2x0Ls+UJ@?yKFN`lgo(I3K0NKIyr@mXyg? 
zDK*Phs^ycXfW470m-UW|(%clXL)hI_$0L}jBuuzb7p#g{_#m@5IJI$TIbLEv=({}ot*ziwF!oUQazM2436Mv&hA%VTIXd}AX!gkF^6tF*u{aFS2?u#Yc zqqX8wIY03>5X;S=@3XY6;!0i4rsX~1V~HA5T3g3Lw5p6M+RBnWPgcw2dAxVOAdlWOnL`>VDzY#@~}j`58?R=gMBI%URNSS!FcsD<*%f2 z;8?jv7rTKvXAbXkWj?Rk-7xW%E4P}myyo;@b0n77?3v)r7_@##mAFeU(uU%@CyF!k zf;Z~k(%sM4R5r#ZmRu*7U5;SK%^x*^rYsKvclF{8VK9Pk0_|L<*f4lQhOoYS(bn_Eb^@e)SrFUbPLHITU@T8 z+UP`HvJYmnCTz^t&!?bDmiw%?|p+Y3E!N+Q*E%0+lK4SvjDPyD}Y!SCj6mk^vHghAA`>8EF@_tlv`!++<8M;XN+0RYWsy z4cUFrT$%r1_1-6}dc&s~PPiXSH*+J?7>XLyW>hZu3EyHxgnscO*AyUDT>I{>EKUuH3h?rlo;vHx0|J7DM-;VR{C4YJlxVRIsu@?>V?Wa(wO;bG!_bxgehI>kj z5ur;OKCDb~MgNtQ_aB2Z|KmYXk!TMu% z-UtLu_*YEcWF82QJsra0PwP-9l-?s635$!x(u$Dn)};-kpBeZyx(h#>T*yn8?|L{#9Vhub{nRdi?(?tArkTFaFkB$i zCv~_^t@u4Ctky%DiBBC3WanbOi%VF(DXMnJ*sZA>)7*T{i`4qC{Yrw`_M2A5Q1B{| z_a+|caetdUbKS^j>_;(qY{cN_`SYs@H#l9V#zPytxL9e%xc=JSWu+=0`g^FYc+>rq z_}a|OoYb>}zSPy36yBe*2O<4j&{34G-L1m4U?t|aBR=;_aIvo?yix7sUR`*=#xG@` zfLG=kcV>dqjEeYUJr21qu2xS^sOh`fk;3(g6yRY}oLc0O`y*e!mL4nfdA=4Fv!tAh zO}h4>`ZB<0fAkI9T3TjoOji1gQ$hJkwn>4*&%{x?^b~TQLl3qtDD~e;*8KBEpt4zh zRr;|!U2KoGhe{my(}R9}|LIj|SY$kE|MZd|#E(LfyL3r#?9j9J(dXJQi)y#W#3ORA z7UEUFKTb@wcqh;q5USwlS)G<&gGt`Y$qa$OQv@ zPt}KE?aT~Hwk0(6B3vUM+LWAJvA~pB@E)XFm`r6$xvnB*dV%xDUAk>^R(yB(tr&&z?CjFg684aZx`tug*Xikx z$2vP#F+rmh%vB6}n=2zH%H!f(y-r}G+v}-x;8QYO;Z~21Mm*O#6 z5C$)77jUm?-**dSRh&)BYVhh0E>RsG(KKofA>XPtFyB_wh8y=)(lbh6iQHna53E{Y zO@2!;zrf9a;Duye!HaPf-k*&%j~FaXSpWl?x1)Yb9ki7VUu7eR-gctc7bHl#cijB& zX89|;P$Wr{$`ljOmGh$_K87p+Ar0eivaVH`))FWE{w_+?gO{uY*Ba;~B^mhy|BhcF zG?>jVEX-jmAH+sAfi=K=Xi9u~S&AC*z&cLZH>)ezj7%SF>y^sV5XITbGz!FsyD4{s z^(ucFu&*rC}xE$BHGL#8*HRx_yqEa! 
z^r^XqZl##TmF2Jk4byk&w|F^~2EV0e303D7)I2?KD;GOaA%h=fyZeMogloC9G|e^; z=)W>~bwS>F-sR_)uJ9fY{c(+Qd=oIH87iP*iaRg#wtJGq!!z&Zd^20Cw<## zzfFbZEY4?LcEPT1sip9)(kRIYw%6EhMVejn6v65_r27Q2pVVeXs%qaf?Cu!-5#$+z zX1xcg@8@fRM6k==IexKIs0zI_8QYY7#{N!;^i7k|eJN4Shfa4hSiyEnohJd!$X;s+ zu>A1<*M(G87v6P$OKy|tQcd~b+O-hd;Hibw zgt+u;2hKZLvKn;6$W%N|OrWFUXw+o8<)8nyYQj(pGz9%3VCuo%2;xCH9Js;QIM zKO`%dQnP;0(>k_U1yhg^QjFBW8yxb!qm?({kD=f3E5kNy-L0QT&58nkD;vJqPD_!> zX%@)~X#NqH7QRC zSIg}FktE#sv+B$ekXD1NsTrT6R8{2InzzU+-)sx02MP$7ugC7Z%MtV22Sq2R)9(Mm z!;*P!7^j2DD6@f7NW%y2YZ>+cjS#Dt5mgXQrQxK983=vi7K@|d`7d6pu9h6*9z5@6 zdq353asutSLVOD!yN27>(Xwgue3JKEJ)!%1F>=o75^9$ND=JbCs`iKfdeW~Yfb;b$ zJu{$cjoRQB_S0=GUl=$c9+jl6{bcJhzpichdik;T(e1u?T#hofVdm=DLFLSLcuOOB zVNe^(O&VeP-NBb@80~I;4TaHc(C|R3(N;Oax+h~Y{1196K|R9_i@8eN3okADQ!_TgkDV<=K!CHgKwY}Et^NE%yFf|X59EOF1EnYrEUS*n6CspT zYJhAaMYlo?L_NWa`H7Mmnaa{_3-wmFD61T!4FT{@wq*DI7S%-MdA_QL<&{okOxKiV zoAWEHs!OI=6XUAQmA$L@V{+H0bC*TqUKq75-H9bIt7d(OYQ&(oLZRP`5Pi@hA1Z9VzCCM%Xfh|>DR*}rz5Os#|KXzZ&|jtG%+V|t`Fmy^Wa?v zyl318yWmDyx@k6~ZtdYNzpRZR%igHOSoUsTLJKJ{kEWxc{>w~@a;BfKNl&gib;>9e zg-K40t1UOy>mRFAI`wO5il#$0uOCr-ASj&DH*uOJw|OVhpOX$WkPoIAC(zXs)b`DX z9!LUHaT1??WhlB?V6mx_a>xC~jM?7L0)YDL#4UFt^w_4O z_yw!qi0wn32AHcs?Vh7!l$5@O?ZYz&B%yq~Gqhn{H3EjFbgif$7|A9DqNZLjlo?){ za5GjNbHuBSw5DEz^-McLGBOLFeT~7YQg;}4aQbfFE?2jQrqA&VxHbk29rpL)tZlO- zj8{!w!7OA|mwW2+2HECcT(bCUwY)wI!CayB54{xd_NbVhRz%LPBg|ljtvRLa-EF#` zQt{->7FD;Ei>+xcg2kw6n|}OP#(n&#?riW%>k_H%%Wmg3-+pFQ80yPvWMil~S=C^e z`p{b7=SGE(&R=W!L_}rg;pYV=1_|&=WfnuB-cXYS)vzl|@_^M|ujXNk4J*ivFgjX@ z%9lTa8S|oH{uc!WZwys;Z-0E!OuZIUSxhcgSbMT@vkV6rM z^DQKCcJn1}Xk07RB|$gR*q)YRi7wv;y6?O2>B!wSxbXyx?-$C(R|oHun52ID8Fcid z66E&^BEek{YT#`%n)oZ4;$T=(hGe{|Gju1qIGIx0tT*fiB{`(`J`i18Z-2ET z`=F@yfG^A=ERBh|9Oju#$?VvYW>SIL*yg9F6LTa+XSiWO!v1x#x6q5Q8m(TM#GUMz zSGaC9_fmbJ+f3HFg*&kXDXsQ>h$AI&=2QUWI_Qz-t@*g(kPTHFhqhamwZsi@&}fdb z47)5ObM~H|;NA_K&4kvS?C{ZSjD`&yX+A?Nsz)#BAc=+&7Q(oph4PZq2uhY7eJI@U zXAjdTz^f8_41eKyTLiQyRjJ7G$k^u4DzlNGQ5Q7$OoH6H2bJ0C;{V8R8)3M+cmIRazv495% 
zQhTePJp!LYBXA+OThgrRKc-OM7TfpmF59xEM%@g2C1WucL0QSf(%9Lvol%x2St|6# zom7vOh^M~FELP5Ar94%CpHj8Rq*+dt3x!V}!x{i{eYlHg8zB*_;N4*@UrmFl}p1gUydO4Vg4%R^f%OoMNY z%#%=J$0M}-&(C7b%9)ZfE#~D6$E_e#sj3lGFY^#}g5vodWr6kotFp_oCzri=n(J*; zf*Q1tU`f}ys>JWwHZ_)j>|>QdW4ey0$9Fg(SL3^9GkczBdN*4t+mg*e6v)@`RexVJ z_Rf4lXQ^DcFWE5lA#Hp!+^O=4`AFL%$#k0U_2TK>>W!w*m*q2K^Kxvy9%){4=a`D- zgf)J0D*JIPe_r!!i1N$8W&K9f1M2s(g~x4Sje9`-tna_J!}u8k-Jsvp4}Wh5WO3OI zhNiRBapD{9vd1cndUB!ySi-nMpBIFj(=zy7yEucr<$=I4UMdDKQFTeu^?5;x09n8y z*n}t-fi(Lx^W4g_$~}yC3t)uA*IA+~w`MGh`x6q>>EkII-X)lU9<>Hj?;~V;5i=;) ztiVe~WDs5sDE(%$QGnpo)6GxL6pTPHY#l_*7?f41Ek;?ErCnd#eUIUKCg`NU<}!4W z5$(>n-eU8DQ4>Yrhi<2{W-_%;$me)%oW%+O5vuU*(f|rf$cJVtv_$-zxJ)3<<$>Qt>pjL z!q5NAk*f?#EDBhcMH!|DB+aib`SlFvO|5dY& zh^%PdkC5)QWyWpeuTNRm{O9^gJeVR%Y9b>|R&r?;n6$TVsUpeQiO&mL<}ugm8azU6 zscA*wnX`stYo>BYaA48L&m|b`(%V+lGApMCgze9tm4EQKubm&+l3S12jgnWst0oC) zK*K4x>o?YKA^NvR7FPyih@f9xFPtAYf3Zoco(D>F*N1#xd0hE~w6B-lEJ`$~(itQ^ z_9Nsk3CN`_ZOOD58Z0~M1g*A;yj>5t<@2gpA#y_;z;s{h;(@lVJkn1H@NjU9W2htw z;(}V*u_O{xRo>SoC`M7-*sX!j>_S#kKHV6#%91k=pO>51ypI{?u5Vp4wbCIVTR$77 z-T0UVZ`P53o`or0XW-D+ZdB3gUa!OJn%C}Oy$Fg(mPj%&UBxN*xk=*UbZ#bZc_=zZ z=3x^&5+Y}|>5}T=xT-hkgxz#{POAS(Q8OBQF#(I*l=VVil@qn27m;BQ1$5c$NQ?w0 z&%6k)ot5#OBokrLV*QqOy{mXL-}T7~gH(tUS+)Yl!Nw^% zIBQ)UdgzDU53J`hoZ!Fkjur)jalb@GGMi1Q?l{X~1pA5#aL{j^I_sfb%@+2u1l)m=8fOgq-B&ynIO%(`~XOKqUFy9aic zAoYQz5bv9%pyvSUe*kd`VXHZ+Z`B>{D&E0+JtA*%&nT0fLxyufL<@bEP_HtaT;*W+bJt zG}Gnf_r~LhxR>P2RHW&KUvj0u2h^9b1tfOl1kqw9rQ&os>JAn$xAzn9-ha@ahhWxH zu4=H_vV2ofppunhwbZEGFY9N5AVec)Y;@XzBQCx#df)gY- zBoHJx!Ciy96WrZG2myk-lVHK!-9m78cc;;Anx^4)_WSO$&p!9tIrrcF^NrDCK*Q|W zbFEdg>Zz)yYL-)3=A|MmII=2x3eQ6eb;*i%>-<02ema5-7Y3?(wp*xlS>0%N-9m z;oabD_v+d^6;2_rPhO!9(gRAhaa+-|$pdtX%x+Dn)m3#=nCE@)0hUbO*EXA>N7C!ywNT8ozr2{(m0CuG1Ys>2xCr#he7I_|AJV2N*&&BkH$I^s!Wv#o@MIFGCd2K24yhZ&- z%p*hmVPMSAqQ}rpgJfoNE#2mFP#R&iJf73pGlmG@ zKH+)I(KGOE&nO5+_^fLijO%osevTS<^5STv%eAHRF(e(<%DogYXM_SYo?5BcuNwy= zdkTL~ZV7wTZ3zo3*}ssVB|f|zXJbqP@aIDh)}8I8`_0vPkFG!aDm}*VM>)QnPHDaZ 
zi$A8B`%+H$*Izcz7P_9z`u`{_!7v?K@v}=2;IZpul49Psnbu>FzVWM=c8Sv!r=}u{ zZC<5Hf|2@boUYpaTJ7$}b`ET(0=HRqjT{aVVu8I%)*>gJ-@&#S0gE@i2=S>{aZPhG-Z?1cl9?k66-CCwZ?@VOV1n=~gKmuPu#&;|ZdJ;vh z4aJ46(uUYOe%ceZOW-W!;h8v&*02O-dQ2Jjs7@vUjz^KCj=kC!Fgodb1m@B|n3f9; z2tH=44dNKi87Zxo3DZ2%oil#?E?E{0aU4s}YdmXim1&b7J93}a&33RD`J5`BtUhn( z_+VH!BRA{7IDea8E=&Gfw>IGCIE>C> zaGa2HG3{NMK`9UCbhduEhgtbvOR1mDrt30CDFs2Ki0g9RvNTM0DgRvVC7l&y6WxyK z!#v}hascYWwdrO*5Fa?Wk^w$E0bzfHTb{;PJfGA=Z;IoXGsMz3 zwOtel{cswl<6AJW04qt+&*r{k(L@diq;J~xDJ`jQ2EP}`hk|sWFI!EHWB4)8{9FNk z!YVgkV?L~%RL)5+$A;Gr&yCk#HIo^rWVgL-IBM?jvk_h-grCRJD{CC$U6Wk5dRMN* zTN9->i{nabbbW77D&!7WD1G(qJ?*aj!1_X~0&hE2A&CB@&CLGGYFKe1%#DoNnYT?J z^0a1#mi(~NReDVCUaBABVr!ZV++-e5ih6f7`vdudfS!XN*)9F=z+aFk>MsUykX+&C zf3PKgKw43jUxE>;YfSbV@CO%+{fj{ipQM2ITL!V_uTdEFT|EB?m4E$%0(Pub2_kbE z&2JgR<-bM|y$`nkXH5EwK`aV`@u(V{lD}mT>x#ifvCJSW{~vP4e=XH52nOS^zne$= zwg$TH18kJ$_dgEh`sZ7A0QZYQtTIT5_gk=!3>#Ihf@z2NyC*pT4}Gw1;!ODTcpJrt~Dgj>Mq&Sd>FA_2@Pk`uO1=MVf3o*&bTwxx1ASc-wz$jtILpql|L|SBn(X!?NGHq(rTOhwCFaj(u z(h_DEFJWgj%kv_hBIV#Dm+~xBl;>rlvaIQ)kP!P`$sWnrxXf#Rw%B_+dc)#;i>7MJ zR^puI+<|_5V{pT9#8&a}7FH6SQF+v)KRv&v{K-9`%+wtw(G)DGD2<=PZ{VIZhrRef z-{OZm$Z~M;>ar%AU|jwcVQJHllf#e*>eDNt&i&p~Hr%PZQ&L~~p0w(t({~PPeqH9iGI$kBZ~Yi`f!?k;>Gh)cS&aMS^o66~FG~ z+vhM20KvaVzBF-(PINSf8ZUXnmPf-b|Jw3@|1E=lgK`qvn>6+4G{vdul3+Qb{?*Pe z&;PZ!@vDQic`YbYx8=?-|3B(LO^hZPjcV=K@C#LnR~Cpdvasy~i@CUYg5>D%lZuNk zfXvVSF)Y!L)-{F8gsJ3BkD$I>_OPIjg!^4QSDW>S1zk>!73m5)ioeD@?gfU|X4OZ} z)A)z=dqs42Pe;C@`z*0E|0}k^%xdqZd8Nq^)B=clPW(^*us&Pkj~64nolI$%PxBM? 
zd@FX&&sO8TF41KlRxoWHjg#o*7Qc+NyQ}T32O41z+_V$WfqO9H?{5eCri0F;)Rw z(9BJS9Hs?(rl)&qt8?DeJd4P*O*g!qnbjw!C1{Y?_tQ!fMIMOfpW{$ zmXtbh>0KIlukn7{AXR`Whvl zcLfP6=ztA8ZOe=JGLjdy3}=DY<@Lnw>lrrluH&*>eF;8(8UZ-(gjnPrZ@4gTg z79QzjT9lwk00AjyF@gHuaLDZs1i7xV08PNW*PX!DXT1U*2NyR@bRuBWq;g%zwi?gz zow2K9CtGNBb$wv2yqo_HZT$Vh0Nupx}MnJj=f^{$z$YO!CxP;K@e{6PW;%Qg+V`zN>iEzagZ4U}uH zR>syL2Iz<0t7{7NaT=+av5WK9eif(YLW6fE20qb|BG*ph`&TnK0UMy+qXoyetFqxD zCHq(7fgRb3Wlai9ISD7B^15?c^df@#&S@SG6F0rrUNIt;&?=NME|G`okz;OqXG=>( zc8<7-fOXm_hWT610DGO3g2NP{pX!gLg4+t z1K2I_&!sBohQ94$7Wt4&2|5%C1BlJ_~v2kEK~w`ns7EAAoVTmtQ2Hr zF}&-rK6e}nGJnwUpMT=f2J%0YuV7m};10Q2w{p)(^FOVxIW4{x^V(OE6}+hf_Gja9 z5Led8803t0+(nE8I<-`|Rz01*`q7Iczh3EAUE!{!`*WAq70}PN3&0oO0nL|ww533u=_kh zqfU8Acaq~Dig`oDNn@urwg{A)$ePI%=bA^eJE?8MIHAi_4eNB-Q`_PKS7eGh?Kb)V ztY^c$(Sr*iI1#J&pe?dar_?}z0oH@T(t)Mp`nq?vz?y#RoM`(ls$*s8nqd1OiGYtr z4Cu!Zo2#x-tf5uA4uoW}hC&lQ97*2j-Gd#j(~*Wj4G96cex zLOSP31dxhEymw-(zHgZ=SbX z4`9W6@Aj2FL(ZDhV_zogOPl>PZlD|Nu$tP>!l^SVz6Ji@dUo=CaZA=zqE_7${0?lY zQHgotPxf+F-Sm}xEyRpkVJB85M-@)=3qS3`^Y!$`L@F$6uW@~Ue=)Gy?9eM~MXam5 zY;smX1?wYZuqs-U0X-g?9^1zeoHz%sIH*}23FRCk+Qqe#YTKOqv4g*@kq0^=$}K-K zcsPVp&Tb5H`K%V~_eZ_3H99&A(=qNvFpZkYKv`iErFN7} z_FVEO9FYfCVY4nW{?*NB#Njys>~(p&c8X7u+s>2XFWi4@w|n z-+hjA4N`35XiE*enP9EU|85um_O|OATrd+5`FHRUE?DswM~7*|{8tmvf1azV8og!t zr~P>Ls>?}tp3C)5D8Hi3&-QfA0uG-x4`^Ygkf&kx1bDhPuIdT)TAaWg4XD4)3dH`# zrSq}lbK@G&NCteU?Ee;c*mmQcrCjsk z4wi>L9lp5%4Mjf$V14xFbORYy*}Ays?%i07(f#g*=29T@^SmU&2;G^Kro<^nvJctt^1qCYRU z5Zkud_T^m$!+ad#|Hh{N!691K%B4kACw>_*V0wAbi_>g#rIcNltKm^hT+2_CQy@Xp zah4v{|7~h|)n1h;T#9C@fVR{nG&A4IN~?_J+w^_EJ5)%h<>YNm*TUtgw&&jc?@}?i z)+WKxPZSodzZ={IFkm8#=Bvf)p-FJ-DraL?`y352>~wYb<4@+!bae%FXfXq%Bv(7Q zn_zLI{`F~PAtr`wFTca9Mx_o~$DV<>CbKl?I%v~gY&X*STxN~^&lKwb}$ia4Z z9mZ<175eQjD%&rTJB*ljq%2lFZ)1IP5|O_edY?uW*uhB6=3ZVBW0^vY*NeA(=0VH& z3gOzAzmsmrPgh#X~gO zrS8ioneD9jn7Ls1!x<~}~UO6`x`Qxz%`#UQofL`b9J zzUR%Ids=Gd3=_14+m4=JZ76}nUwSq_z%s9dcv?B&j@Q90DJruDq`XKqY$mtPl{A$* zZQ9}^IbQ_$b|mN4?5WN60s`pwOA+1?`K_ez;2j)vCC+Z1oRmS-Sr)%DLuxf!guJZE 
zC09V|q-eDwzgI^INc@)+>RB4yiA28*ZxHSN#x>Wy0mL~RowLXG&7-QgQY z!>^3sN!2tFf89&Zu;a(o5_-1ro)HCuPL4yhhV_x0n(e%zignnkL^xStNMFwA-Tg@P z;7Jk-&E~`>5SgBHfLH!5Fe3sP`%ZkdQEstpz2e5MZ4G1F%ps?gH)(H=&i6Fk>*Ax= zWE2IUV(n}$uuZSkctBkIG`)h>_ogJ#t+SMuNJ`550Scm@>C9WpDbrUv{?5m>>H@Jk z8>1NX%e5cL`WlraADGl$m^^1_OERyp_fm*0t#&{EC5(o)HB0={ptEA%Scqya%?`p@ zG~VRm^VKP;bwaW~wzM2U2P1anLfZtwDvPKr z2h4yy$V{WNomr*4>_wWA$CcJb?ey^tTA|_SgPCt=LnrSbuS*+OZ+Z)l904P+pFY0s z@79iHXKw&KJ0EwX=e&jIufE-^lJV!@*ap^*J@_(J?z^5ga=Qu7yK6sAMgx=-=v(Jn zJq-4)wH(>`}B5vmgp5fq(d zr@W~}I?WyKnDw^(iEAl*o=VHh8ba?3ozqrMmycU5tITFEXCMq@_GT;GL2Fjud6PIj zICdIF^h8*REC29M+$lWDW|Qi&?TzdYZYNxw&i2@Pr%Uq=`HBC;MOn~9V1M~;hAC8yOP${| zLQBcMbji#N50-8SFfSR@D>o^A7gzi-8VyUDCSFOA_l=|GAN1`169`d}?+TRt6m9(T zU?qyDfa(vTD=&P;kMX6(k$e4CqNsn1;?z}j)uPJdPwtC2DtdEjuOB1mHNCcj9d`J? z&{LEf-dBD~RBfl&pPpoDJ}8Ui#hlvckg(Nf4K|97|0I(bhailT<9a`+$YKN8+A!Ti zjyJ9ToCyx0oDFx5Z|3YQ-fvG^HhW=tX?Y1l4@Q#wzmaO#?f$=!>OZY+@c-9T2<~AJ zrzV@JCnwXUlXgVG))Z%fnrzKtCLsoLxK!Sj_tt|w6Mnc1D{<1|;u|Jn==#UE&p~NL zkHdR4Er~^)kxlDICB$lj<&Mtn?|46R{Q3R=C$k*PLkzR}GbsV5kuC=l@M&qXZ0|1o zUeGeE2WUC0wXrD+gHutmOfO;eJMu~)>M=sV&wdAo1ebt^6I)Jhq0KR;92NxT!Ie)W;vUfWw?4zulTPLZ7Arja9@0frYlQ@f?5BFRFno|U&F z2nyn!fT6=_tDWZk?6~S`=ciTtZs$Sz?LLYQIBS!F`~A~+>OAr9{Y=#=4)z-;!X^VvWyITb;E($4u=fQa}CylMxP1o>Yu{3^?39ghL zQ$xqNG*|uZ_mxFTMBm9bZC+vdDyzRHjr+4g@LIJ%gE@&V58z&dr$Czr`?Ib9uDMv@ z!$U{GY}Ol3!?A3kO0j%X9b)3W8c&t>%OkSO-Syje-@ut>%%>JEF7Nwmh$SMWdZ)^R zOFt{2H@Ql8o~$=2ORK-qEqRiJ^S8Sa%tMqq8crymZSFnllri_R|5rcDKSub$M^fz6 zkMrhjYwgWp%ex8c{in?*sf0X z6DQz0q2OlM#e4ABEr=(+L=7XX*X0o!=KfJb)5q8MdR3LwX$mgMe%tiu_&8yJ(qDUB z$X!AZI#GB_*2YC=^!S5!)wlRReUVd+bHKQdPdg(F%6678{n_8ytx42ypjkiTd?&^b zq#qQap=YcI3n!rw-!{WeRm~9QSWJp-_XU+PrXl+|thKNaTf^M5sa@NreizGS(N%JX zg#I`A)~`zd*-P}wLf|Gpr@Jz@Ll14-qhbQ(pCl9(7QwIN4;Sk)MFLgVgl;+ll0512ho4=`JjE&baR4Atv{4H$~48-*U8p~2HmIHXsO0EV6fs7s- z1<&NXFNd&{KdI9oGa`a4LjTw9PBe+iNDt!BNg4{O#Ur+lo>pa20(VY6|1>zIwV$tp zZQDq!?q3p(|W$M%boN*Jd56ijMFB!=f|?-^0UY-}s)jwSgW`kFn)O3-`^1)x9MWb`vH6C%zDX0U~0J zx-`rc@Sbkxcz>Ene@ 
z$SFzO1NA0sUg>mLxg*V6_&>gT!Bxr8i%&vI>S44DDgDpA@E=R)=xTrkcM3UBida?5 z^Ab<|m5&t<7mvEj`tj6<=WW1cO398&Mc1_=kxWJ=&9j}`>;Eyq-`$wGG&GBYcEIB4H2qxGTUxHq;L8Z z-6q?Rj=i86ohdqd+G2gmLjK{>P^Rt7AD{o$1pQQy`P*ZIMtV--)+(rg?><=Oxn6^V z@V^l&jO$^fhFO|l$1VL`16yMDwb0K5cJ4?kMebNaS1w7gt`$B@;FI2at3)6}K?{;h zhoDV}2va;|vZ+#tR{yt;9zv{!k9cv#WNIZuxR-(#-v$eAl89Kyr*FuU&*PM3obMX+ zQi{Z?CeEY_5=o~-NYPk``<9(G-umWDY|I#9MXyfU|KzZ{YNLu}W<#nP;@m*|w?%2f zxap=buwd6Hfp(j=70bl&pIm_1$nC;>%gRnBvQfFeR1LqHD59t&);(e6y=aK4ItcA8 z9{LT1;hiTF7hA#RbbyNWMw1uV`=W-4*QNNi^CC2Eq-MR6U^)_HYkU+Cw?oO4eP>4D z&0TREp!61!Jr$>`EWbJQU1lL+B6~ppWtkTQi)lfoBoif{^$_S2mx`UGi>6V|6-X@o zr>(+05QJZQHS6uJ6ppvRAKVYCw~I}`7?7JQ%blMqj^p~*mK7DiSdb}n(x5AxKA?Zi zmmer0tqNA(Dffw$msTJe3+Rn9$5;oslD_8unpb|7&{I!!{JF3d>u>>$QTEBfjK5Vu z;%uB=!3QO95p9v1o2E4Yy)o+Rn)ucYd90eH6vn*KT<%Qfo(*mOweqW?PI(9v@~Pn? z)n8?TZV_;_h5QADObiI+G*yCpL_0iqu^4Iz?tl4?ul1;>cBQB#MeZP^m_yAga_aN> zP8Y5G7&fz9`FGgNuwDk6b1eA656I}Jd^BLtL%GP*`wOTB+3K2giqcKW1l)X;Lsw60 z#U*L(RGiLKeJ}D*+Zr^W3pPlTY&7!UOVV>;5uXJt*x?GcNN9BSQ3B0>fw#I(L5jP) z{_>Yro$TM#uXJiWWp&_&qD*zOUR&?bdmh*MI6oD=^udYvlC9T1bN6_9w|h5$TyPJP zNFui4jSYoMdkU16VC&F&4{bij{5SDRh7n(?x&GY%{MznSiwLAU)t>T6b{OsN0I4q1 zh8)U3%A$u}cH`YOmIC;)i8Ti3A*c{l5+g0ju^r{-%I9)x7ZXx3C`%dOON|YRE01JO z^xPCk-eD91Ux|{EH?4C6R zaHm+-NIdVxIc6*|z-qUeMecX&OP=@qi6gamP+XBkb0O?MKcg5GcPLP#Ce98}0l@pu zCn_Bm1=kdq!+~z{T?LX1^^MUPdeX9DN@_cV8fW=a{87k^KB$589r^*q`S3 zQCsBa!}6U9&~YAhN;n>R`cRI8R_{&oa(0hM;o@(a;*UO8Yfa~OX9*MCRt2wJG$UxO z^t--09k$}PJSJ?N`l&b?vLboCCz?hMjNhWagS8ne(=!ivYwn@_77A*`)5%6Dcqzb|bW`O#sVl;6cimy=i zx#RtHZWVhhzdyRO{j%P6f$+zOhrObP5;%Jtp z@uf`I>W*P$vzkUAcI;s8ua@6hy-kx4z6SNzGI+TYCY4uufe=MRD01F4-J~b({h;@| zR>v2*Hv-e<*!>XmXV4Tw)D9iy2;bRfb_hW(=n2bys(i)K4h##(Jv|?AOYizXsJuGG zDag7>%h4i;$-^J{ua+ovUxSWggl@f=cDdA=E)jY0h1@nsK+%&up^mcFk6f@YOnO_% z`ePLq@)=Qhmxi7eN9ccCskLNqJW3iy(Sx>}j(^PbrMdck0YL6yg?EL}7bsi|o2bP& zZnbX0I{Got|51H&%hMg>83p>TaAiF4Jm{=*j=|NvrOnjCMl1z%@_mgwEu|-_6VLIa zty8#NN3AAKV|d-5U|E&CHCOCq)}3+@vD@^hc8#Zo&cmH&y~dOvYt?R4=G>+A`TM)M 
zo>5}9$aY)X`*=b(3YN9{clH+l_AJ$d)qJ!sQ#t{iy{--#>-%Z(AE5R)ls28EUi*k2E*J_vXrHw^Itc8z5UKdFmd@f8xF!%IDi zjVMh0cgO$}S9?a|;OG9l3G;zXZB8Dwy$EvkYmXWar8oXtq5ZXs8DSca*YHY3P{rQ2Jf_ z#ECb0zsOF1Vk5hFQ8lXu`Gf7;xQQ1Q3x$gdQ7MUQLDd;TNJUF^@l3@3qwDOcjr}{; znq=w*{X3i6I!raFf|ka*nRrcyFhlbL2ZBDH6cqo;MC6Vl(ZqdkaLqnhXl@;qlXCt` z_2?ROQCiNXVib!7pGkzUZ0!HJimn%E3*xUw&me~iV^i(u7B-f|4;le?F0+2@MZ69- z7qq{0@Ps7-u^-Oyqf4`84)M~H=&ipl;qSk}V>WfcnzpDPUO!O>BYzn_f-TA&$&R_x z?S0E38R6_xd6A`hb)TEb(9f^ER~$qO1`jc-IV@gSaOU7a9Ptst%-UV()=GWPfvH=b z4XG)5i-}Km(*?s_!_G(>s7@m2cB;+@gq~+5yeG?VYp|HFndVH+NKpt!t!xlW;`S%N zk!W?NI*a?*C}dcjF@Ic3)`FwAEVV=$Y|co7Y9@5~*yvd(;!-g0Em9BqQJ-OV22sMc z;^V4$9nUI%IU-X-u#jy3?Bxd}blkqjpkP0YSV!zw4lsU~=oVb`RliK)LY!F)eUwzX zBv&uVnA3Xts=hD1uTkld=C1cx`Er@zf`Ot(ZE5;A3wM}-`|(`4QM(|>f7~^BP6M$g zueey_g#ga`b`6thl?3Peb{^RI%O;!qkx`5jF$1rKCR?v z`V%+t$k}eE~ipr_Qt)6S5altDP#>xol8DF(R21C;m1_LW?Le#j6CQQ zK#zmR6E;JX`|_z#U8WPNB-@usYOBY3KsPl0QO(f+wCIfEB!wU%%G3O{UnEI|9vQKumv@>$$Jv&BSIQ#1R9lPB-46-aEs4Be=<1zIkicNWe2mt z_y2Z)I+l*YC65Ebw1E({luLL@(C%2Y)^!$eJoRkL4 zG|6*n5cV7&9>iSctl-RbV25)`BI2_t!WO)T6udzfceZ0!A9s}gHDgTH^H@dp0 z@6Hlt5hP7>&3*^@;$w#Q8O=ERMT29_%*Cqq@@f^r-u)MdU^M2wTNV+D$>W)cK^Wd} zzOf#!4uZ&iE5N8K;E7WX$GNZwl3s~wEfT&yq`Y(h#CLYwNrDLKyZt03AGlk!JLY-Y zk=zgs8bX>2$t0MI{ozlLfDq%#-h@TI2feviYqH!t=&}6(H}77o?#s5m7A16;OT=Kc zI&dMl&ob}VJnY#!{!xe(#2!wuFkfNo)fTwf8RD=|f7YFTd(t9<_x%L|eqqVs@`vz5wkZjuAGpJn2E*N!i*h;5qARU4dj6%c%({e+CC zCH9K-*6d~#7i|MBB*91=Ni!V|x$wG@;*~u8hFez6{j)uTb8U@?Ugx-nCDoDPCw=qS zIm55)bPqG-&sN`~#3hkUXe0Cx_C?+nzTg_U@I!rI+o4k`zXmm`?JHBRI26xg>z;A( zF$%Zg^Y6$z?u{*{0Cn)O%U7PBetn6|pylEjNl!iFs?@BWGcd6E)7o%<4CDB~ne6*> z1Qx|0(A>@@MKbsf9p~9#cg19fBrQttb9bK{dgI=Ryo9gRId8kr&V~uMm_!LTWv+a= zDC|a$p#fQ)9wLG7dC~x2YezJl;qo`?Ohv!c~0ASqlVXCTNu_iNsfC$@~~5M3}cU(HVDP<3$t> z;X-<>H$(8HkOy0y2ay0u%b!1k^+FP8?$4MQ={Hk{XgKHk-4_a;(ORZiNIO$ZhQ3+g z=C9tW3QhxL0s(@c{&JQ4Fs=}iuQq%x>H{9Wgl99pDLYjq?IfzcS_*hy(N!yPS8PgG z!hdw(VSh#;G~5=5C6a`oT9u5OjO@=03OVe*czn2uhkjVV4r8IHb9ZbvCHRuv9;1+^ 
zgV$7#Dx)PB1~+e_=v2npg#dax7P|338&k5EyDD!y))&WiG^Sg&pZe;pwEvzSgZ*XE z{%WxDzVIn!Oew5a>3tLV2cdg?QW_O{BvTtCq6<`4F6f)o z`Kd#UPp%wLYrZg44~6C8Vul6s{HN)o%b`+}<@fmUlHu=}qU7)?hUoBkB!tOLmGy9w zBvmo!=IQxHB7_yl%|)SJB)#`Co7G6b26ccDa0oL~y)Me_feSH7EK?m(i?B=+JfBOO z6AFjqhJs(91Q-x%o<=w^5>kmj795eZ9OjCp8)xPv*G6?QbY=S~8ovTwHbD`Zi;=xG zZzA6443|CgBx!@4{YayN2Y-`Vf|h`d53{Z?iwJiSuH+}x`=ZKv69<_QyD;9hHU)BG z`79JoTGEbyTwgmM)sw0P(Tv|0ZVZCZSIWvs(B!pcDkqMVjS0_hd657;U?#6;><>N> zp(3PiaJcRl8YWV8+E^Eg0A3q%nNyu}Pw%oke~X0kva<5|5wXQRzV>ZvuU6h3T3>vX z$*nLri_4S;881RfC;A!E2wmThAK6e&^$ugG_XuY4i<|hu&&ifQic;)1A2+*31N=ti z;pwSSypkEG(O*j#Bcs1W`}hGaL9q z4bC%@#ld*$?&e{L`@@m{cufQ+vgY}v`_U}5Xe*Lxd>X=&m!`x)R=z!h{hMQ~E41G0 z+h&A^1V8aD3auYV_fRF7_<^t5?rG)F6x&_LhO;>l#l(?HBH%#GRl%$}huyNNfG0KE z8^SN5?52A{0?HrJN%<|Wn#QlzWnc3FuAKJvmVH^TqPD+oyn9Eb`Ba?= z9_1ZzDqCpI^?Xd|F6kQvMim;F5ZZ%2&+#r=6ytB^4y5ol34m;AO^UgIvZk*3b&a)< zL=2%2L##e7Ic^E#0(4{lfg62zh-R{prMIl5xN{R$i1yIbC?(;@i7@#%gm2$tg4@_T z8OTx&ZAuw6YL2X57g`7P&E=6IjaUAUM4+n^nM{fO0wd=qNYC(o9CX2r=_Pj?d$aJD zKXX1EW8wR8+1RqGI(}*SDzVm7(Ubi)vX?PURrmfjsY*(K_}p#Bhl!OV<||CYD+FP> z-tVxlck2p&`JA7-6CJKbXb)#4{)&v+h4{*ci-t*O2}5;L|8|^lKAz*YZBOd8m?~z5 zW)vx%XseV-iS2~TXFsKH5}378`ehPEZ98!k9jazvHQi#qQ;N*Wm4jvj766A7YQGmT zJ=a9*lWzjZzKac{nFRG7gXm~yKT(Wvo=Ev|nF<64s1q9Ke5m0QCP`-g&@=?$q#z;0 zy%$y}S}ybDs?9#kwLq!E)FHKvh64m9(n2XezkWyHB41=Ej^?VN23<0t&j-k3g>FNvJ~>lDH2AOm zm4^ccn*v=P)1mjtM4Jm%L1D|CKYTcZ#%rus7@Pt(5M4h2F%Ul+lqfe34b9`g#Cr>ge zrZ>As2&L96H*Pm*I?7^{uT6v?AJb4)UNV>FzSt8jH_GQ_`q9Q}DmM0oLF$K38jBQP zk7{=_1vTwN7Vb#uhXsjO$x4mjbP`v?+7wI*k}r(&--QrHBDT+e%zlaZ;W@-f%&OD> z1VKu=6|G;`>vhn51(W3k$onK+fKmG{V%NZ{%uerAF+#W1^tx?Z5O@$^UUMEGCzf=@ z;r`x8dnv{gK|&(}FIyHl3=w&8%iLqE$ijQ|XN9ED>pk*()b|;1^EOWkjhQSy$5I*x zKQRR=%2G%*FpysnmPPbB4fU*n1#=0WfRx86qa=zyBkbeLWrKCc@|tYPVk!cn@HSQ~ z998-BT~p3JrH$k6x915^qMvWFWKF4SX3CIi=&W2$$|13P{d2GsWH z&t@t8K3+Drq1L7&%8#mhd45Ij@qq_9qflRD1G=S@t8CfpI1lhEGdcj3*H#l?N#iKo z*r(P}esOotU^{qLxr^4S3jQk5Inkb}=S_E=GjPLX)V;Dcx<4*bZ3S+sIYzQ2MMAvp 
z^kg%V(o*j0laV&e`$dik0L^(B(u!x0NY(9_m`s6J*GOkyu)Rr|T(WC#Tj8_aEkvGN zLhU^3Zu7{7;w9dwRM!%?*If^s?i;o3y5@_LTtddod0H^i5 zN{cgP>)ybGrm4eiTk5#Cs&ArO|2qZhCkWk8LSV$M{z0ZO({qkj5jF1!S1@s&S0s}Y zWad#mdm9|_cD4*}iTiqCyDh~aBg)GDehiSE_WVg(E@RA_x6%T`=k(QuS2e=+S=mGz zVzPvV2cNB!na!id29XL@xAniZn-SY2%2x4MAF9vr!8h788L2y2Ey`L7Oa~uBX+aJb zkPa&;Ff`+l{q5HzTNo}v@l*T>5#++Yr>sOba#ioU=~GcL2kK#4j4&#A10)wXJFR-Y z0=mgqQ-&SWdgyE4f_)F`xrBN99}XULT}5HoNh-)=bpHj2RC{{Yk+zqlvuK>W&)?5# zK?>e=*@=pyr_dXY08u8}cwXOV;JBIRocr5cU_B4LTYIStY$G!F%P_uEV z&Es$J0s7}xAnVQX0VwF{muBI!OS}#JE#+5)u?s9wBq*%&Z}DgX4?PxaLzTGMe=t?J zIBan{j~F=;8g<|mziACLfGh!OQBbG2Bg6uT!!;7fLymg3$PE63M+(Lo(kEuXE?uF zf?Y|V9eCx>f<+h2(itPwDQ(+(ME$MwJ2yhzJLeHaB>H9paU3C(Px(q`K<-Sf8WS5~ zO;5&CTg>y}sugzme6&B z8%UuTGKm-xZOrb$s49#Hr0yplTlYPn z|NQJGtJR6@8zPbiKfU$5zDn2Er+_9z9w_&S))FfQ^0PV<*~A7p%H_jKiw!g?dFsds z)Ca68w@T2bovt@+TA03CHZ$1pcj?+=abjYrWG8t)pD-m;7KkUi{*=Z;ZMh-@y;+h> zZhdR!HiieF@3*Roj@({L_J%ykrm7P>3)639tNSGXdb%2C-=VpRsw3}#FnE`tixuu( z0{+EGm}by+(v`4pL;21ETRYnWmP*&9p@ym^5vNLCw|9Hy9uO%y#&M6fxn$+4Gg* z);PA%FILIq9IM;yiBx;fAz_?miT2Oparg4-srJ3_7e7218g@1MVkef|oYSE^Tw${m z-t_@V&ZGj9veq(35IV(VyVf>&M=4w*pHlsjnT*K}UspH%6m{KqdJVE0#<%zNbg6z? 
ziCo}Fnw#5HJeW6s5?!`}tydVF|T-|p+NyzJ;dkAfOz+}rchCHWf(6&1); zQAug(&i&YKJlnuy=Ui-o%TBKPYy0}51@E$hjw1&-4=K$2*~SD3IGOORbca5gZ6`xF zYB+qJ$qER`dGu0AubvHhjx}#HU+oQ_U8B*?;Q9}v98;`z$b75)GP(K%_;sWIv;9N1 zfG@$=rlCCfO0e|`Te4(wG6==g-w_KB*0Yv_K?lFE=gsHhNN^f-r$t(pctW0WCI6H1 zSK60YN&oGre07xC*Q``MC*?HZegmH_a>-v7}gu&Ip@eid+(lF}n7 zLhd!9otwZH?hUSXgB&;Qh(2E_uJuWDp{dP${+9UmfiZdiu&zKQymC1VQ~j1> z4|jiPMYv^x+DjVqBavVlfz9P%x-t8YiO!6ym{=A2hK^3Rx)zh1oF6H`T1Ly=`QZgMb~; zq%>9WNPf^TswIZC>E}EuUi*IR*Lfd}>WVfxkuh#h3O)tms8bnUD2Sp=^>=vF+=CDx zvSA#j3|#n(9!cX?XZFM`fYw@VGk0_qg>O)nMPb7)sQNtt-j$TV$-LD(r!iK66Imv> zsueSsgU%XS?~k!NHTCo5O?UYzVc!UcgIFry0#cnK$yN^b#|h>PUcpp;Sk=Lq$z0=T z^9^v)+2l$f^Oie8^eWZ6&bQ|BIx=~ z^`Nwm7QO3>;`YVsdi=z520&BA2hYAj($9C(O>~Wk@${Hg?C@rL7=@O8wR?#1bz33d z=l#vLjg>1-3i5(x?E^9MC4VHdDjnABeQ@3={3Di%+l`BiAKE=4HpLn!llVI@z8E zOaM+j8YDhomGxQ8yh%xeqqx$drg<)&7)&H=J>N9OLY7IT?v&}{EmP6ownG)dT~#Q~ zq=y6S$~0s83P(~{mvt`y2!qM6{&k<<1%4AuetVFbc!sA@zO^~b;LLzHCj$zvA4W*8 zYj)}^R*KvbeSoJ6C-4fww|vn|#@J6U00UIaIpbadU!e+z<<~j0bfQ!NOe_xkRW)>D zWJrV7P1>OwHP7~t8rspyP5fUJp+ETD8;A- zM}Wh)=GL*==P7H4HM=lsc2#XTnx&8rQ!pASG%OFkvY^TB^iJ`($=_;B&!SdG0J8Yz zX~iCy`XbZu$q5iNVv0C5F=6cJ4_fv(?+So`eF5Vje4!Vxpw#yN)7VwUMfI&~V(3OB z1O!ne99j_R1_5bd=xz{@Zjh2j=?0M$fuUP!=v1T|>8>G$!MpL?bI<*v|HE&67v`AnthGUqto#q{3+ii~v|6P+sDM_@_9qU&ylE}IFjC>C)8%5){ftG*sVBmn}tX-lmu_AG$>F7@aBvwdUHB$ zx^tXy3B5RW&VwH`!9ko}9@;pZPi$~S@dzkBQsTeOw(?*x;Qw}U@@bi%hD6#x1hQ{+ zMdOjx5h7zxa>!lNtEppmh{*OjSXu34UXkqW_nn3=DOjsh=ya~JFBlx2d8vVt;@fu5 zrs9Y9#9x?qVoXZ!p9uNDeGIHbPd=Sbtd6`7h+HY6>{2_%BPcr|9_2EX3QROV|2AEC z*g8-(aec`R&EdEG%859;(gzy5Eg;88py{%&JjLN)z~~MUzqV0t85@eMa(EjUjrXA< z)aS0k&zjfsXyR5kH>6R+xl7J#{jNK70UuuG+XSj=B}0ezU+?UATa|zH!u)v0=Iut0 zkZuV^hXy&7()|3C-HOnb#!UCF6Q~!h@T_o(Cy&HLz0Jat5<|-4=gD)5R*y_TM|*RJ zz(NK@{1vxe+xDM%yEdIiDt9`_q*;>c1(Dxgj4*qtb(4L~K|v!PSS27JlEl{Gw~a7S zS9U0$_a9wjM5O8RNlx`QqSV^!enxJ`S9y%6XHmhxCHeRk8p!2?U;C%LoX*?ZHPmiL 
z$L1q(X~Hpdq7k@;PECGQ_HcE;O8-EyiGe0Sf%(O3pHJVX8h0t?HT7EAEHqEoNbtyTF`Bbenw!}rr&#N&=iUt*QOk{KRwhZKDczLu%StveA$K?MPcWvHYSnD?8X(Ha|=_t70JBSF+cn{blFU@zcP*g={H% zZ8a~_M&IE+BD*Q2^WJCK9HwWle>vw2Ezs>ft$tzHf~VafNQ6;z5x7H0{)DzN*m--D zBB<^8+=igDX)Raaf+#QdOkt=Uv9p1xId&9C8&NepQp#>X-K0~740Rfs`Cc~YQv|D8 z$e^kCy4hssFUVQ-N`lv)eBZj^W|TOPB1Fu7cks}@!d*jGw~3dRPsi6c(|)ePxZAxe zOG&uJ*X`z7I^jriNSRMo&xjLmAf3$w|6%%8cQhYjL3gpHQL$U*!DH2gw3L+UzCIR( z$8lU|!N-BhUU;}Fbq904Cb2B7WjHNg;0Lp|-sk&AZ{F_F*4Z05F6-ngdx;9O1L1=f z#C)tr<3AVlQJ_p->j9d=Lr0=vT_9ed;E<_&ujde576qp8KZ6>V9M@t)u{Goa#EI;& zzbyfuOI^`g8gN#DPg0;tSepp7daTLNyBS8YWiLmG71uq#pvGg0K(Qfg^=ob~#1%dL zywpK~m!Ru5=t?9%hYcpwv_;Z?uHWne6QIZB(zS-ZmwPlSh`Vyv@3E7R^p~S>oLQ)I zn7;Icnk*Le40+4$eTB;-%jZ}McV64hZ%?cF`o`{VpHagPb2Fv~`XC#mfj=HOR=;>b-vj!dAF!t*vV_ z1#77>3YOoi+S(peRAp!DR+p2NGjD6Z>WXH-#`SwYSbudrm+W@hYs|4XDavU?Wu^=j zGI|^jKiBSG;^pIeT5m;8ux&&(bTNf#=Cdb$*~)f7BkF(PYi9A%iw|;tV&^;WD;5W<*7$HV5|TioP-yFZHLunC+Rn2l z_-N0hSwo#CuTJ6}<$O4=+HO9cSLzdC+8|dqucVhk9*pAWA`#T*_aWr`f!%&;R zemTNnDi&+TzPZRKj}d(wPb#M$=V`7BvTkCpLUjJdE75RkTW{f^#=VR9*@KyuVv>Pk zV9Q7!94{?^Mk!%Zaqb{fxN>zppK2m@zS(l{U%W?6y(>^z#LT*O;;H;r$4(_h{=qD( zSi{$x@yLksmY6qwbu$jS0biXf>Ryp|XwVQQnZ2=AN(#O5Lga~Fw@?Y^GxOgDn>~@P znxyri1+9{N^GM_kk-w!*TVqgE`B(pq!bo3`Fb5i7slOLq*_`!Y7SgP=ejQ7#I^<7F zQTM=bWGawfv6ALrcv;9t$HqQX-%4JPW%ViDoL0RK$Ap#GPQHfRqf%crCyXO(g$%)K{m%m!K%im+6y0Ve|)5bl8`_qBa`g6D~LCg2bcL$H6qmzW{=h3E`IwYs6`%-s$FGVzcc4QRn zX3e-txqO9X1^dtDf@GcT)_uw+O@BXM17y^*va=0F{>ZC{KWE$Ael~!`qa-~`<}hys zJ7O7dH;V1wcrq2%z5P9@O(b!rc`hU-H972z;e5PrwTaDt??w-CtwATlam%lwl0eu2 zhCV6s)9}MvA60lDW{3j%csd!={z8B}LWnrK0E=SGvCYB{`G|(3Q@YFU{bC#F4d%rGlV9JjCELWyno8`V+O%h>R*n&`C}49Y{u5k z-bA75`DeH^E|@1TSBVF&g*g_FcG6oKAh?9IPXN=Rkp!&LBm4a#zc^IPCGRB#RY(Q0 z+&fx#P4Zd`FmH95t2A^IbL`hGWj7DkXG;>Ooh=nv^Bkj`zBtcav@MK#x$ul?`_n1nP!Wd>ZAkNs(Iv1=ijL3kpJ zMi0SwUZb{PM2T4kZW6*0DWCY>tTkH*V5e>C zbACig6llQ0JCyyND(l2=cSkU-vxrN;4w@2czSI?<{_x~WJm2N_liW=ZNyL~jS@gF5 z0f8vHdn-EOG>OzaCg4tH(gJeyHcp(KtkQ*ho$nj8aZh|iESNvV8MiT47oW2?tR|x+ 
za2&_(R}Q)>HL)f&VLSC_GTaW33>Da@Q=$UmEIAZL8*g9E_ZqR9zifYzHrE8J0&POx zJ>;q855|8Iq2|b{l9P6g82N%Co=t`2^5&DlfLq>TkNW)t{u*K>VJ4ue8fex2 z74CYo`4wSi)!@$}P@>s^-j$LC5|ECEH$1AA8)|a#1Z` zJY9PFfCx_zokx_ zNw)Gl#r&RSF{?npg9+!Z$#$rB&W|c7VciF~W+{G_Na@=lEGq_u2i3~9lFqQMi@dZG zay2#ZGZF1=1$kclQ?UX8+AqM+OX_sUyT=2U+<0J_eOn zfJF4wkICFfl0aUi029U2syRAW#*vn}$%qpu&3#&Y@Fi(ZC?9R{)+C|VVlkzB?P1va zQnH|n6h;Ugb!_L@wyKdiBkpDf+dA4n)irKrJ%1=N`wirRiQW-LY`w!+4)LcpzodLM zU>79sO)@&&4!em`^YthV0E2OD_vK{#L+%a)>!RKWw^K%puE&yzX?BlCtr zPJ$$ac`$@ZG=D>(q)3zrE3&bmEYySal^wRz^aw`PG~8)9*pXvPm#cc|13eC|xOb^& zsPdXao8QBGHigW|6YzE&D=O#iBBH*9dOE1$w~AoSriAtWq*B7eH=y2Qo=hC1JZ$w{ zqf&)PR%+3q=clN`j{W`>8ZtnXW_@B0XQ<#SRiH{<5|lm^MGFP4&n@rH(hr`i_K))@ zjNK{!=F1%PbXqsT{FQdnI68U-{6wu7at?J7IUo36pZOsz_IY^wO2mbi;c$gGC$3u0 z^!Z4Q-emQZr+kdI0OZ6`ged9TgWC3?8~^#mJA#M`>Nh%g84S|cR;5`<%gK+apbyGTA~}Vb-(#kZOC$} zaP-YD|C^fe#2vyw{o}K{ld4si-m6eqfcq;&ye&iJXjc^MfopjT=jF!%gCZ7WAJu+coJp{+TwyYubfHZ~uQgP0ZD3+`4uKX4vY9PLtqk4h4W zBt%>0sA(iU-FL%2s1EkELbdI$P~^FK$kX{W2nF@}Sq$m0r-j1U#IPwM>*{1P z%+|4R5(e)YUgxx>wJk@R@Ywa&>PqA2=PW$jCqXhUw_N+O@N*%)^UNAgro1xTq=S== z;6F8ae3!NsjF8oBD5^?a5iT3DXXYuhD1Tu6-l-LWt7JM-z^Yv6b&tU-`nI_T8QQ*` zzGfm3!b#VZG8DE5u4H27DHK}8X-}J@c8(oHGCc{_XeL@)NcOgQcZPF%X3D8XLv?xW zusd1xf8PgdxFMLE9dDX*rc&)!NrY$f}$jt4f%cG4m`)Uf4BV|}5i1=1`rEU3V1ZN~~ zo>eZdxz&G&9OVMK2*=YvN*;xVf|FhM1TL5khY<^+obFuMBA|tH(vdw%@=@Uz?QbGv z<*SXc=4kO@wM}scwleq@10&4nEu+uGn|iXXy|pSss(pP*=lIxN?%(1Ez>fg!N_nq&8~6V zy}8u7k)=y-UD zz|X4_QBeUZ7JO_)ZQR*RhZ-~Ry&AbS zSgo9`N*K}vb;6ke&K^y}A0 zFFiD~;m*6W164`s0Qm8XnRjoS&&QsGt2>qE*LN%2vf~5_>dV#qSl$O?S!I4R5acaP3#(-m$Ya9(cj57MtoW0P`Tg17?f1W5>jC zY+-ULA-xii0ZFyl6WHOLx8}=JqgKkHpQ7i#%SJTY&rCEhyqZC1HWTgPP5Qx{-d*fI zkEk)das_#JxQ&sIY!=sz)&-vD6Rh9-sH!z$vd(T9;epRqZ`}w^B=EfDpb#j`y7RA; zD^AQ>x^amMn3Vq=zZ!7bwze2wQQnoo0{h`w4GLVYD(inTS+ylBFOaarB=3*nG_@8{ z(5raFoulTc7mS{Y(hBN(I*4wlL(w~D&X<0ct{h%K$$dZ@KNZ%lpV(0o( zoHIHghSZ4DlRtN~MwegvXm#+0_`MZ%^D~3K$KSfUp5;6v%b;qXH<;=`adSvEZ#^%; 
z+C$I{5{5i9*=8~fW;C+mxa1tp4y(vAQ6|p_>kvUuNgXuMznrn;H~VhR2adsIYSfnz8}?mK~<7tj3xcLFFC8_=QIXoi0Y|f!w>VVavB^SiI!Na z3q8g`hyW*|k3b>UjvPH?%6WY6K{@S)@#T{Paa@uo4TV2~a#HrBn+9zaCGZDY{h#mR zyu0xg>iM~tc?M?xoijobP+$ejoZ+=$s0qZAOuyDmM>0%eMdm4MHw{sNzoH$3hO3aaF-=FPnPBlr2`A{dZo;5i_^7JVocZa05-o9(!r67}L}cv9Q;qk&bX7M@!D~ zke4>gy(`B4Iw&Z2&0a`}YcS*I2;s})U^45kGSiGnyXhIgroP*JcyrWzeYdh8N;F>_OL}y{9QC8RH4{tB_X5s`4_aw`HRbcg zx4_II_oS6s_=%Z?q=OEE47^+(jgm+(IZA)jIQ8OB8L&?g6kwp1L4>{GT2M@f*FLHc z_H#zQ1W7GPGXUHY*#4dZ%lBg3^tQ9K$ceEwubBMLm0%2vCh_)gzEIy!ABa7L5Tqm0 z`Eq#%ufD@EYGe6$5Phz7_Qo>Tq|B0zALY95nxK`(I!MR}mSg-P8VM`)iO{D6GY#2B z%ga-Q%COZ(FKAZ``1$H796Pi2THzQ55Z(=>NDqiq>?J$9Q3InY>q?S{mYk|~+&Lwf z86V*Y?$M|xisn8~vcP9V~3{xz{D#1lAGn~8`3(_jx;YMZxERd zQ@^K^K{jSuTSkWD7N}im+|FgAnLWGKuwAyK0#7B==C~@lv&Bj}g6iqaVqkODjnf4$ zYQSd;nH)d8jzcAwxL3~=6GW7<`I{nhroKnARp5LbgQ=Zbp^UF5{5)NKzHe#rht zGV<76>65vGKkh5+I%GQYKQuX^5EzZeph9h6lu8?V9)BMgn}`Fq8QSC^eBDa9LIWA? z+??;WGcRV9H9p!9(g@!Us?i`598hXU7NeU(s<-DLiWr~Y=*imL#_79T0M;x9A{9(y zbM;Av={_5}93LbeGHkS$w&;AAbIWA_)5A-b#BrQwKKwiSxBumAV^_EbCYxSv_Rrax z=1?k_6)O(t@p(xBrwVW?uI2X!+!EAV1kasE;i?%DIF!EwND&5_bV6#H*SQ3Q&=HRD zy&(5#@$Wv+ya|FVEO;kXlHKgKgIV@`E!D?sq&+gt=i-ml9TFn+>T|RG}TfyFg17I;h92r9f790hWB{z+K zKj9sUGz*;UMgBKD+JcSS2KaN)ObNhKmwqZudvCWcMV_pb^L3^Hz!33bd$4@H@}lmK zhB)E-N@^&X@wQ^N6y7%epGu0&xi#r+=O6}=dOy=fiXtLl_Y!%}Bz%^y;%<@JAIhs2 zz(o2hKek3@pj)mzCRX^EVR%Q1#XOY#nNB9)AAkPstdD_}oF*CqDu27^fA3$?#+FO* zBKPw8^2ZANr($BT+j)y9d92$^;h%~S9soc`B$UhV{;8q<-|?apU}Xz7_oAv*$ z4zvVTs%~p_f_{x_D>Py|{YR^*cN&qjXNR&^Utz^A7=|BdA$ZiyVq#M{K@kM8sD zQy%0%{jx%`41nqMKXlRm99QAG%{KDB1^(yIpYIuwpuYrj7~qx9R~Wb0FHYEe(XG<~ zlb^&seg529*w=Sr%*pcVVK)#8CK`VzW%0kBEL*HtXbt1z<>kc^za`wKznL_-?^kO? zc$kud0?&x>r-o|XJ`b4LEw&2t@$;LxAIxVTT?JvmsmnJyLKKpEY-4Cd%yf0q*isc& z&aY3Q!ot4roT~4@cH>BBvT}B%3DoNR!2Jr55PQ`qU<-3~SZp0CTdcQ6BGGG|Rxww- zk1hh+#q|vD$J$q;BhD{*PpWjr>>RCnXI{HrY|J%##yii{8wX{JkjGB9wZ*x*xn4f! 
z{fmqK?&RcK-?lQTjw6T7p5{kHHyuKX@h?w%Ee_otv4*N3+1dSEA9(PFZVrj z9WeU_?<>sKT?8AmW-85=WeVz@PJ?opNE>lO-bM=3)*VbWy1sO~%8$P}Z|E2onXR&{ z({J_pf!4g;2f+*1;Rnw;IYLtL6KH1BG95a~FpF+=elS2^!^K?dOHq zXEUG6FBe>aS|Vfd$xb0~$z%c#g0AI_PMr7fp?kIXy;Rf9QrWiUP_ z&&v7dA*vt>iM5$o*TQ9AQ*T1r8&#&D(gILbXn%JM?X2C-1Y56wP1R~o361%*kU&t zdXWGDc5|)l2Y2?*$|IbHgh+56JU{y{9Cq+AAV4F7#Sf)F)5Vn-^a=V%8j=8yndK7_ z;$D1kLkXkFT0JdRXJ_|4{j6Ll%_5Qwd)|+Qk!H_qIyp6EB#NBoC|z?;xBp)7k{zys zAJLtvrGzjP6-G=~Gs711PCUb#&1r6))6=gVKTjavtqSkMq3C8eXpK4_^xSIs!u-VJ zY=*!aS)*0JQ*U92_Js17=U5N=VKaGP!)Zb7Qq0PSY}*5=N=`Y#WysPwy)p3{K>XR4 zZ~dK^B*k#Km|G+lCB8bocV_ylI|t~)B~JmQJ;r<;67$;5Ztj=J;?fQaST)Evym@R4 zN|map%F?`9BRHBJP9@dS;=9@O)Jm&N&$-P z(EkkIOS5LUz8Q%)pXs{jit2&yPY0dPaO?Lwn3;_OD!nSggsF!V@VNfGM~i#2?q3d; z9|}6JSI@Vdgya|rs(6)CwdA()`;$;9GwDInlxcQ$w`Ew`Fgds`f)2jo7;}Jm_&JG_ zFwDMw=5QiRqPqTa@O6(7+o8t2{vtWH>+_}94%a`2P3DKU(y$QPq;*rA1`Ss0bQabj zp5x25eNm64RFNuJPHI?>N+>(}a!J6=iv}u&Z$gFAL?tckXiyF^p7&+?UyU97&Y$UJ zG)_;5unn7ynnoN5ay@^(Xk9>;m(tm)Ixj5D6Ju*VZFe*^Sju|;{E(!PpCHsXw; zQ=947?)M##eo!eKZ{qBB`Nf|~+r8gHxdwHs2G5fuJf=a?lJfHGBiF;r_Vx!OgdgS9 z7ZBF12hrx6q~ZMXSNZgqu+lb{&)Pu%ddaM8=_O+&!z2FSrT=N=h#J_GKR1ejfG3e3 zA`@3$EE!wkge|Xp&nJdqGK|ibdzBdX#h^SZ9eC{g(|vVt69uka;LDYh({mzVwQ-&- zk$16#h4axl%3E3(V}#lZZ0{;8ZRn@vwxt~2z=L*l#HjyHPNWLimy`$6N8vu{Y4CR;NiJumNUPC&FN@gJzH~&|^!uqm?^6o+Ps4M#wjZ#M6CpJTDy?Rq? z@#E+99Artyo5HSGjK3l-7B$Ev^hi6%Dhn)PH%~qc93EEq9<5s33M$z}5UjaQ9( z;A2f2$gp(&LOv6%ee5_Jz6&NGDZ)WZ`2{UdOtFF7p(u^~8(;{mku@rlSA}K2H$N8G zpMwIt9$%h2slt^iiiu|vVZukmQeJF}*}O^6hB3}pYUa*5%X4 zW|<9GJ$nXo+{~Uq$7w^LR);Ny{(E=LDE8{AvNKP@I1C$X%CkJDTSK|sf$5NYZ+j)7 zVT`9KnJ{2g#&c%!d^Kjecidq@b(YcM8I9ftS%QmA-dER97p|a@BqE-I!)Xg5ixAi< z4hIJ%0j|Ndagql98pfr268=xmi`GG%hlmDGi>i^bMd=&?>__m|l~I~(;iE3I_HkPI`Runl{RSmrSra*biiuUiC! 
z5QJ`!hA<>`AllCtSx3ft6% zj9})WT$<62j$6YJy?zB@a0E^694I{?oJ$#kl@?D2X&cKD(p4rUSx{v6hML?(r~^1* zNa0TF@iB{kmQN({teVG-xK?sao%^6jrXNLM$s^*EsN!nwf|BFeG3mefd^0o1Mk(c^ zl|Yd67;q98?7IcgB2__3y9PhwT{SoJf1gP34FD{WgGnT?e`SJCH7Ok zFLhVyP4+A4U-)h|=(L^8jy`0cWXi_=#je1Ug{TA522vtK*sJ2Yd+5%ky!i0FYW^I2 z|1T&5tEVA4oq36^_c9It)|3AYN{2|}IBP7v>iBa0^Ke&mtNAI}L5%+RyJ>FEwxuxw zdlC)#)m$0 z$<+Q^g$*dt`Y6`pw}Wmsx)KSLzD}&tJx=q;3&_HaVAw)L z?VFy>JrSti!vB^H8rk0g?wYyrJKK`Vxx$v}7<*&+@w%8eqrZXWD(q{20xrkU U>t+9G6yWcLw31Yrgwfmo0l}VkSpWb4 literal 0 HcmV?d00001 diff --git a/examples/aot/matmul_optimization_guide/fig/pipeline_N1024_doublebuf.png b/examples/aot/matmul_optimization_guide/fig/pipeline_N1024_doublebuf.png new file mode 100644 index 0000000000000000000000000000000000000000..8429131419eee55d008bb7d0d94bb3f94833691b GIT binary patch literal 73000 zcmeFZXIRtS_BMzpf+&K5pdg?kA|0hEC4dE_m(Y>kOF-!@AR?k7Aku3bqghu1{YbTS%kV%Z@YDx zz>J6Do~X=;yuW4t;Cu3H&PH;^Y8s>aMM>hZ^h$(6w(zU-GnA3%1L@y8&P*?7Z21lP zWgN>czjv9cfHS;*A`nx2kEHsD;?$XuRQbo&Ros`A*{-jcx~XT&(%g7OLHXY=@_Of< z6cyI!GM_rbAW#0ke}2hyqM}jLTQAI>`|lC{XH1-J z7@wj9>eZtQ9{huy{xhhSDdm~GLT9kH$scU>*L43sWUxB4RYlV9@UYSC%g{k)Ezk*O z8FTn;JI_crWL;a(Zc(!<3DPw2_e;hiTd4(J0n9w z)||mR$yhATz&W-+@`OF^VYndz``BxV+er65_@%(c#s=Tse(CJ&3|mLmjN9~h{eSlQT0>gy$|otOLAB}5+Btg1~YDk`!lTAn+> zFor{vMVFIQZqJSU`ZbgN;fus5W6Vz2>*xn6;WzppL9fP*BgQQ&#t<%VtgRWn%Zj>p z@PRZ=V&dY)&Z-fbUIx|~uv{5Ku?J#Il-H>mXB>+(!wt+g`yF>syZA>}le2f?!}ES( z)*NuJlm~Ys-qz7Ye;jLRu{dzKpImLq@xal|xoY!h%2{INRGe(R&$u;D)OgHy>Sa2$ zVSk1^7>pY+(BL_oUsQxt>}Y@VO7T=_XQwh7JNtRAGQ`^&{or1tR-iga?G9sB&-V7Y z2m*??%sTk^Rixi1H9x21u!*di+n3Kd@7Gns;C`Syw5Z5~s}sh1E;w9@j6=?S+gw7Y zvz66--ygDCHP#*8=A0E;u6*e`br%RkMoV_sL7ju6YsSRcLsC|jCGPrupw28zJyi;n z8l6-ZG@sCjjqrJt-gM|}zB2$sev{%7xss&(=P;!UYbiXFmy(*YcCgK*A6uO6@kUc3 z6;`^zL0p@}ueNCO8bv!AW>eRS>8!5iYx-q$GTf~Vn$IK6D9N2Wb(<$=hr@$uzB9{4ou{ytI|MZ8h{Sb=vDys&k% z%>I_IS)F@ZU!ePltN#tY4KEbj9!^IeeX#5w+B}Fq&$8ahqY*}t9ZjS})VZgC<4Mt)j?)S%oW8%FD!1l#8p zl>p+XOvfK%Zs7GI?}M3B^#RLfKeKCV?-eUOyz3~-rxm|9V;o7{e&P<0{@MX~D~A^k 
z&b*2uF!`{y0r{q|7F}c~cqEeZVAu@0$fPzH_aY@>)6B=T6;Dy!gC2;EVhRwo|8J+x>i6 z+1gKb!)*dTCJLJfWIuSMEJRDu2DUMJxc+M;|6R0A>ehaa>@!K}rzBa~V5dMU8JTVs zWm_!$g*V!keAZ1l_8OhfS__dIMi>0BQ}l0aMFAj)QpQ4z+Q=K3c&1N<`!+IfwxkUlFFS&-3(2fBV^;>jW4OEKB9P}a3#{PZVq^0`qj5ni2*Bcw{Y=F z@vq%@u?fYHr14hX`21JzGC;X8$||gmxiM0r`O~RIBI*kHmyCSCX9n=QEZ>T4c@&AgSH|K zE)4eQ^(U7(8&Tz;NlKs|0^;prHJ4iYvce5nX_Zqs<6I+RC6J7=+(3*>frQNc%M%65 zW{$Etm^9pHK6Oe<5|5L2iLcwzKwnQ9~r*UkTi zF8>aB^4Rlgy-IM}buEhdMtT74P!sQitjsvYvtD>lSI!K8wPaw*SNG3~B6LKWqJ#uH zWD}@}jEsz}tsm2L-L?4QxjVAHMQkMzzcINJ9?mp7??q3W+{OAaJZ=5d!6|)-QY58G zQ;sh?i79aKZswC)uF`>Md8*ut9Alh{XH2m-p5KG{if9Z`pm(zxO{Xly63md5e!znI zAf~i|yEh+VadcyL!ES{h;3D5IW<@11pxew>D1p*2avGox<)J5)I9}qpb4T_r{Drhw z7(_7xnvR6K(Y4A9^L(sve@q40>nXShVSWME485xgTOF(5y6HPQKSbRZuWK%gQ|-RH z<@NKlrn?px`%x$|2>3aIGLg??C{M!~duReCbpg4C{kv!k`}C;Il>Wlb|M(ID<6>}j zaV42EF}%kk+z3A0v5Nf{E?mUZWKS?8UE7V3l{?&)8(Ww5$yeeQ)NOjr9?Q`%U~e9H z^f1^JVZHwY8e7w*H0tSvd)pF~_g?i5BL~ex2jQlzG+F!tRBMTfzC4Yz_nhXz0XfLSO+LJ^UI*2lDNQA@a$TDR*>Qr#d^g@5{AX{oA|$3%m2*WEiirc2hV7eS`XH zy4n)Qnwk3Dq{fAI2+zy#auE$<$V`%DM+HVkc31-_^IN54zvgOdu|CVOXL6MKI*0IO zO&*#UyidB2qbp?_%E2`BtibA%bt1LG-a}bQC%ey~lt$_7>y~kG2?nISNj@s&56VN! 
zj%T#Ph4cGY#gj8?k zvY%s;?#(lSOkZ^@)VZQ++>p#>^q~bCYf(J8dG_+xQCb%iH> ze4JEy2>r-4zO&Tgw^X^R;&hiu#C>0_;_lN{^cs!it)(n%8|^^+EDJPTwgmSW>VpQQ zhMM^v`{4+>AD!sWg)GvDiAfM^Qmb1-YLyNId4d5E|7F2{nN};KrI0#LSCER2>A8-) zGFQY^I^cAd6)!pOq>u~R+}dsOkN;R{XAV+stJoWA1hMkwa$Y{6)VcAhrTx-7*R;Gr zoj(lq_h}4K;+3~RYomb@Ck<8}a)w3?lo_uMJdptSe`hQ<^-L|1p?;Oz_`cv=w0hGY z{IlikrFU+hya(0DR#c_rzm{gKx}GEShd-vEq^Ds>vI>=_2pML4_XHqZG}sDF_A4`s zxckREXmN>BZ4pi*!2FaVgqBwK{wLke2EMT;mA_e;f7~r!tyht+14_fLwm zvN~^R&*?^6uxs6>q$w_!ugn{<8b}fslPG(kY!m&b11JKKr&05>>6&~0<|(ON@b{J* zx>3@qJpFP>_4a2>iDNbOA52`p7s8Olx3DWAM$W++G30#@!QRP?(V^P^=S5GG4m{U?0o ziG|aEP2eBg=x;9^OQMW<){0K?IC@ub_?d!6P4Imr{Rt?B{1^c;+lH`eGvxTgs{f8f z7RqGLN)zOWHnM&kN+yPk1@=XhZ^e|Sq_C{FUp%t6ImnwF)06Mr3K|JZ!@}tH~Qb`SQO^gj`sx~zE212 zz6b3<56LBvv87$*$z~b*6#~u6Q*?H5D0h)?_Sx_rAMfuos-lM0xQyGRG)2ETaq(IY zL+IF(q{@RKt9UJgGE?}YEDLMv`4Qx(eW%T5ZD2gI{pTN}+wVotP_lU&YTdqE0S>^8 z^|2#C#F~QLwHa&~SQAr=#tZO`$FV>SOA}=o=x$+HcF*E>IS6q?R zs-&@A9bjsj&}mbWl{M{saJO)(-bYt?;|y2Pt<(y9q<1M7MY|Rs71}O{~t5)+9kt)6zXvJKPK@1oc8}q6DxKOQR8#651zCN#eLSl zE6@bp-imql!^z1>@G`ds%=Zx>go4ucyujjJJ?|w^<~{3)9;Rp>0oDj3b|zmVYvPNEhv!XSx7LRmhQ9v!684+tv0wm*g3Su^CE!_E^51KB+iy zF|lMtbvin%+twBN-9Mne5G8&Q*(WNk&}Shnh-_2#IC9g5|K_LuWpL@x>qU=RZhP}h z(e$8fK|4*!Cnd6G$=0^_G~j8IO`@NwHVOTTDmzZQUR!c5(a6>9*QCJP_6ESBE7<3L ztL;hS!4UAW+~twXv~iDau>uBIHe)Yw4 zUzCCC>3}wPq%i!4T6p*Iy2eOl(1jyGxwjiCit8p|BjU0N<9-x;bA6y;O1W%}AmEcK zWdE?FOxPD$EqK)JYOWkD(l~e>X3v(LfIA}glMc3Ck7C48Z%d1}jw;6PaPE+B7|=^% z$f!C|lS;CRy72QBX#E)o(KO9b72Wu$*)+^!UKz!4#D)rn$*vH=g^#!A3cu=M(iT%| z({ZGY{(#EagNm^n8FV8BJHJo>fYXoIv_TxY2L}hMEx&*Cz`Wt1PBPe##Lj-YSuzej zWL;aH?nUS;oBZ)8`&qrq#-zm8`*rac(_c85mml$9{I_Nm!l)~E80y@5Ob@@eM} z(ipn;N>`S0Vw0Y@y{6Gld>-`ad8cb50dPFj_lA_~Q$CE_F|~R+w0W?t-hqKy-O7^7 z2};`ke!402D5Cxv2D|Q4J3O|~F~ss1#qV@nBh3<(pFfF>KJ+Ctqq@grH=PA{KapMy z1=l{@zzCp;#(ho0kSgH=s#cc9Lm$$l)~a>Q#J0JgpWL=b8bO-6LG$Q?Pf~WwYdoZ; z`Hqv@1Bcvh(*e5gqQ@U_v5?(eOh?ltG^<4}ybV9G3)7d|B-YL)HXU8W`;!p-*$^R= z#Hh+nZ_B>_RM5~=bt#Oq#DQ!^%mhI{Krrge0gUW69Us1h_X?kWEVs!sJxb7( 
zJoSvG_~j|AxcPpnp?c6I&*^A3h@gAD?M9P$(`zUP8O_p#e3gT%u&ANWxT&I<2X|oG2N`4g=A%ByOLLllD@z)3FgTB5UHPcVew0EM-?4QdL8eVg;8TmIj1PSqdU{y zZlk+RmoP7&pthZu7g+&4Wi?H5SI;%oT>^qSL`)wz9X~%mx{72ski^2w({S^dXskYs zT5`>fMDkZ(xA9J|s#uln(4n^lu z57*+jmc#iQ-WO_qnIA(2X-LU7RcbhWh{|tXl1$wV($NyWZ~8MF(9|@9jpH0Tj)yed z3OX{}fSO2F;R1}%z#}Y4*lxwQ5PXdCO!YDt!@glTNaM^F4bu4V)V&57y-0Yi)=%sC z=>vEXlYygdny$g$Tjv!vS)MXGj{4LW2_NKV%Q1W}Xq%WSi)7dpE@L{3cz(r9A)bve5Tlce$gOf$&dc=|NoQq=Z00t z(u-oPy8=aoPMg8G?XOR>V~F@C0n`<()fOIxvzk0%$&QK3txJT%l__S zwTq!p#+Lsg$jWLbqz>9tVc6s3^f^H#_pqwg55p1NQCw3aA&|dl57)z%?&9*#?3w#5 zJWd~6?RnhYZ*tY`L=vFrhE{wCFIB|l_Xm}fBD75DQhwhqa!NUrl$3->u||w3<;&Id z_nDSYPd72dk+Dbha)w{m#p`4Ck#vzaE#h$$zmS^^_RD?d<*lt*tHXO^@8`V_$1bs5 zyH>Gz*y~JR44}#(GiG+pJ8cGQo5!iRv?2oqD7lF?;TD7){q8P@UuBbch_}de z#+9b;N1~3@yO(5I7GFXO;P##_JwneLTs9kzL)866B2I*I#oQr<=heKb;p45RiJ;@A z#{FdKx24Z(?Y?u+7^@Bb%oR+P9M71lFD`3FP2(4dv-AF78_yr@9UbZ+MeT3T5+lnF z{b|G+ans>P(+-9j+4+RCXUXmID~AJ>tgShR5+L-2>@;r|lq;^TP9VP}CjZ^BX1ACa zg4kTKHSwI641)C6+y>1EL(?#081*sVMM9esjuiJ zawWxWt|NV9wZ!bA(;uJ}%9>VN5Rbh0n#y7cXTqP*KlxmQ=v{hOs=|9k;E6)!n`oKA zmF&gks{kPhtSSo3TR0XMsmhK!l)AQ=lLHNw~k@P0QSuJMnibzWF)W42WvV zT9!5Jy$0>C;+a$KXtwbmcrN$J)_dYrXr7VKiDt0aqrie*039UnCr~RtFRxunYU10o z;?DQfkWI0wa{~@P&`!>eZm5Jny8*D7mX=KJ*mFq4_$vN3%kbPt0<*OH!l(557XwPq z=?r}|HIoXrRtCP$)*6cfe{}2LP|a>OHckOy5_)SqI=t&KKpKr6;PO+wcVT}3(toWi zg!Y{7lx`5YeQYtP0P)t$PHWnQml?UanO_|cGaoE~R@uu{{qBUmipx8D_+9RxW52I{ zGki2YFh-;eawKU`SX2aO=eWSi%1YaxsZ-0k=(%yoE$zQx;oCTQBStX?du3At=P_Qqtfa_$8#Nw6v*DY}owa5dNH0dZ*!t5?AgZR74LovsE7bV8YUx z1F?!;g#~fqt2;yWQ~KOIJ<)ibDg&(u>dzdz1NtdWo5KcX7l7xT;^JN_`|HP1>au$2 zX$-Ci%N7-m=d&Dkl^uV`9ZD9VEY8lJsyT2d?@g63YByKS&3_Gq>UK^0?y^{_d?PdY zZwGW}^=zTS~P|lDrFq{DTlpvi09-igk~5 zE%=tpOq*Yde*DF{pE;2uXO5m}C2{)(;wgbCbDCT+ zbAZdvw?C%v`O5P#Qz{Uir&50&1iW4?fuqE!K7D|emP*JRL$j;6K6=)zXO}jNZC&7x z?AqDVH`MjgOBc6N<;vEDLz<3uKPA3?t|G{BD247%m%<(>*ZY$cuXc({WB0L0@nE;n zQg-jT1MSooEK~^Q%A|Dq;1d;u{M=ezA<(U!e4^Yn$cN3P66Ag>S@a@GuO_(n9D3`E zqSWt{GNdVc?X?1{WqkLdz&zKd=^&ge{WDYgU>uHq40mXx*)NAvJAm$H>hwXDnIfO8 
zlj!Akc#qoy(GAAZxbd)X#V5A+$3P&g4ZjlBMyKbU?h=MK(3ROJlvY}X*(<)yOeqEk61yX^OTr=>U2#OKi`rR>Z& zv0p0LkPm*mpTx+TwBLz&)<|=9T+bMBJg^r+MX&QzCMl;vvuzm=_E9G6)ewx$dwJiq zn{Q=Np}Ob@cf8GW>@QWmkFTsPKcs^slp%I2n4Y88*Wtlz6S~=C;i?{4xC$$Mh(1aCNTZuf=CJroEOl8<3`F{5;p%YR(utzR8(Owx zG+XErrfqtr(D?s$tL!{w%uihlC+<)npEZ?EE3L$5QA{thfN|C>rY|P0`qua>4lY-0 zl09TfA}+ZF+fiAkN58(cxqfX2BsdRZ+kYE{-<&@Ru;j+2a*OF=5&H(&@fF{cF}^LM z@DIk3SlKx$tX*$q6Dk5cHmU$QmR*svp5A&H+^kcg;TI%3eK&qqy}J7NV;$yevl;Y$ zwSQdVMHY7f4$10^0f(D6-J-inY*Mxk2{M~GoD0>{aioIdFFV710Jq%&1B)vX5EY4T z9W$IlS@pzL!O)vBtzmA1cWT4k{4iExhZV9!$`PeZ?5qBqFR)(3XT2oRG*&0%?YAyD zH7yV77Po){UT>ULb=V3GoAWd8(8e0+!d23i zmxW$#Z#pn)5NkwcFoEKP`Z|v5KcBvCQ{h=}Mw?)Lk#{yYx%j}KnnBQw<4FEYFmU-* zN}!E0j_H<(&zj&9&r9+D7N+Yz%^Yl;_lq*8i|kE!b{q5})2CB?XrSWWlH_b-+8C>p zF8Wp}tMr_z*(ID@f!D941x&*W?;#=uXM`*J)ux?sTlZNvH+?6T4E({8Q>f7h1&1jFkR73B%79sVlslpnxs1Gy1C4iU zg==8sR*#`J*V??Y_p85V3!yzHf1%co;I4)RYbs_G!+vb&&C}?m5S_+EZY9$d7-6$+ zTL4LPD_gjuThRb(LN}N?#RyGu+J1i73$~u>DcTKvOPSB#7%WiJ+eR{AnJh%AK=Fm= zkcm-O>fXaxmlZ+uf=$PT>U`$VgH1J_si9}Y;Occ@?2+OFNnfLDZmE4>x~2`U;$Ib9 z`SYKf&7m{8-$@|V4yr|SOmNX*lAATHMEGdq#qKGpYJ8U##$}BDOjnh!T3me8%_5kP zeQ6nXufC>()!a|rRR9PISDiXL`X!=938ub$X+3;`Wly>l3j12KR2sWaeB+jHIYnkC zylQozW4Hml4){~NfGX(Mwrxu`LQFDF@%MY+Vo}~sxD((oBf8*d{DMG@q2R_~FQ{-DqDpzSa?`9cWPt5U`B>^}#2 zT`0LDyjpw3+L}vk(IA&U#)<1y%QwoDE;koX-&raHzxVBiRhr81&hT6waMXqZ=9$a> zvfkwWERq-UEw447#}T{AUI|!N=?iafCt=S)@()M&MnBInjQLWgUPcy+HudIns3Ktq zKZsd7r~jkhOWQdAB+$Kn$uJe9xlyAUYY(z_Z`R}%Tk5a-Y9BdZ=N#=;<9mB{d80sm zp?ytb=`E+yAV9BwjtBGpsa4Ixrl9~zBS@3o&-}P+wz7YbE4m@CO0g+lN`e5-2)atx zAfc~I9osM;bz`bH-Af@}{924q*obdFP9QsLGwI{VTOt5HXfOgwbsQsHdWQ#ht-o&! 
zGDVCxOZQ>XAt)7}LL1T!z{q>_J?Z+;(9Tlfc+?j$mZd*q%6X zddA%M1v+ZI=%E!4I)jWSIs&7XUqV9b0B1^?q_>v&jd|{q^JWzgD?DBXKP;v5Q$ioF z5g@WJWGK#)w?srR(_)}?f^k29{kZ376nUHuZd{kE!f^aJ3ZLDan5~Q5cb^|U?B*6c z>d7&#kEwG&u^x;DS&l4QLfiVavSiF$35C=-ifcpf=^vG+{Y5*J?^Ai--XFKd_TxnX zNRYwKE7DUGWDGy4F)nzLN^yZwJs{x*C`&fokRYNt*@E)2Qu)`v&>`oQ%~sc^-uy`PMVtOk$V&7#HDp2Y_)%j(No{SXl^GBCPHg8G8LA3yU>$J&E3(5`wb0J?4xVqsRnG}R zBgbedYOCf4sDC5928`72W?FApISL_*s>}Eh2CG|Hp*N39OI_JGuK5*AHJrE^0#7s1 zlt9|aYFLe7KtQS&Zg1k(FD;q_0d85+9^(jN%)y{+D(T3VW?_&1jSM}RdK2%=p}R;Y z$CRW);n2OLZka5YDv|t1mCP>OigBQ!;hn*Dj`ueNeLo6_U>|GlKkQ|o(6LZC1!blO ztWVZXm6|oWGRDTzb2{$6&QK>u^!Zo}--RKH@ zRs3C6N=oXGlq0)~i1pqcv^@?qr_oxJF&{1=muw5!5R9a2ioNxKq1c&k3oi)sRS7#c zre+^gKQbj3s${*0?f1DqHzUd2MxEaVU;V;}8yO52j8zDG zRlEC*o5w3z(XQyVz!g}s@ZFyta|id3Fz0a68dg&FtC#GVG&CAbZjXoVla-bgfkJMa z5E4RBsl&y}YH)vJYCap1XQ!zvqI@b7MO@nayW|*kohk$zkww z=bQEIOGdwMN01is%aCE`6TiI^plZJipj6T3gOLuyBC`BxtsHHJO`3g2`+^9`wjQ;s zPLh%x0?Wb9dx+me->etaH#5IgD(RPVNmIL*bjZbC^0->ii8!C{j+R${(x9j3Zw;cF zrkr0_(7J&cESrO75L1*K#qsd^>u0n~S3-68f(MlVrnUaPX2J7NrYht+F%g)1S>3H(IY-ah;{F~)n=XMJ)tyU=?qEF$rn=1UcY zNCy=Zq)@Y2Slsc+aAPS4+Yaw(xXaY_Ji6N_);jz*%j*WxXshLcGgpE?bEHiyDzHTY z20i`q<&JKD)JVbi0jE$Z!$jeR9w z#2B5JFp>4u!*D?c0ai}mLN#EsM%rY(MP7f=B%~SdO{3~E0hH3D8ZRj*M*P#Vz+T^ahd_2TM#T@%scpEs%6n0^er%O6)BT2v>91-^eG zqc)Y_EFKTZV9CAlNm0|WOO|R3AnP-oe)mEp z#3qcY3d^u4CX4&k(teuy>epQtxS$I-#v_8OkZq`w;$w^ZnL>a_ZacqkmEUgOR#{#> znO>ZXk)!?DfFF^jzvkpQ6v(^S%0-ormiZ(m&HE{b0(d7DJ2Qw73n&~)55NUVHTPz@ z64uH!A^F-Wrc+aZ`-N>T2CZl`W_?38@>>cm!zx~T{JF%(YMmR-wY+GrBHaC&uLgrk zs_bKMa;*D$Esyg$$2B9@{lF7dT}i-9bjCfsLT*;&9EvE#TVyuK{D~ZYs=q%`fHiVr zq{M{Nw<-R$5?KO3zQZL!FGcs(UIQ954*;XcRJ03OA>=(brVV_zTBpgD)M>Xqq}gvo zX3SS>_0%KjyuY7*`jB3cmn`B>VbuaMJ54l>w?fB*g%M6vKgo9}T!OJv%~j+}H)ZQR zJobX|8etZC)s!J+VO`7M6B5+eS4zw+kzB`}-<^hFV&kTXre;~RL<%jsc)>~Ug%a?H zUDo~Q{`!4PAj!KzprJH)$ZXGaf4o*CAoFv%TD5F zaxSv}R`ogV>Ad`bNU!%Psn74=kR$h32vr5FGOu!SUd1WeIWELg+NdhC)?y>z!Phcu zu{}Ud!)&?k4>Jgz+xQFr3@`sm%5hv;SRkHvYj?c%y!VI_J9cj`Z23N(BKN~>*-+*F zVD#4VBi{0yI1T$pemxAUD@v{%0mP? 
zIt|%o6+d!vF+m?W6naXn586_^tqReRFxk;e(a5%@QUF4h>SRzghCp(?S9&AW=vU5D8R~+`hxtY0i?A_ zoe#4orG!_n^vLW>2#(F_0AfY3HZz2uAF_ z)P31?2q2r#aI_2m=27$k&$6qtrNY(7w7_>~@#XFkD0R?Q!DgIh}v9KOuP#N1CM&RA`Nw+7N6K!)XS zZK|{OO*zohSfIw>&jOzyd{F1pD+Pm^&>V|_%xD+FOtxt5bkK}ARSi=TUp$jMkipGc z=Y^}?tI{;@{=U%W6B`9#X9|TxhDYTrPKR=m*tE1jtAHv{ z<11P*ACp~SkYREyI%azDS6d^EP ztqa#nY$%ldlx{g!kr1?hm@ZidbtX^Z;4E4AWTZAVsySeSUMp}i!0sWl&Sb=|Q z0RY#7htRj^M&fI2@sCDabIZmIU4PIfzy~Hj*$RM+f@L|B4=~@0>bmY;2#Mr-VY?$r ztNk?C znUjdaz~3JlJ}49BwxzvB^qd2CGYw{4_)B~z??tm`SO0Cho?NpS=s;3RNxv@!wy1?@ zH+Ogh?Nqnjda0`DRB~^^CL4eA6GH**DN{(`gME+s*g8A$feC7KpyLiOb&trqoBMNw zJ>Yzd1A>(;RsHxgd80zEuxu%)b7U31xW}=VVx6fA0yVbHy<}tKE&y73{!!G*%*~)k zmaB0jIg#aRA5ZB<3|d>BAwN09>Q`fJZ3!P*mHQ9=9fIX!xTTE`>^n;c0RKQ-|4ujJ zjB#Y4q;a7MdHeQGLC zTb<%uT3fY=9^&j0UvYAEYm+9M)wi;?dcj>YAK4DS6fd&WYt^{cNBRKVVg;k)4;_9$nlE!U+eYG! zrY|$XRfT{X5#uT95(C@C>NOeFkHm~dI*eU|kWO#=iN#qFs7)n!(0I4hEYe`;we8cY zY<+3Z5!TJ47p=DT!dhF){W-=_6I)SkJwcd|W@klV6Vu~>gzHTro9?@fM(^$jO!>C7 zO9r>zhnXy!?1kX0k}ZXC4dUL%h+DOtm2!zrQfrvCWG%agyb0_THLAf*+Q&YE`D#-a zRVCT?_?Akth>=Rh!@AusQyqf4qR$aI3YEkpXNgnk<-HEcF6#;53CWvb4nm6^QSf5B z6~>5M>SYtpR$R5^iMDv=*1-~02abM5vqA*f-iD%sQZr`QIpqSw4qZK_6RTz1cR zeWqhp<`U%jp{SgvOv;MyQAp+G49)cx$0AGsE~_$O_YuI%?R$NbqtwO{>1&172jauW zm@@B~?_tc~6>z#ihh(GGJbS??+Jan6eyVJux~wcPk3|qMH4@d z_%!k5gC#}M*b`$zc?_^)wzGz(c)UMiU~TLhXW0dE3wNsBaDwfb2mBaA(|)@cPj|+E|doQ0Jua(TytPKwMZYE{XNU5~A*1Si0K(>MYaEr-AzHl)pUo83utfuBS5 z594%>D<^d&x3s_Nzr$Aq-ctrI>UN*=dH%KRd5@1W_&ll%sIKJVfZL4MYkUT<77v9O zf>F}Un%FpOvJ&i<4QAdlwTxZH`sHTT>cpyc?sgx`_u6Lir9N^|E6W+!up9Q&pr}1M z*Ne1Cyyvw|?0p_)>ns^wmy;4rl*qVSJBHXlta0n_Hpk7Un(bp+I|NOJj4F$hh2O6C zl_jRQk%nJ)Cm3g@HMxg!in7+3n-9X)J(FXS%o$x^F(C78kefZUP-f4#ck3YvG7bAN zh7F%KSqW47?kXoFd!=-?TUja`Zgx!1oVs=nC(j)F$lPx^*viPw;30U?aDvz++gx53 zLeC7m&|Uh9!UI=A!DsZB+NC&C{`TCdQ2*x6ot(wbCEVv#KAP>aMVfM>Om>A8=Yvjp zKbS1;?qK^K*>rHg?!4mnU3yNbdt(f``}H%6O<}65w|dXZT^;tl9P>lWw_kS>^3C3j zzGdrUTo0-+-RSKug&f4L-SUw2cb%q%GRlqYl^qY27(FE;3&j>d;vVzF0LM2|b5 
z*|>Yk9CCqc@h`r(GY!hze4a~o-rc`RM>9S^8pd6%&yse;U{0Sq5scj!>yRMrS66w0oT%b z^4L`cwB~Yi_XH7%2EqB09-o_@EX+HTg-K%=%GQkxjO8?GHa((11*rM6Da1@-4%7lk1kAwGlm>1@yR zkj|UNJj&CRux-@WFn_p;lcb#{8uf`kyE&=kcma3KvqzAJS3YM(cU5cV@ZrLtnS7n& z29c^tu0;6zqFZo>m1S@eY2z?Bc-jDfC>FA{%=5_Xq#{{|5WLm;cKfY+FZo z=U)!yte>e>?%+M?L=C9^4WY#6W2cxQ4F)@n;>T71XK^kTft1wpz~YfxVa07aB<9|a zzq3cg0cJ7BLgXbp?P+4QnwYdx=RmhvN8M*gG?P0sXY&Ss^1kdo9}b# z-_dF9W-Xt{sycErA2({d|9!fN%~JZg+J{_faDc4aB~(VYa>d#^;Z$Dx0i`8(gHL^O?=xkUfK8NDI zGzkh+MPwH(u`JQB|(+L_N0UK5SrwtQo6# z?*`q``DsY5*T_j?XCTEdaX9s*oc#}=?0F#rr&H_Uo!e}-WmR9@Oq{Bs@Db`Ufl`4r zrN^Y-AZN9K8QUMz#h8SCyr+TXWcIr*T=qt(UKi1?4Lcpwq#s?*Ynxh;M;bE z>&$6XDZ0BbGfKRoN6V~T&Z%$^fXUkaB;*4fs9m=y#_}c_rp|jek`zWE`!%dabacok5P^bU7sTa z^UKnbzAlc|bR#ux%>q84fLC|sOK`m1%E~3GatVUms|2BkUx-wT=@DKR=}9}>YxG0p z{y14jS6O;xE>;!8FWmOJzen(|+w3;` zx8r<5$&{{B%vMU?NFgSmKi?i-zP9>H#a3e6zpGtZZ)9l|E)Gk}n<>T|XCjme=U~r! z@3@UdNDO$RLkJ_dqi=AMB;S3kX)VKq_niqyrRVpi)o8IPQix{J5UUx)IVHAsXB5Ff z*R&~Q+m%qegV+{unMXEDjNYErh9+&s_ekgM^65W{mG#9nQN$NxjhMj=Y#HNQu-Ixj{5~6E?DKrJiD*Ibj9hW zQ@yftQK1|JqwxbEC-BoX)-r62xFc+{HNmMhZTZTQlE#W3^9rV$;e$Nf$ zi76ul1ymP9L&V#Xqtw-Xnk3z(8){`j-tXssuP|9Ivr=&~wuMAhcyG-w+5&cm0J#}m zSI3T(vN%oEjQiDn%+CW`1ik!6b>b2CTnqcRo-{rJ7dbKM$b`n+L|Q)|+k{ILH+Jn)KpG#UjCc5|U9nsVUR&ypJdkO^CIG zbagOijQHAJOeutUZ!!9Shw|HkeQDRiRYjWt#K;#%Y4{Zib(Rud`o#L?K7lfZNRx?v z<;T*;1E+_d`I;AOPe+p`hVnQ%yz`|yOGiRQQ%$`#vIX~7iRyWl{b`&7z1OHv zd&S{+x9#5Kr@mYmNo$=F9@3+Q-AecGEQfEaGWvBJl|vF3;?gUoip1-D5$XfOX-p;^ zuXKmB<@5tNZVy%IduBmX1HMVwiObF1IswVam?N`#$*2P67WxLfEg*KQU zLkS0T9Vr)^-)|WjD&2GxB29i3KL`Wo0!x3~d*OX;)+jJ&Qjm1}&A4hm3GRB^K^9|G ze^7jx>K#*D;a;Msr^W!?n{kX}ipS5!1DboupUQC5$HNL5#NpNET~T&R&9JOx@&W9C}Cx;d(L^6>$;06xU|T6@w1{7C79<%p#fKz)1W@)95@?IV9YyY zQ^aHSHDlvFMp%PW0Hl_3sOgRquaD>jk;lf(Hkaz|iF8 zinT#U6N@i!tL~)-6|}qOKUuP{w`Yr3%(19S_h&VDWyS|*v370Pwhk~ zqxQ8jFEyq=v30HwU9c*`h?fZD2w?KkXm&QX-UIPfYTnPA-~*z@ccF znnHt*6p$wGF~2)jXv%!AR*~kK$=4H?6XSc^2 
zhPPF-+{igpgM|hJ!CS8B%#YqqUj9_nGtpfzH(qBkeR4BF2jc8A^Hlg)Lt+Ye;zs%A(Eo_+*B1Z(KxA(#W{RKv8@lsV_%lGWR*}B~g0{5f4v`*^1Wh=d?E?uiuM1T2EqEVN-r_FBhqdtLvZ+O(Q z9v&{0z-=X&2}CC9{C)vBqqr!$?)|zFJ2SgHZI7+$q${LNSv>pQ?sEMxO;GgWjB|D0 z>h3H!>s-&U^PyRBbwBqaFLbraXwH4!Y{`TxRue#N66--8<&S&)WuU7>@RoICKc_0{ ziS)SX1O~DCaCrLB60g|3l9?du`zy}2shuZ$~M3_c#F8i zB&Z?(2I3@IzkXf??+vdY%835G7gd8!p|Tr60qd9o!egWr&-3!9Q0?yaFXMl*Wg^bp>VlkxND#V-DwGDP_n?eSn`;BeB#=Jn9 z*P1&8@VI#rz)IcwZ}7Ca3D<`FK6?MCMPDu(rM%niX23h6Km3Xna%%iVf%i8BytTLn zri@rh2-DqZ5C**q*zq;uy|49ykXU~&@rI$d6lR@gPqa&AD*+7W>|Et#DLDwtAh+eU zJ-R!Z4z?8tTHV(#w0TjInzSnmczk5y8U|~=_b_mNy?8<)q>BR|(zfuGXjuAa+1(e; zE#Wcsb&pk;l;Od{K$OLL#Fa-~hEPlE4yHb~E$~b)clXlZ6~EgOq=iAu3s-c5qOSK{ zNJ@L0(uY=%^WFB7G>HK)wuMLNk(*#!Iq|?SL8?M(Dn4H(UX_b69+ z5Ep`0yEkW_GgVxrMt#Ngn(OPEI9c0Y|JecSPcg$#^K9yqfIld>;Nr^`-0)%})8kt* z7n+^O-z=F0Dfa(DW+t-bKNWOdpV_~wD$cUOrJhRfVmX?D_#AH+<01=bkNtDW9I`mDp)m-?iyfL)F3&x|An5Oguw zB&VP{@prtgneH7j@aDpz`+V@U%%^jv=(1?Ybl0L z8}2y}f(@9X(Iz<{FOSSqj;NmjVi?0Y0UExY@2TIZESWG)$PQWKfvbAL!sBCi`NZKk z?H3NnhRuThVXqv?L~7iwBGtq*Du<@zT_~bsOe`{k`>xA^y5hdUMy&gSu}WgWN^Gob zo{}|5lO1wE9~W-S>6x4TKx%_u zF8=Uy-n3rK!Yd*(MaKObmBGzLkjX*18QQ*aFJCyeXh@E$fQ69=E}W`BB>KDFuf~b* zBUgNTiiqQ8rnF*nhXZ3*gZt7cOc4zV0T4Lv-$h*50WxSyU?~IIs$;jRV8=k3OrCT8 zGUh5!r{tjTu@)gZOKv0eaj5)c#*jWB6BY5b3<7HtL|;yz>MIv-!EWzSXSHZrLBVF4 zX<>d0bd_hXHl%NvXK)}1o&g`#vDyDvMy&Fa z0t*wu$-z)(mr@l>3Gw2y2fJFBt z0wJ$Owd?!YQJLFW&NK79TR6o*cf+mRIazgIJ@L!E(i(5v2+z|`Ne=Kg@zn^K#KVL1x@R7aWp-WKu)ngiYoNw`I&l=@mW;J~*ETlf zi&s2+8~>#msrPpLjc1Pu;NNSF)+_+zPW>kI`F$T%khFfg#mD}7Wx0VQ6$!Z~rJXgv z3cH-gIo@lEAan7AlqYlIcGC3Klb2=Y9`3bJEH*vR(-bISrt*`D%GY{~TOTYKp}q_`r_Z<|nC2BfH_d7}ZOf4S?qnYokJkOD%%Gb1j( zVtYfy^dxT`bo{W>_b>94?wa;jCaO z!Sv*ne%gDis@`Ol`olT|iZ|ru`ZDp&+$%zHKQbbZ&Y;s7XqM_4Xs36cTk-tL#jl25 z=Rc9@g3rv&y32<$QTW-t(AUWx5{?QUVnau^~6a_k%hTW^vINrVA|XnD#zR#&pVmuZUwiV zJbQ04Wn4;13YFt;XM|l}U0)p$vON_fH&~u}ImPgXVS-^oin)0SuBi0wj~|+k{CztG zrdv(82h3sQy$1-r^#CpWiReVInFV2WzAb8mUq3MU-2VQtctWOiE3i3me-rmYng$v6)1*b(D`*? 
z)kMu8^UN(0sb_61{hem%Yz&%`OndB^>Y+qR3KI8gH2bm@&Y-t5nl}$v_9rQm61BJ! z;;j{9zGNQu1n~rIBn9z_yUoU4;fU@j5r8k!P1OX{`?g_TDH8~U*nae4e~_QC8{hN? zYsKg83}Om`keC*rw7u5aR$6guS_*%{GLDq)*Z=KPz#hs(e)C3lAn({)=Hm#G;Kaq}KwAPo41!}{JyYd6`w&6s)w*-Z zC%8hOb))I3i0}B!6#A7e+p$v9rA*}(tP46a6JIms)w8P?m9&vbda^z%V)5=Ky}5Zkwd}Dq5k|^cYH#c@*M?uo+iPUT zcATMgvgj3PeC@_u@X!phALk=t5pO0cVqSDj3wg!KYb$%Y`A8lbu*60FvEa$TmiivN zx7|A{(~RyN$9jBgmknQ$eyZxJl7Q4G8Z1p3=MjK6Oe^$Kn9(vDn2}~~YKysi`}(yM zYinz(>YR0aH;US7dF@mAE^npGF`hsLEJy(l^6^QKW~W{7ZqQLdk`rw&YR&XCkvjOIWH9BK{;GGcPrl6+HY=pwF1hjC4OiAK) z-%@%tRO`Xhq0Oy$URsu4uw`xLOZug|_kvUgGUSb%djt2N5`KG3@yN$NfDo54UJ4+u?nvN6roi7G`?eV6;x z6Lz&12V)6{yYE0sS#o)?xESFJRC$LcM0@C)%Y7P;#MJstugQ_fPe19}7Va?&O2^a4 zXwI*kal6(x*EfiT*%c!n#fQurm5+V(+ZZo0sU}PdxJ-AnPVeFl{dmRsyl3^}TqLB{ z&649jcN!ir-z_n8^-IZxdWvS?k+V_uf0xL1mzjkZ;GGY2Zf4&a(NMW{ub)S>43rJU z=4q+H#ly*IMv3z`EAtYIwrgyUxWneR37$s$Oxvj0+{on8HVyTlV)V%FEDv}uF&%!} z?Dj$%B25`ajnZpunxQ&bCY)c}#&%%%fkZi70nRtQgomlsgPuC)D-D@6C+1 zTuwxE3hY~ruKTO4Jyjt{28{bcGF)k z%v%G}@V&KwDQ0o)nh(p=Jz^EV+_&|?V)c8i7*1Hn$2o;Y{OIShVpLVzdQRQAlR7Lj zW5cbCr%BcA^9=3kwzCsU#ni{ljAGyD3X|!zxEe$)Kaa*0Z4XQ@*kO)i21DblZO5U? 
zk6v%aR5*&(lKt=GqgDD%h9t8BXIux`W@FsM8eyAX?aSNF9(cIgww@URPq`TT)$H}| zmQc2U&%}h#@hY=v>r#5cuKd_4>P9CdL6f>Qq0b3YWomTGRoR4DGC5##qfyh=!)YN* zJ7||9h|OnxHX#$f2SqK_Xv%Q`12Mc$2vWT{SnigKbB8QA^Jm$3Dgy-sJ`#QAsrHYcY*Ja02Jl+0P zkoafIS!kM7^pUU8M1%K!f*QM5VXNDUr%YO4MgbSAcwPOll-yCs*MR}o`}OhI+xZ=Y z5Mhccdv-EnU7hXqUyPwG&ujG4=D=YdajA~{v0}}vhx0l z3<+LfTkQ7bHCe`7_sD)MsB+q_4>BvU5lnN5kj>~}*O9AWZnLY=xAi@!=KDu9UFCPy zdC9AKgRkqolu9@KkUz{D%U=t@@r#y0fH5{bBs`){-hA;$!D~T29Y(7|%zBB7=``kAk9$EZ7a(F}%o9Uq+~ZL;U4ys))WHu9-SK)cX$=@A zSIRCJ8ul#KRMBNQ`!Fr2?yf@nqnt zLVu)5ZqGSLdF%IlXW?<}2pw2uBA-$II#cMxIWHf!SY9XNwl^%Z%SzHNYOSkCSQan>x&1L4q{BN8g&+ zXzF3wEHJPox3Dez_Hi?dN%NM0ZH!QOYCKNj zwy+}P=ozDX5!GW+=T*UiU8dDmQa%CobEd=(##o$eImU8jC8!torlKRtJE}EDp%*%On?{@&TsG~!(;^m} ze_qY-lD(as>0se3l-ZA*N6(g^*7SWxmb}$6-5g|op24A*eJcGK-TP_@D6ioVHr;LE z)3O5K2EX@WUsx*cweuVqA8&M9ZjahjT7|Uk_NrDl?2r`%)3jGt$FV|r9eak5e>i6o z=iju6T^D`+o_Q>^QtW1ly4wX;#s%YU9T?f#!*q8@$9*#=zkI`o_z9nfHtRx9)Zatn z@^g^V=XWa+OG+Oamow`41OhDLnf;aBAAJ-Raou-eq@7_l(nozAbpES*=B?Mdfc}1U zrm?k{A-Y9wm(%36IfGZp_xpUfmyGr%EP=pSu8W<+Ngit_l=Jb*^cPBe&d;YN$to4* zxts%E|C}zw$dY|)mgh$6hbX&J`wXbC_X0eAt@+snwzi8B*PV$Pq=Whhf_}mjTAqdN z5vp5=ab&m56b)b9_XcfMFg2VJ#xWYizf6l>vWFf zVLlmU#>t)^vUsKkucup#G&f@V8qePSM34@W0efALK?#FthoKe%TiGks*i`^2oxM#)tG^?eP%%vKez%ZdgLj zY{}JnXfnLa6@6Cw#j^1GcWt?`%z*%<-?VLh>1DAPr)|olx7$kkGurOIO`#zVAFR&>eS;*%y1Ki#s8u(X z4w%>N@;Mt1E;Uv|^_)lFZ8tOpqKhw3Ur-u4BGcc=>7bZx4bTic*wd;e$M9KzFzaur z-X2>o9jtfFp{YHd=`7V9bv3nK((G<<+s&15%0MAxM-kHI0au#->Ym)ri!NPW3$O25 zEA@|M$Jz}Vff*{UYWkQ0$jiI~TxuhD-*bQo^Q#}u8u6-P;N zR{FBTqxQl}e$VCq_-UuhS=~D7E~S9PMX7ixqhU>?t0#(5Y0vTe3W@&9i~bu+2_C07 z6?>oc4?b8h?1}{^9457R_4;w!Gsz5qL8t|d^5Zs09`zHVYF5w){^N@Ps|dzj0yuJ+ z^~H$e`0CmW0KoZwd0B7WHDUi+RonuFq;uGXHwXmXvL$8Cst zi+=-z$cbr2vke&v-m{@*YOj1X(Q1hO@zsoEsAj_I1v#m@BF{iBRVlpQI;n}N&eH^r zFIpk^M-P_odFIisH7`5LTtPQq(mq;T6Fp~TnjmJn>AvwbnQerma#%JN286MTUmxq2 z-*RFqL|5bE>1|Y9{SWLdvxvBi)EGmn?}<=aG7-hM5l+9j)g;KCaRpyuZ6AKOH^>J2 zx3FHh2{~j(wU25i#P5LF?9#z`EHQsqk#S?IuzA3@j~Hsom^03(H)pWdxk@{-bZUM| 
z*)`K)rY!%Nw#Q-q1nU*u3Ompfr`;<)`?x0hyX26^2)7!Tw`kAmj1g)9cJKy{=OEUBWPHDG|P(H3JxPZoCS@sdu zk$YwAmyphVn3*-*DsQVguEdPQ0cBUJMlq;9&dE)}xh5sebWlts)12_JKA}u#lX!iy zO)o71@XOL1cw%(Y5=xS^J@uz$lrP3ih>@teJo^UjX)XK?EujUpm9bn^K|gi0e91X4 z(iSjABEG1nN{@#D{U>#ps`4^>KXvtaRn4*2mT+EYZm!j@?+f%dJUbLmu@Y==W4S{W zi=VFlerQbZ#YmHbU-zk#I{F;ihp-NA1jD&g4hwSMFPHff#NNH4HnmWY=n`5OFT;>y zS7i7`6%W-xeH8}9znt~GMw&iBqwyxOtEMo_8{Rc?^`fMhER8B7(d#}hqmkXWzk^Ee z^L8OKzHH0#1Cf{mS)=qoTNjRc&M9`&`l$b?<@*oFgBD{unj_~P&^qG#vshi%AlyT- zH__BLQc@c$w>3*3w3op`@I7iiRNu8E9%6S}a}Yjsrz(3w2}^mt$Pxs8Mt8=%boc)D z+^#bd1RWdQdkO5&PYigvz_BaCzwCD};0FqDgXtVu#k_2R@X24~ktsQQ+J!SFMFGx3 zE?4*v+8*g_YSVEVqz2#+PX$z^+~x`>WxDA@%_DW_j2nCwSt1lJ(iN99-d=B_Fd zDkTe5u^-<|DCcfpO-3C#$F2ue7vEO4mh?)eHnzoZIAYFCf?O>TSbx)i+nWQ#xkEt>fBrJ91qF+k_wiAb))6Z=>IrVa)Va=5$^# zwBRmkZ!*+xirPbkG4Z?7mIPRA+Bbn}u_=04;SGjpXV=gx>QY|(ID^ARjindDJ|4YQ z!J3TSH-5ct={RQNspO>wxoRm2fOUnm%}|dM#ZGYr3nr-hxMvsxZno3Y>ThRE#$$Ph zlG5S2x^3C?2m_5GS8U`&C7In{@$eayg~C=Mty&25bRu!jCfJ4Q zrY?}p010?%NsB5LttPux>z$c+Dqwi=tIXJxufLrBDbJcSY}-S3R2q33oC&&*)f4r% zBn3+#=jgvh#-0y{=pQvUHs84*>`wa>_u%uk>4}{9SX=Dv_t**lrz|Q)dlor12uwr& zwMWPg6yBzN-YgYGvUL{LBdgI-1;#II>-`vT4{Ay@S-;+J_|$Je?P)dT*L444YuU}w zr09=z3B*_}Yh%Ba+)6Q9U1Gy&!D)qWZ5*s}O6DMnRZ;OGPkXm`S$7_I?eNEvL$e{+ zqQ=>*N63}a0xl%JZ7ZAE)>^%;1*_JiNv?eI7_%}~A{i&8TbJ5eX^kt?Xhpp5+ALYw z7w30K8ct#n0&B*bS9Y40mY=l_GAwFra^nkl(MM^)3gdD$Ll<`EOzO#^kqgg|@#SSL zzKWlV&br!eh(7n_^k}?LsDOD-y|3C7vO0oswRP2glB|GOO0hZ|S$$XUROA%1qKEGG z{)1EX4^IlI^OLG#_Ki5Uuc3(1Pcp8n=hz;-4%$D-IR9^4$IcuZa`;I%(s9TgOLD0H zq#IX)3nq^>3OT^1M2*t-k9);nF(BP>JpT;T>Uds{4)~PAi_K$mBL7ENzv0TL|BnhQ z*f&R*?Vks$P~#SqDQ;|wbxC-Pp>qdywd`M#yT^VZb3RQis=cWP|n}5kb2UWEd zmK@?m#3~iC$BjrFMiwIsjRYlv)CorKDxpgWDe=)On&E;C^Z(2RAkJ+RBM6`6lQJ&~ zy!Dp0YUEPxH@SKaNTaIrc%X@hEMU<~sNhRWl@)+^t2~r5)Q0qs)7A}K;v>4rcIw?< z6F7KX;vb_9XWFE;0V8y!w*2FVVW2vmlfLWP8 zIggUBVi}a>|JBrI`8rEM^&|OLzJsB6 zYa|8Rp8Is54J-mqtu+BOjhQjn$+b?f)W2KCRgQzsjUGFC7ICfwbs2d&B6Up~r{-WR z!CY3BJ_g=;&FN}e)L7P{!|u$XryJv(h8XN$CGHC!_(*d1Y;JOubl0gjJ=<+5Giju* 
z^p*YV3!DJEVi!nXr3^u5p#(LTmtivO+E({kCn0CJu72F z3T58eqB1wlxlu_$W6&_5xQ)|<9vV-L2MnWnb@A7!%CG#`W|F4!hjN75sH~a)Y`ZYz zL@^@+j#Xl_b@EsU+bh%a?)_8h@)}!kl)^VDNB1fE<5b#T!THyKC^lclYwiB;Tl6@1 ztG%YW5)10_SqoQ-9m;l*WS1Mdy(9a!g`WrTpb2NMYIRiK->T-Nku@j256V-q@7VRy zXwkpvQ9VWx__L^V%_4Z}0M4Is$N~BSU;h%|s9*xpXsb|Yn2GrHjK3(#?pP?~YshSK zNCxi7FMeOrW0-v3X=;tv5#{wQnS4eAR4)Pb77@Y}zf%O*>Aw_sE8Bf@?D1xaG>LsU ze}};PNY9$`RI28!w;&Z5_HAyM*+|udlrz74q383-K-r&oXx0r%j~PW#`J-2-dV;?+ z=ZOF5noMyT^9-V&Dcol|5~9Y(Z{q1rXM6%WTKrJPvJeCJfM2_h$ZgJ2lkyGkb$HsqJyx(a6gFnV-_DZuj3G^1Db)e($Fk{zq8FrRqR`gDGtI@V{7#9CT2!8G4%cParo|jqm)=l-ZeNMe&Sp! za!-m4mLE0Cx7Lvmk&ur8v1@^C7gZI z*!$8S&c1``350bo$y(@st*X^M$)|Pe+4(kSYb#!2=;!8dFwdge&i?VPo7g->w`?gS*5K*=-AP%U_%$|FNn*>mEfHzm_Fuikgv&lH5NO zwb;ANarcxhtCDL`o3|=?zxnv6&M#k5&XDCZyYtK^Os%!Z&2P zspfq^~b&h`gF{_pPWMgii^_td9DdbJYHzcI7dKkY5ScO4dQSl(72>-es=sqD8~5=(Ta@ z*v97%UXjWEmnT4|4e%bD-)1h1i-ZtmK{ZHq+uPq2yFd9}Od+}7QPw}&Ijs=dgS=pw zh4DSPihPo}K;25~@J%N00jk&AOyr|kbxSq9K$CZ#X$Me>N@ARdu!{fPQ!1E=(;~Pi zngjZNc8;8}E5|v$XQ&! 
zqMtc=6|xXH0*sLZ!K(&iq8z^ovWCAtgY~7mD(;H$UDDdP86$dY(l{APH!N-4>^)xR zqE{}D4Nsof0MUMRR`#fOk3NuMrh7Wk~m46JL_p+M}l^J`vz%tR$Sw%1(rphzAYJUJ`F z@HBk9)g-E1lK`-#fh{LoZ4;>YEOG@R{d#uWF6bwxdYlN~fy#j7%%a?Q2HVK_HpEn( z{+r1od+qB2LCTWm{zdEE-Q{KE0M=OFwMPob3lmR<$!@F6g4_mwp^$MM9UyU+Es5F6 zy}xiSS5`H1o&@*JHu1I9C6LD8W$DczRl7Qx+U@GrHAF;FTZMS-$LqGo4lB4=s0_q> zzw!KI?9#=beerefB_3;b|L0l%q7nYjkXKeXr4^LJv`t{Uq5!S53eVVWt~de`ERHnD zP?yIiKfxSO8mrs9Q(^&DZJjJ=+an~t3*PkDNdt!K%&DH-=2hhhI{RyKFLq(>6;J*np2`3{ZM{Z@a7g4Cb>Nq zo`J5Y=wUr5DGfAr4qLw};Ra(ISKxuWr^(Qj`A56!(29D_m=!TsdrXln5$eLXZVwGm z)ZT#QgCByMM2U_iumt1m=t!?w1!l-%7n3YwdZwTg0my$RVz`C44z-PR?WFR&|^ z?V5N=DXAH6fB&YWpr)oKwhv0BMhFOgVlW^eK%WmaHI+^76xU_e`#l^yD^|Y#^eg63 zta^o6cj*hU__uwxTPF+!KQlbti#s=!Hsm~)Bd?U_>bfP~?7_UE)XUtuZuq(b=GR-E zkXU~>TY;G92AL;P=rg)jjLgL~;Vk(g(e~)-&9(8sGTy}cggp!H+pmqEXVitobxUav z@Mo8;M5*!MNKb-vUaC7ZVXvl1wWHA z3)mvL^*47Ah$d=oQ;iM7_mI7Ww$y6;maP%-W<6dh&))$}a;=%;kop_g0R7^y)JC_t zLgs3Eu5HIw5f_9=S&h=!+M2&yPa}I129BOg6qHTtmD^172Icfuin@lXkk*`0xHJiI zh8ewbufeu$$(j_8siyzf1i6V(@gH(Vf3ed=Hp{Cu=bb#W-BVK0!Uo?iF;7Y|_kU&v z+3=Gf?wnIzRquX1{ESp@tw^hyeeuiK)|SxV;Xwd8gdQ5l8b;egWE7vlzz-@Q3J|oj zd166k;6o^QiOcUh8RIp>+T>=&&R)QFmZE<8ie`FZ#Zj;RBDQHQ&R(OBFetkK@dlEl zUBo!zns@;_2Gh8eqUOj%ALr$7i52Tjyf|q|(blH(Va!=$_3Mp|iI$619{MrZzUy#! zfI&VHDPqd`g|I8sAdGQV~MHnSf& zqNcS8M-MVD<;5(~FTOmDGQ;9Q59%YS93^u6z}wfzm;nl{mpMTd(Z%t{<7p zG!K>fENw-E0r*vjPruya0hSFEO(Z7Gu=HVh4#1l(K1t~a53dzGT8*XHIeR*BtX@!- zzv>XOA@W0qduqeKsX)6;c2dAc$dKNP$9}hnxkjZYIz=q76CrQYxRu(LMX>QHSX$^) za6K|;Qv}V8_+MDVHCE7U^__5ynIa37V4Qk#GmXvoGtYOslnm_czkb$Z)_8~f;X`cm zC@+gQd?S>N<|d~zEDzJboMVs)A@%0^r!O2v(0A+EIlHH=BP6p;WNaSb1RPr}=ftJD z33@E?%9c;Cm!!IhwlyA(ssk_jS;zc8Bk&Jd2R1D1o=8*CVMX(FidAQ1?;9L7nNnTJ zA4S?~1e{HhkX!o>oap73<)nzVHx2?1CPBH=WJ{3N`T&!m^d5xkp{#;MHu_8)(y}O5M;Z zroOFTkf-WXcJCbwj`nSNizoirH$)TMZbCNZp4ig(LQJ+ZPS`QRVraQ*sw7ZpegT;mmShr?pJJN*5=WSI01g?adcR? 
zujT$QVz5qH!gKe9gvauG8{&z46#F;*KlTQaKZT*|}&bmBDORboY~;~RBgVk?EW9Kyqp_RgL=RZ)U4AL+f<%#*%A zi|)l1*>^$j`}<^EucR1Dj0&Z_-KwIzC?H_ecs_QxWi}-R)Vlq3%&ih~Uo_`aZ+~Va zOgl;v&w66*TZr*brTBInl(3qehfm9hODjp}r0g-q-E>Dcfph#F%&^7EtrG?Z@iU8} z&S!!SrbJH_U(V{UByXQ1u1t*mQjV*A4X^3vf7+;NniqKi+iDlfo5W|nat$xTJb-`J zAyWF|`zBn+4lHs8_xhV$NW1FSl7SI$@%&(Gj(m5sB$f5=ak6X-naM-YOV?&YC(xy& zPHoNf5Amn-_)VMcB=KWW6}z`Ong5=8?mt1CM0{uX$cD-%7B_hin~eJ@j3=y%HCa&^ z&a)!bx5hemIybAIZCV=shAW(|RYE!FMSN@GdF5^8R97ZWBL18hE-SHSsE zg<67|cwCZ<3)AR=rJQO}?26rHHHc5XQpixCEMX#v798+TM zFFgaD2VO|MV+5>y3m{ZWcQ0eV96+d~{4_4(ob@w|;92*-Z8%C%BEekd%}=3$YYl6N z@{cc=yeoC8Nenf+?-$mlXr~1OgP%mH-ldWRIcF^c)tnxf0n^69;G+Eq{$SOknM*t} zW9mh&Nc-(X{Lr_T(%X~ws9jd}&oXQw^AxW35A;_soE}QVjYx=I4j_H$xze2^Zr^b* zB?zIr$$BiNRmT2Q@^ejxila=7%TDtZhGa?bYA$wIhuct|BWK~I862X(93ACsQM{El zEC&6N6>SS0wRBqY`qGy0phycHVh_FVKHm2-KVL zWuEab<9C3x^*ih4Q$OwVMwD3#gigl5q`3Rh)o!f}a+UXT6mZ#k%h1vnRO1zP-Nk}Z zJcsd67XR`wDau`~=zjE5yiR(M+8}L!RN>csYvV@RUmAy`YB_@&p)+Te zMf;#R%@X-*1p{2kT(fh&3 z2tdB;k_*J=jAn6XB>iC2EIWs1SonhMY%io9eAZcQ|M3g4l%Jv8yb>ownm|?m7|j&T zePBm>(PW|GM3e&+U5G!}rBmTe6H8&jjRxm77+dd!XD#De5-au{u4U z+7GU_RO?f!A^qt>yYIvT7VNB%@#~PMc)7lJS#{{>D5ro%Soq;`p>dQ_lyo;Em}Tg! zTwhN1;DcIIx8Nc@H+`|k7QIZq2i0cH-U|LAg0>ysa6k#!xI}%Sr@{1uC0TZN=Sm8! 
zUYJFOmJ4Ksgs~Uo-_3Iesue)kp_li1Dw4yj9GOs|l+7+?C6De@IcYN$(X`SM@7Az; zIDWc|4XKLfP25Uv<5Kz^U|Wn*>^`4uJ%?Nrzn|u90SsWN z%zoJlQu+PctlXEfXJ51)E|c|R!)PzbXK`A{JYNg@nYTE`J(T?3r7WP{uJr~E=4W_P zCeyTmlfbZV$gnx^>H&#G`h5*cV4PAfzPxGG?Yu>@b$+^ zjxihM2zZu)jJL=V^>S0v+Y11>@IG+=#mfF?=XtEEC$(Q6oRU^vEldpWkJD9Vmg0uk z+fyaJzjOPgyc88%40A$s0W`Mr`v~h{ZH;=JEs1Do(pz3H0Oy@Zyn7qqWZE<+qg(s%x&)@E0hEAvqtrw{ZVW~A~N}jfq+_cv{;S3K(mDn)Q)NXHZ-}@ zVA9}g2+SrZRS4A=cYoE-1A5S$0lmMGobJD$i#>?%oB9!yy@w~0DI29cG=X}$xPlRT z8`ApBW@pN%l}rtPT8e z?)JxO+oV|BXnNZxAcd&P$q@28_bDrbavrJL1a!ejYT14V+i1gzoe8E^%}`TD(3>Z% zCGf;H`c~Oay1E<=B$>tUx9)W{esPHok<8CM-L8df>rIE>q70BI(9++QTt5S#f@T!T zGH{%w;W`q8Fdh#j-g4=yW4D=mEvNn{>y!Z1^AmqIBZ?iIPfMG+w8$sNRdww!ih z{2PU6*Qq$2zX_mqW>V$vQx&sTD6}U~jyeI44|eSFz3uHUy|6aDn7T9LKmksXBxjwM zFCgxD3iUx*LT(^7%^dMQUL!I8sOCGF-?VWx^mO_}T5b(Kh!dFD`7Ut5*&L*B3N;i2 z%#(~$Z8`*J@Hla)FS7+I5?zzck0!*Y_qDc#vpYgF%PS`zUm0W=uVvA3-EM z776wCcAycdKA*J=pES$`E6rmWC2q=8enDGwHT1EynJH!hrOvJwnbR3l zh(~~M1zOeLPY}paVTlH%9y{l#9LlJjbbB-ZaY6t0)`h=Ny0TkCwZU8+=2Im_*Jw`L z9$&O+*zS)0{7mdPZtAb#3|62g1Lh{x_NhNm0>3M<9f7nCKCmy>&Exp6N>3oY372mD z3Ho2sf&Yy%^8IW9mq#lQa%?t+cQnAjWq9&VUO8oCe(S>HwF0xESyTlAQe*muH_B2! 
zHSY@acn<%ILh3;~AtJcQ|8xN_?XN!j|3c{gL$K}9SAjHOR$3~l5P7O!In9#yAIld< zUFGQeXD&d9CPTy@*u=6sz+31>v!Ug)rcRw@6GdIJ4*J8f3|6NS>>kTMH}&6E@$dhW z7X^sI;%8Pj{_uXk`}6`2yO@j=xy^X&=3gv)0wA}dD!hN@Wu(}6D~P>atszw8q&T* zIV}P8CS|5ChzCivu4&)fBJ`pn$z& ziw3`_s3AHEGiiP^{em}C{+{j+68vYAos7TW*ZEI>8e!;kOt3+(>L`HU@*^#LqCskA ziebJR$+AiYUbO4VXKPbmBC<&uRSl;i(--QPjZ0WtbixAOS_m*d)gR;T@dtc}E0ll% zuDEg!v2^FYF6xqyT|KAt!KYto>p?Fc>vAgubY)*%; zzuP8*a=E`I0Q`CAOU&No(Id3Sg`6P<#dxj%ezoOm&NsBj_ag-5bV5nf1K7Vz2Nv+OZ*C*#28EOR5R4Zk5r?lGVw2MfYi(4xm{haxFE(Y zSUmckulCzjBM6Gw^nCQc@5-;A+Ud?;@O~Rec*J=$T5(z^>qz}Sbn&g?my~&JVD|Vq z>1(~r2Y5ZTQ~5?4g<9;#-7bqW01b}}J`-a4A5Z$<&$H1p;BxUW>-HX7f3hYIoOG!l zS(U2)Dd_)DK)+r>p64g?-<6}1a@_3#m^R>YongM}ee5(BhsVHImig*%T{%`yp9C`N zaP6-@^B*6e0$*8tEr$DeCdoSs(9=x5PbcplC)liOKU4nZpXI3?CyREPpBnij?fK?o z1zYI1h90j@o$W5s9jxGupS?>(71>u}Q)Afpe53kZv}}AgvlM&yBNL!ti<=zyAeu?$ zP64yr?W4^JlC7JY*=ND{;9!UZX!8wQ>^X^`@3A{j4i09ib%;k}%Uh1irzmeQGA0f3 zo5XKyC?Ciadea*oThZijBN(=R545#xxw5_*yGP5+d-P8B=BWJmE4Dag`dVbOeN3(v z7uTZb7q;vRG#9?4j27h9*CW7fL3lF>&!t*k{(RAk332OCx2Za31IcDd^!J?x)cV-g zyDcKA2V2ovjAb)kTFkz6+@8u`4bES^H$F3irg$>JQ>1gAsDORGT-z}i;dAg|R#3f+^)R=_2gxQvI1Kt{Gv)pHFS432Xk0oe7 zTsm7O|0n*kySK}>w70oLn)Uv~)d$z9&Zgif$hU#Kc<@T7FWx9-@c7er*&(7mo-!|1 zGHkSFP#V4D#tkH3(5T%$Yinhtm2Yp?dpcQGLO)F@o!_@s5*xC+NL+dtK4{92&O zIT0X^rg%6Jw~|N`BQ7n%wnxJ8>C%aJj{+F5jgynn$;TnFeWeFKfLhs2f_Y2*{>Im@ z>5Tam^#%60kLx~~0ZP==G>x;*GBTKi?%%I0FVN9+kGU!3k-H<#WHj_jPwZ+Iu@pcu zehxVLPeoqFAy||H3&Iz3(ElBnVs(l%A=Ms#_Mey9;br@4i4XARZ{bmcV#P-@5k*`S z+Na$=W$@^iD{K1HQ=K&w$W7o%>+gL%9JV>Ei1E4(=Q3^Es`PiJo$p-CH5vXkhScE} zycW}K(xxf>LZ#dketX8$)a`ovLU**&(At0pEnixIliq7%m>eY&-4KOcT$84c;(Fr| z`~O8F)I#N-DE=?j-ZCn#En5Q(?oQBP2?Pl4?v?~7Ah^4`ySoQSkl+wBxVyU*7Tn#P zDxi>8Io+pU-@fnmuQ$g2v&XJcd#|;o&u`8d+6l!;#(bOdl1ah+*Yj8$jY;gD+u?zo zdfjGDd4rJvX$s+QvMJ2gqQ%Nteh;5My~nlVZ5Pko2%8ZA51TtZ?J*6T9L+Ec2dNeT zGQX;Ppdn<|o2EkWeR}%YcJWwihgm{QCOnxq&g1$tz;u7#+L;qR>z28=6aDWyhintv zRl%HK4pIZ>?QoxvYra)fqAaQ<(IE$$I4MzUuZnWMZXBfer&UVXB3dME3j#&GIYXm4 zlhN%JgX8&&#+N0;O@#RqyWv2xj9Sp8?ljj*#jzcTA9x?eLG7% 
z3#@{s*SNBmzT^!>a>lP*Kemc2oQf1P>Y95&Gz!Tq>$?&D!};0?9A zS(@~|)boV^TXUrZ#!;C}!VG3?<9C*Dj6?VHXE;v?l1OsNA@L{c^Wx~G=%W(|MN5tf z6Y{PzH0$$%>Lfq?hKH*$si4b%Yp)EiNy@D<{tI=Qeboc-j97d zX)gT_()p{BHXWgeoj-=+znT=HIehRxpkdQJ15=j&!@v0_ug*aY87+pM^Z(%Q|G0I9 z)?TU-s&W6<+DpnRwH(X+R4r}n|BrdDIR5L-Tesx1FO%BOl#Oz@fYooT9PgedQBdAj zMlu4Tcsz94CaCCFAfnU)MR@#3Z->b*A()Z#{vS~r{XQgWGmaj)p5pogmF>JYAB#Ww zK^v^JV6AnrT_RE7MbfOg^;Dzbj;-m|>sA_fG40+aSR`+kY))VBX$lGtqfBktOs;k1{;UTS zzQpP%9o>sw#$E+b9gmR5=_j*Rs#zV3aQKWYw)k*%vT`tN-pF6JfDhLpQRG%zhaIiH zbGJSBY`S~N&(N9|-OazZcJd5%E^e|xzWc!OltC z^jp^Yd`u*P|MdU5_TU*=cd>-@(8n~owv}pfz9EPi4R(Am6blw6k-YN4is*!VLb-*J zb^r^3MSJHrVmn`?Fce**SMEbAGUu|Zb^xBbttpMPExyZpBRdV9U}C~l)a#Z^$s&)2 zRqrf#k|9T!JdYy8#w-H+gP({t57F70ZiZq(Qmg(7!4?IHh(2v5*&ad=DHkEg{5lj< z%;W#lG<$wEYt|qgF}N*NxQ=-4X)+)ap?Zsc)vfsQtuJ*R#QYfXL6g_7z=kiSrtzvT z>rM!T@vFI*2pa!l1E<03!Ut-PyR?vI1j1L*WKT8EjR?1#3;RI^cr6MuEhkw<4syX3 z%#&P?P4JC=S3>|e4Kys-@5%qt%z>(~Ssi2B)(~2^Bc9c0!BJae@4$h?2JmA$u!Wl} zkRCw(7o54>(0SNS_&DLH{mJWuWBK0(=j33*b}vZLuRN!P%yC#rUAYTg-rI-9z=K6x zgxrma{6?8b_okV@?zo;OJPy9NR}-qH<*0DAk_kK__!3PsAYc$6fY;_l7yKecATnEv zr0sJ_!$Hz>`9}vS)EOvi2_(uF4x2@j{O_W1y&te>9(P!z%w+0le}J%$iN5tfUev33zOBM(QjYmai@0?&9;Xoq zTtp#`@Ke8RW)r3`oUy(9tIMAX4Yrl{;F3@%+WL2cnf+fJPtbE!cU*8Q@(+`KCF1oW58^OftDq&5ETwsJ>Na# z?D#l(v@d!?dptyv&qoi3Jx6EXllhK{Q(t>)#B# z2*!4R&GZwdWnzpst9OBY^@H(uZ1eeaK6i)jgTIi(POVAV zEeNK!(swG3Ek0}SYdOJ?l05)ui>7l=0L=kPyzcg{S=x@koc08pCNaX8n%o}x# zm~XlKo=)|1>c1EU27XDSYfl4p|IAv{YL-5PzcB%A6@- z3WVw!@@Ta%yoWygePD^OOWxYc?3<6lTe7Tb($?087h}*t2mpDgQ+SkM*okd{1Oxl| z)ZrWbyT5lOpo<$><2>Vwvz+HKtTMmMm;JAI-dV{G4{B_@1*3QT=gO>p)gYlpz$a$P z0xa1T2RD)kFMRx~I@<5!;TvINwE=O6{WfdwBQ4KA`1ZOpB<*f5a+mKXS9+E$#P9kr zIuv*k?l+XOcB&MRISX}paowqA;4zHJ$?{&wUb!V-~MP_egL9PTlZj}$b;&z{xj0B{+%{P zR^Kg{g)o?}9zG=IpJ%ojy%(j%bA^Q}>3U9o4C;NN>)r3qu5t)V@3Yc*f@2+8M8 z@wM}}7k$h6`;|$}zr~9FioF=nqTaPmScvY{!M{6?v@m$@;Q$EFZO!d|N9uRxig94U zk}v<$VXG7*JuNzaM!On}S!tf_fkMWv;QdJxt3c5k^L0f>3B*C#h<_#1{2IVSbG&OJ zRFX`dHCsc${i19py+8So+Xrf=tPbY?nPoelXg4ohd$sHbk0ru@>5E5Caq3to#`Q9k 
zQg1#>oXLLN5>oq)#AjM`WduopE%SIpc+~Q|yqsiOaN4hi^`9%Os}DfPtz$A`fn7x7 ztH?BBnY$tT(4bV;zUPV_vF=?()lCmAk2$VQ==qPz#k%qUVV{dIK$ED_^GN#Avdn(Y zv}hcHg-@nEePO$IPjxhj=(K0)hg<6DwU3uco$oih)m>drX-q%u*M88sm!Us!aC`39 zr*=DGAUW{*Erg9y#OaYQvC^FTR5z%?`%nVkf?dpczh;LEQ28 zq%57r(ajzSdW-X1hrI?g@=!eWpE=h`nk*E$!%P$t@ht&gp=n*E4>uQ%%e8z-WrW|* zdMv7j6A_B9G*N6P;Lu)KZVcS|)ttubel-BuIr5>2YsMEsD56uca4OwjqR7Ae=meEI zU304@l%a=4`|pR%%zPKt@AZ-&Uz<_z88@E3b*KqQ3@el*pmn7agVGeFR!ngud2;t^ z-?gb%l=d|bp*KWhlw?B0B{}!U=NffEh>&A`lRQKIX)xv`p~d2AOEM!}p=UG#LYi}_ zMbFDWDM9o%Y;HaM-kI#?5?BXft2;7gWq%i6kCE(P%gKYZx5LFVR$YF~6!&@`fBXPV zl>gDGcv;KK(PFis;STOk{Rcl*32o%hIEK>!(Z8J7a+=B28QMt6FNN=2Z9M;7WVqs1 z2^{HrGJboxIDIXaz`y(Z>-f^7H-AQS%E1S|483A8a?&^;LhjMGwm2~iY77>ndNoty z_Es~_oSm@2Uxs$Ys-!ddMOe0vfyQ=m<+q>qqiYP3#?^oKSbku-I`=;qRj~JDuVCZ; z`=y+E8(-tfFHQGJuKrta|B6}$T0m8No@X`Z-y-U7!Fx)T z(|fne;-fCpJ~LcxyU@x-&vM?xhehxUVfudE0f{QGzto< zT@5Js;a4-&-?mXQcV=9Hw14zaIgNoBJrGQtu#j7e&_KXb@PCDrMxfaM6(V?cAug&c z#Nt28hjt9%JXZ8W0jAiAP&q!OEZtw%evx@e@ga(OvX&-sv#D2ZFda1xCNb>|Vube; z3gZWiVakJIkE8kvW{f{f)Q1nntp)*}*M?w9%gE&p(GY#+)hS}@{8Fa91f2x#TI)cM zo9^&OGzO=E01;;TctR$15<;Ey5|4KIQVga-ugpO0PF(>vwLz96g?E+B+WLv0wY=3q z&Gy-XTf9SOnAZ4F3jwo(C?lJ=7+tyVoek?B!tc$q&j`3>L$>Mf z-6e{4+`|DU3%)BAWRN)W)=r|<4cM62&o5pFKOaA@8!l#_*r_DTCjirxd3Li4?T5^x z@#L#JPpDyU@jddKKl&MOG|jyZKT4CIv4yyqso9yDb5)&Nr&~5s6H^sf#%+Fd(_4dr zQ(j!JO1*Q#XER`tl7z0tJKkL(^~Upw&QbWru_$T@y#(>=|6<2sb`~5Ph?Ae{@FW+S^Bb149+@kO8m&*q{XXS(V>$kT^ z`NSQv$KW3d*v0Vy^+^vm58~+t_2+FM>C_J<)P=%xS#@hgKliY!xRoz|EFb)`bH3vo z)zkVF$-VoK|J`G!=9%RdLS6Vy^420f@wm@xo7t7>4{3xgw1ph5l%lsy{PghE-EuFW_V<>f*2lgS zyyVB-!n%!m(tVUU|FCn_g0VU=o8b}J^xQ5(O_0<>rfY8+b#K?c&>=>@lKstrxxGJh zsSg_Y_w!Hb17Ur79)Pic#R zw%LWRQ2t49~q50^lAvwFq@K&e#sdWgf77$U@UX6rv5H7vgbwO8hi` z32BWsRj^pRP1WY>l@dzn5$&<*IH)pFb$WLS&V}e5^1ZtkMa@_K72iz$Spc)Tx0N~r z_;IuQrxro1pb%hUxx?EnYq6vu^}AwcXJD2+Nb_XF;gaKNHM5r?Q6%Ts&HbRpOK-{D ze*AtFYS74e7wN;df*eA>|dK@*av z(D=(X;&FV+NWkV6cNwf*JDX``VOy9wH$TU$WtVSS=X&-9tgodDe{$0^_oJQH@~&J1`@&3$=EGp4(FeWlr=oXcR3@Lm#ru(( 
zrto}F^%9hZ73bhFfb;wzb^tGttI%YIU4}%78bPdY$-*U{^uV{|s8rATu)=qE73ged z=BlQqrmE(u>Snfkjrf5i&GUK<{A{-d&H@YWXYeePc$+@7d}jg}91T}^YPs?j*-|`Q z7#s^pSeN34=R4Bm7Pt9Cb9=4AB7ej%*1r6^Ur0)+;mMB&r3y=Q)BSo2wo|$(gMWW)lP*#)b98?l_CWzZZDO40C- zDHFCZ`X1*-o8g!L=zM7@V0QBoadJo{)Vf))<84{eG1zwW^x>cEb?BN2OgbE}he;xj zdRr2`us?p)gCEKZ_?EY!hP{6W`MB<9k{mL#Kb%@%zf!xi-?%4OApJ<`<Aai(D z_V=74OPs;;Lj9ROuRR*ShS-Y4eV>wjU!SA#TBxqqUp6D4g)%bI3NvnOc9&dsyYYK4 z<>Jkm?3sjqyqTY~+=14^d|?eoo3+63X6 zeG|ox4&Q0ohc-eFmIMK^VYayVCVx+bFB*mU{Jt3AyWg$M_=A8g^402LTz~Lx*(H~k z2hxBV@?evnaB^{E0P-wnAYBlG*7LBv##OK^k%Ryf*u+KJ|Ec+LM2s+(_T>cbC1zU& z9qe_}M=n@N72_b+t&wp^>bA@fTOHJmIKD1qdRU<++Xmi64>$IZ=QxQLBC^JmUpB0V zEnyx2AOgEQ$%SfWj9AWtb>T@dwi}Ufx&WX35SPJrog&f1tiK^bczg3i58=)CkZbbeD4I;G>PW+`5>@HSEd z?-#zKGh07{M(svt;ta93U3yo7yJN>Yqxohi9#@>u%VcF!wQwNM1HO&-+WwEO=?}jV zWPLqpQ+e>25%-K7$42>Hw%P`V+7AHE z-1m#kd3HJI#4{WyKQkkmqh_v#(&+;V52=R*$phgK-lD$WaT%KKCh>kkW{1~ea@{kd zG;%F--Dk>gMNKGj!Is3dX+(c>R*)tEOy|!W-6Q^JXzc}Qk{4r4owzgVW1I*FAZJN{ zsPb|T4T5^}4z*c4=kX|XvetxSEVTF&x6a=*?$FNb1Vpzt&F4ER(_(gve<;~>=fl$kgmfu~N{L{_5?4i9>(?OAXVtBeBY049l z%KbgkWJ5FJ5J7Ie?K_@0ZxyauS3v>P>_Gro$X#=9tyiI+qi~b(9aDeYSd6~5l?{uI6aT(nK;d!1X~;oI{C`;g;Cnvf4Q z&%5#NDJzgeOCDY_2NQWmDsDl0P$+t(w(uIv8SG*~p_Gx=rD7p(PbOuytA%Cu>FGq- zAH;!7?Pb)6hL3LZ$uGoQq>s^TlXba`*E$L$h75wqp22UmF8STNFf4~FjJ0h~QQNnk zfyI7%DTqeuirK+ZfQy^#O1-u zYE4<5?VesSg(PyuaLRW#nQ4~&u);p1%Q00Ph~FO(Aw-ze@S}Miz{$9dAbrNKI?yqK zuShu+b16@$F}UY^tj0$tBU@ZEjDF`G11O3SXwa+)H|-AEqq*0>MT9t1TL-fHohU!X z_I8}R?7u?}p(Ea=fg4=y9tSU&qY}O5(veNh*!-DDtgh#WD4zHtvTn?mejh>uFXPoL z9AT`t6|>B79ZeH?=i4C?pH!G9T+tKT_J&&HntwoVjpAEJpQqO+T4Xgt-WiRo_)b#k z0*(>>=RGAt;k0Ur@Zy&_hnIq{xk-U{vfZ39i60mx4m%Zi1?dF|wM_P<30ofB2dwiV ztpgCn$vA{H{UT{Bc@M#f$rf1{k?6=#zKjZork|+{luRZt#b-miQly5NJCXPZw)lTC zyLg#c-z)%FY)78LC<(mp<+xFon$%px_X`y#(6WF1k*>TjK)HdtNb@(fDw=Ca#`Wvd zV`f4fgXoK#cJmI}kM|xyr%ptbGb!g@kqQ(H2BZa*1}kCXUDYAm-g|4t3z|ecUzPM$ z4gz)-x*5l&rvrhR8e}nYt3OFtSGjV?&N3 
zH((oc2hh0jGJ38T9f4NPs?(orXEfl|Y;2ao*;ko_0aJQEzX}-JEJanLhlV)BOoSaoB5exrPlGH}nh=HQyO|kWY6My4F#L6R+36qHmSchvqN#MeVU@cBL%qh@JC)YD(b-VciWSIZ zs_K|IhHJ6tSrNJ!8Mo7d<@;XYM$OJ_GY9ifs(Ef;)n-x&Dy5F(s_2+t4Op6~J5js| z$L5qQs;rcXIRE|SS9o=jsl8@H>aHCzBS$+k8p3--RW`UlYvJm#ydTK3RDs{g(|Pyv z+eaAvmhy##9RU$?ie}VTb|pHNlf>CZtSQcj(^a3ZSQ@3tg0IfId2t)QL49eH z>XZg>8L2rT&DbGl8VGG?OEVm1V4)b0gOf8_fl)4A)vs|lb}ou_{uy|#NvH4PWy!+- zp!;np9GAd_T8IRm!vvt1YIKO6K>>6Pqx&v`6JJ`e&^HVhBC6jcn1qs{>EfjHv&T=D zF@U2rFHG3!?NG$AQ`*N4N&!hEJG{CjClghdxY#NjvJA%UFmyVzC68qM7fJ)gp&c(J zO@5{{-5CNCIY(PG7O?sN-(faeD{7}qgSaQJneepii1#$d?KJjh-fGqK_ZvZ!dY%n4 z1$M}t%f4@JaMQ=0`-HMRLl`;M)Z1R}!V`VRg6O=loZ4AYqww1tIyv&kWmBDWS>syA z>LgKEl8?7KeoKnN9|gRsrAsr78#6!*Ycftw`(0<9svA27+gWtEZ;Vi@ZwVPQq-grs zZ)1GIf?h2EG3k?KZN5%vAUHv*(BBVasOq(18?7fF0L1m8^Iz3O({R&!mXsR)SMgnv2 zDcF(QT;y>5@r|xX!lSizwS=SMFbT8Es@mSLM+Uj3eKoDZh|^2{8F<~e_j!85rwI{@ zjHnp}*$F>^6$RS-5jE`Csz6bDmDV8Ut8|*ge)o?fd@K!;>bgsz=iMp{?!xW#INJD$ z(0~mim=KmI)z>o3y9lE04k>)p>)=K-%v5nRIbm4umgtp;b9(2(n^M(%uEp(mO~7z1 zkM6fEFT1gc%PYH~x_s7*ZPz@gweREEz6f5&WUqe1BeYC`aC6fl&aj>6IsGxidQV;o&5uF=TtS&zkE5rf}3I^Y=k!FmtNeD(YW7~c)i>wXkAn?^Q)~>&xJxJz z^7^xauu1P(}fi*YNsm3=rq0Z2x%RG2XG#^>&pyX`UU-gZ2v{OFCk&d60 z@A^&K7=**0vzuTQulumDc8gU{L|##{Wo$!&peYI47FF{Z`!=j<&gVQ~vd4vdafq(tG>=wYNUzLXhA zJ2ePy9nmeh5{S*?zRh|@iWzOmy6L5%Kn?y2cG zLv1jAPas8TM%k4H81x`Qtc(64Xhs8dj8>|2;9uw}szTIq2l(C_>E(r1Dgk;pK4n!` z16s|4FWaUT)ZNMRJ9W^8Mz7e|zYgXV2cLe?cWvE{j4Ltfo}fizIRO>-?4*%8#B`;F zCYxGHntbZ(X2I+pO;ctp4ngXSKkgYTzay3L%_L#J7_bf?*&aLDx-}Vh2lgEug01oT z;RoJ@9U2%$iYIU9EQE1ItFyX_ko*-#^YC<~LNIqj_HLE0te3;ZsLId@l}v*M9?5TMB)%z)6|kLM>gCVSurS~I z#*3H*Il&wu6Tt41I5XZ7zv}S_jVWFr#jdGV(5n<5br5ke$`;-MTac{pYiJ$%0@>mg zUGP~K)z^CUw7whh%Qokr{}NEXo-bXijC^2$<&x>j?8Gii6N^g@y*ROcW?Sf!tpPkN zjM~sQ;yJLGond*T=~qbY9@`SI7dfo6v(k8Zgqvcgc2SbYff=zza&VKC?YxdXsxZld z^aZa=6V-OjZG~boEf+*}Mj4(-S-kju8!#!M=lgVY3+E%JA{ZmuHj%}QeWm+cUL!~_ zsAdNo(=>aoRY^wjDRW?`w>4k*Sm1Vv3YKcEgoFJ}l+IhUeFrbV{=CQ5C{{y9HbGe& zMdII>TIsQ_AMg_Ga$!zjZ2E=swu5dV6rX(}a 
z>k~}*oLKJF{6H!Vg*JKl&??eS5Bru978v;|>wWAp=zQJUdTH6#!=$gJ^?c$LPIrju z>Jk@d=};N8tFB%(UghfIvXHf!3azZ=M8eEa;SV(>$A$_2G;~YaXyafp)L^q{Q>4&t zV{E*g3f!TkOr4jb!3F&;7)_Q8(X*C~wq08>Y;`-k2v;Q9r9vbZsYb#gUz&8-ohTe= z_u_Yk+Dx!8aa0`y__AN7B4N!@M?ceuwC@E)&zDH6RHcY+a<34abF>CeXi7hE{FoQ% zwVrD%tm>^XVU*Y_*g0??{7Ex2v!C=JXKuf1O`n^;LP5rJV!gpBI?M2x;Etbmil;6iuP7)7^p;Oe#{@WBGvJD z+vPEc;`i{2c|`;t9K}9%1#UNI6&2j^C0cuxkR`67H&#d~g_Dg_{^lHYoDgfj-f@tP zeVH}OWRRhaU;!o6Bq{{Iju$ZFII@oF&%<{@3tGZiHzzSJmwEo<<1sS8AeMVFvGX~5V!1uDN%PdJ#aUF%Cw{x zOm~UW`Zj91VpyTV3#I(C_V4U^s8nZH=Jn+;V}s{mQ}*@Bl-5&NkUJ!g+0|`l#=QE` zJbp8P!(vUwS2u3UfT%i^eC=e`rpBSe21 zdrtK;oT_dRa8}EiMI{`+qf-^MXPq_C|NO|5tuBpa17-uxEC0j#C!v{~?3vvRMzVS7 zXm_J9n(Pg4PySrs1KTqD?RKmkyyS`(Du9 zFdk{x*KgvBsTF6Gd2YvhX_XGPj zcUebOZhkDh3!XpA85}jY74&DRMnFQC_sL0$yq#$e^HREY4secKWOR{+MwELZWad!c z>)=F*xUdGY9*0L*0e&T)X0yFw$533;5g7}aJ55yWKIrysNbFj8bdC5%*H zT94!!%*UzT`AkMD^4S@90s3siadRdPJ)zeE{>$1yVrz#Ftm)-yR=Va$&8i51~rnQ>V$ zBI_Dzh_%y)PVVg4yGo3riSwwy8j-`r+R_qduy+2w(7;haItVdp^*|Y%RaYYnERB^P zTNxAt%BDh=P!xF^1x=l^KVQ>{5qD^Ksv2ms<^*=fjq zr8>QDUbxn}%RrwWhQpeLdU=vLZYUuQrz`tp*443Hz=7MaH zkT`#gD-Yl`mVEpN$iq24wX*S9fWH`n>Or;*@8;CJ`u1P=&mA1d zcYU55?7Y=95P+q-g2bKDE)gksln2JnNIH}|O#6;#kRKz3r%zK5Q1(ORLTt%M{-Amu z%k`@~Ynz@JsIaW;OUeSna$f7~fQ)h7F z$!O$*SA$TzY`$@YA8gTb!$LeaR9uxWYcJD|ZUzDk7vxQ|`|rQ}Ea(~jqHqK->f^`j zjetR2PPbhy8xE$nE>rb9DZ4olGUy#GOD3L7QyMh*`MPfV?7fU{#>?A_7e2oCv?l2hEV_yOfM`VyPg@5Uf>o${*y1?O^IA` z-~X|3RH+r!sm5E~PwQeytIrVObDd)I@HxKmx1KEw4UJjsIxX|?@Z`I!y(27K^VxQV zRQf-mrKd`osb~wt)plpYU{R_$+Qju=C|2zI2HFtubAKk8um7Zb zprd1=a|lSj;CHeJ?^l`&JNEp}9rYy=%xn~kSQcL=mozbLd~nN-*VXsEK=8-QX8Nx6 z(7T2Ao@WgOigr@AyC@E;coo3GAI~v6EVd}g*W!Do{l`e(&O|Gr?#WTsD=B*5UALBM zM~BX*YjoQ(>%khA*l8=*lbB3liz$iaOP~ISqJnu8;?v>y3iyIXHN}h%LKS+^T~Rmz z*ga!p$z3<>Gc>Wk`Phf5KC|Pg*h-;PJ;0CN+%GBP7k#coVE1V@b9(npo<+#L{ZFdY z7k-36kjpJAYYO4rGlB`vsmyykjvpgjs2|5X@-{exUqnC1nAwr~OzkYdltR8Xv<)D9p?Rby*`IQ%OW1@6IaI_pmd<$bb z72|tLKqRT`&uSIJPuKmw8Fb$j-;4q;e(b)&qr=mvD1)oSlAV;V?7T%EoKpWK`6>)c 
zDz2|mX+B$NwE&}W#b{25;i*NBCGOSy^JhOVp$P6Q-(~NTf&On0ot3HjdgKUEA?yqG zT-|uaV5e3nli+>YCP|K+g>LuFi@Tuf-&g>4bGm%(v-==8I`~4L#X@ zc=CexCoGCR`NlU?r4ipQo#v*SfjTXgQr^i!W+!YTA~7bob9`JTr#Ro(e)WT@5Cv5G zk5$f2W}3@f^UEf;U2GL>Xce6Q|BTQ@LOJrCfWUcA&J z?Rn}bR6D;lBB@QO(_&MbSHi3%1fu-$y;YhX7g6}vA_m=aIZXM_{FD~(<^iSDG z{Azyc%Qt+8%Y2>v;@psC5v(I~c%<@Lc3*8wnp9101Ldq!VUT=@+Ge!HLNzMM%+o%Z zxZI3%^yf)_H4=A(^%uhOsA0?f!RkUQ*KtTo6a3Ln1lRjZ9id7P8)rl#}RT7KX!(9B`$+wJzEsZlpXH7M_k^dn#x(kq2RPn zTtT9aw;JT-*1zicKj?Z&%u}p!K9a zt-sBok_1CC0CTXDs4P!vNz}%*^erm|3_q4|7+#>i6)?MNAG`3b=r7^sjN_3?!t>nD zR zTW`l>x6&ii{X=!kYHQsIN<7Yk@4dHgXY^n9PH0Ak$E7XINTWT8LdjD zR#Kv!wy?u0nZR;&1GYSJ=Xl{0tG*A^d>U0zWGZkhK9GY-xe=Tkq_03^A=;5C&GvJe z1XupZD27M%KWR@8^6d9r01R}f)pCzUs^hzn)9RlT2wB6!^v699@)UlLB{aCoXkRer z%A!0yyBHDa8YJHiIqa8^NsVL zL`0rgKqd|j-G0_^Q0H~vov2dOMDmLx#L_$8S9=@AqKkuEdNI@gdV`}imOYfw8LEn6 z0UM3;=Z)uO%f{a6bKLwNU*`nl$jt zd2J^zpo6yCe95Z55>+Q9SCY3?;_zD4-v&iiU1pyJglN?w9Bj`(93~9UDY*avfx;uS zde6|=U(&RS>OJ@@YV2uOPVN+bHzn z>>6lMu?4Rxbz2R=)#q#{v`OPy_~IIKb}>VlQT1QK z@Cs5w(?&NkiLLxFm!mFI^dyK}N7>>UJkYft_}omE)NR3(+mc==$fL(!b-(oKbTiea z01?7g7R{{*@0RrA4+%3$2GsuI7NRBSsXG!@-%V3>|KuwED@Js6{EklLMp6 zC$ou>X3+j9iREr@+PIjhnWT5_+2*fZM> z0gu`_wO#`j!S`nR`khbEA=*|Gz`60@I`6LpMO-Oft`ww0^8TG427764>Iw5|KYfyD7zwVt5V=U3U(@3CT=d`eow z7-ze<@<=;oTwz&#(ZVgl4e3fwm%C$I^yl^?(2BP0p&At=$*5Myk}(;SmW3jl65{F! 
ztgKNd5YU@M&W*XFecv-Ji`ejG#D6~~z$9C%Lfl`Uc=to9msfAWDXmNs5z^I}wnkMY zhX}E^2H8rm3Gy%6S8bXvcC(Gf(mvKh>9_0p9rm8bm9k!It3lfI@vnbN?GzHfk2bb!MwDULXl&7u(x5 zaBCPE$|yR>F+hs7IoH4F7)|1GUk!FIkd*(3EDa4m+x?6G+>l(6NP&dLbzf`>neHe zs;B%$W5>>}Z#v!@M%!qfqgyZ#>p7oXp3e|$d3+*`+kORUhiSmIX}9!75inX`5$9yZ!>Ov962 zHybKpe^xtl3M6hbw(uUEOlNh!4ufrtIAbXXxwzw*YvrY%RqIYDg0vRO)6jm16YM@% z&S#d|ulI4*>gIsGjoZ(6F~%b4tZTiLV>E2Edz~5TwZQv*Y0LE$yHFvrfd$}xK>W*s z|3=bA2C?s3B=b<)uKBRfO4ez;(G#p^5nx9mB@{jMK6{+5ZtE{oz+_a{H`pGI~auvo0w~ZgsB!qn8mD zx*^dNr<`H9*+W?S`Qhj)tIiB_a=#wmhP2zBxp~$ZCaDbAww@s4c%c(Q|KK+Xg&3tm z#~pMKO$=jr60N!{y?=gA+`WQbE)#atKahQ_mtY>5vNUnA>NWfpoD_+8H_O7(bRfOS z>}1i}uP=;^Y{*)77&D9+qjmJl?xKZ)w~`t#-c!aSnWF{IYoAw2K(gU@`~COkrt3I{P|PO#=M8c`7%3cF|<(ws`&z;K1G#f&Qk~ zdNVfVFONL7S3u#5YRhi6GI#8_PHi37_0Bu888%9bUpXzhXwMQk~GFng@uSJ2ux%a zEyxu9njb_v6G0$lU5$P2q9^vY?3BW^sb?L;C-J;v$y|RwbBi1|&c3abwtd9(N9lQx z!}lPYSV7^Kjr34BrY|yoRV1o7N;V!5QaTip8SMOpu$9rZU@#lSa6v^_ZOc>48wmI6 z3$d(zt9i*UcU_Oj%oX!ZvUJ8vcm0VUmCkx2UC4<1m-ig@+k$VlZEuDE(d}(4$Zs~B zBgKjKj)#F8SY1TGeMxw~fW7lhM1k8Nkr`AN*f-0oVJ4!qCwy6i*~&I`X}WSff~n%f z!4!$+b{d$Kqg?)G-Da`m%ey_=L_K8Sl*0jMLll8z>|)BfdQOafe?r zpDNxWifny1Pn{Wumt063EzC7;N_kA!z-$_}{IEPff?C`95Aye6v1m<(q!Iy%@!d;O zXGdF?-kl@%qLhq75MCgLtwYNR$l@6v!_iEW>UcQ6WiRUU0HlP)u2q5_Uxq?&<#@{8 zl2?m2kj_Lhfr4x;gOOTfu;aWdC;VMlLa!2U?O}uw=v9B)w8bteKr$Vwp&9W4vDpRq z{Mf8GT*-U9621axt!4xUpFWcRUJMc8B{!-(0ow%cnDeZNVXrK+K{1%XgIW3B#jlV2>SyPdYAv7#=Zh7%I;fV5RlFx1u2mlI;5l|q*JA&Ll{y*YJdR= z=^Q{Bq!j6tZjcy|ln!O+?z}I4_ulni|8;-*vCd+#26)%m`|Nk0v!C-k=PZs2ZK=NR z_j05#zAwU0F*=;YE2%CjHKm8#^$MpV}gP$uj53QY7zXaWaQR z|063I;qt&G3W?g;pK20Vyd7mv(x5KVg1$)7l=CwWrS?6FC*c>EgS|_4=;emV{f)k- zft**!9osidoH_Uf?N&WJGdS<0{+RFQJ@I2m^}2+V?$J^088o!|q$Cg-RjNgLa2^K) zigNP2;BSQXKl0s&MWNd}nDl4A8og{Bv4`XhO8M1~KZp!x1d}H3s;5i>HHEtyiDu~K#~2{sdk9C|%lGjaL@uYkJn$F+ za3eDLN1E$>qNkVt&^#${_)_D@(FJ%XW6cV~u5tJYjHUf#sm+|090o6D5~ZHua))lu zn{r5a&*u5K4reGORQ9(sKL}4a-3@h^J4ZCFWyjz1coP}r)My*6<%8@GQAw}cZY2+w zd+m9`RMX3RbJ@P&<-YAC@gNhuwPZ~+9}Mh6N$8goA{tQdpfx_)&6|f^?Sc(}${h9_ 
zSmKR-YVHt`A}XLqMQ?{NRVY$vq?*2yQ2O*x%f4lp?x+X_ULa-GLBeRBoY3)%(?*9L zQ^>IC{ZhZMkfXKE$ih3AfukxveK{MnexkykH$~3iRKaKXJ>hxgAx_%;;isbDz}KE! zW?BYInTJ;0ug$1ahP7M_^0?;l6sZrqp~f>7WbYEob10fxDk9oEI+wSp zi3)QI*Z_V}GvBg+%yFAL(u}rCnVGBcvSIKH>L4QZuj0cD) z$NwwAc(n0DedXrB_B&x`0k&PPbY|pE9Mr1rHGJ>gfbL|l6`b*NY_{Rj2&9zOglo@v zDu{wz)A2=Dv)d>G<~|mEMPHxjKz9~~58Jvwu*3-7A`@q1c#7FGOq6nulvr_0G1`$+G2louOs2=T z!9Avy$%~j@M`rSY0O}70Ea6)`+qRcLqz2xLFky^TT$fbG*bF%ZN|8G(dF~2qmP)NA zJ#Z;Fv2qjLIeFw=E-!hcl9LB2HmBpmp_beLI1}n>OBPHyXS(lKS&#r@%ed(*sf@IR zvG$6(r%h8=5hI+V$0UPa5@+M(+1uR*G~%Ga23*t)l?qPRh)V8>4=rb&5l+}@=XDW(OX1|6wcTjZL#g#&0^R- zC_G&g*F}!Dp~iSade)#lp>f{**$Kftx^(ZI*B2&YzeKdnpNb;vw0~1^3Zzh}5ftvr zx-t~?Nlfd%B8A1<$4vsG4h78lci9CqzUfwpM@{Z z44*u$CmhJ&1E+QGk+`iNo*+$@TeS)(+P=}EYkYg0Pw}n{4wJ!6-Ti(hFxqfQ_YFPI zfo_PAOVQ&5TO3pSk<(jr8G+mA=(??hd5S9rHPLK>@1(t8U#!(Dyc)-^6ZVoG4u&w6 zrN$>F2Wa#OfL@1uUzC<-%h9V_iA^jtowX9NWySnOZkkr+Of{@rBj3vgZ+MeKwZUKZ zjuc|){DfUZ1xm}!z{+yA^yxsrfR?<9A|@khD$PTg`rQxJkU3Y_J!nxvS1$&-K)SE> z!E0XLpKV`2n;(@E7`{V@3LcrsWLYG|JPdLC)i!p@nTFl4!&+HS;s#qW(+^xO4l_Ea{&AJ|_Q2|N zs6x)M>A_goHR5$(S&S)d2Kl~##9eD>6&0F6lD|q@zhdYo1Hw7G`|$`fjE|yy3n4N| zs&vv%C9@iz5`9X&YCEB@P`yW|pM^$JG3Ft0s)J(nF?TWi-N>7)F5HvQ?z~HeBb+T~ zJvu$dcB@IhL0t?!jfT96>Pn5CAiSQ}p7uzH+HV9z5|4N{7O_g6fo>oXm76MDVi`?R zF`vwo;en=4Qq71QyAD=lpuYK*P=65Xg6a9D73YU?e{rs4J2L7emIMNCNNpXb#w>IrvdsnSOz;2yBON6v^-QnW7qWx< zd4iFmvgobKa%7wjRi#TTz`@_K_@omQYJtL%O+i!Y;=Te6gjDmOo(AAG#K^=u*dBW$|ncsx@)C8l3K+U$`P)Cs0?U zK3jNgp-bxg=?+Yg2*Bf|P|x=KQ5E<5IJrKzY0{aPOFJr?gCOuojnEz)^1*H*Og3k^ zZSpSpNQDsh+|pl>Sn*O8KfyGl1~q}pX2K?)50#-25|!m#W3;>%O?4W!9TtGcTe_IIwYbsumw+k+;an+h`(Zz<%Pgb?k423Y+Qu^f}f?B?Xd zC{rxAGFS0vNrVmiYleBDT|RNJC=v^yJ?_9gcy3POTIp28Z=~-rOxjYcZ=uXI=$@-| zbSS+S{N~Mo$qO9T5Ka5IJzzJpxMy>BXEuf_Se)#EtW?ChZ2p!rOoVW;uq))!!< zHbjWFvinT1qi*9NfHB|l`;9+(;^tn~8s5Pj<=;?#nrg~%GYL==u2}2>ZCOi1BUoNU zUDL{eNK-2eoPiSoVN{_2>64OPIhZq*6^zxwi{6xu`fWOJAyeg517A+jKoRuUeCAWZ z5m5??g%HK`F&!$xL9=40Ed9U)c)`oSy`jJZ?S4)(MT9;R;}m&ypp#J~sgj7}7WjCg 
zkF$qIbmRLv9#H`{ib%wP8ESWt@&sc%98zg66{7^BMz{r6{M+Xs!M-neIOvm&1r73Jf!i=tG631M5li$oKF&S-s*k6 z;)le$1|5?>FFxKx`qx|8OW285o7jRmC4|fZEWBIAt)c_voJtH>HM!O{nMgFKNOP}$ zx){ivP9*?P+CH%#Vdsh&^ThXmc&IAOlTc75FD|M@6(RN>#oH^7wmkxijlKw9BDl=T z!8_dxT+JnKnqOCw6f&JwvRN~%TZM`rtk!b~hG#46_#$g?Yj>56Bp&g;AZl3`wAgd@ zr)=$S;J}0=P@huPELODXnrwBxDGIgQu@Z}K;siiSW&f{Kn2|Y>{=k+)bc+j&uZNeZd$hDhN zs@su^i*aRR290PeyysX!ijvt~U{JOjSjj8+nTqr|OZDvSg_zNFeK>si7Il~+(o`hq zRq_%=QbxQLchfM>(7U}s$Pe;uyYT#N7RmRds;jRiyFaE~eB|>teX-dIHH-~4jSG84 z%72`-^yG_88Cu3D!=!1R6n1{mc1j~~6rIS>iLj3?Z^$UvCagq4ni_&%~cTs--3PHcuz zfuKCwmJ-2zr=Y)Iky83ZP=B+-{ziqI%J-%c?{o&qu(jRMaoWxk;HG>Dkl*gkTm0xOTEoFe+B@`bd%dLS8Q;fON2e*~`2GS{u%X1fZ&Iw)eKVwr9Y`qDypaW@RvO~e8w79FqLV5cOHd%M+B9-xQW>?MErN&##$@OvD9m>n9+;~zJ8xDo zZa1i$wD7cJ3f@lHzvx>OAWbsZZ&xW*Db$3l-@<5?6GbK0!Y#Xq$|>C)@(1U3_gHFc z-lAO?n@i-J$3WLlwq<;-SmbKkVhDY=xVuV@#LEno;e<3b34k_ba^X}-UOwKRuuDZG z6AG0OvehDAM9aUMgB?LhXVA=zrvd)PgHO~tg1`}1BP>%wig)sgteWK$T<9Ps3>oYq zd4|awU%X!@KD~2?+E+nFN}B{`Lz3x?<7>~?3f>ox;dDD|>U`#2iOW~CAhuj9J zQQvO<(5fn2L@%p~Tf46aE}CnzVwM#an)Gb)tANu;lTQkkfi<>YS|*nLU1>1Cld$Gyfhj+bPI-{2@^C zm+njOjZ8qg9@V$6A4L}@7pHLD2jt{gnD4sS%hWmw(I|whE(uEg)YF(>sB~mzreSoH zgFp)^lmbt1CnEPhX95XDqv5+E~b3$EG6pKb#9ts*EPfRRjN)5=m7$RNMiLhFe zRvN%K1I9dUZAR{CO0Y?%|i>Sz80 z{UJ3rgUo^MgUzR*Fceuzt%FFD1Q*ZsNAOQNiMrbsC^24h%MY5(He){O=KgkNmZ>!A-$#0UX618 z!@KHqhEA!w#_0_s-vdd|iq&mdn;fu{;npG2mAVEph8<6RdCW+mJR9+t!o5KZKFFELjTm~&Fq1y;zFl&D$i40=EV`Lk9}BoF?e z|qP=x$eRrbpG=`M+# zBEh3Yi1=e6L|#Zj4Oik+GgE4869eikH9&gWZM)BjF1#}U#VIf{XEqerd2MlK92$@8 z8PJ&t!ccmIM0}A?piMe9;0Kl6xFQ~JZa@e<^gc(Zq8BMW|Jnq5o47=xM9=*OOso4M zVWg#EAVM$n=`ZU0j>V3erAZ2s0_RVXR3~p-PVUx=pLIN{2qiAYaHh$r#=7+BcDhJA zy4W&yP8Qg+$=&cE>gxLh7aZ&so^!+nn`gr9D>O?ENG_z4+bDyxD*W)nq_uYxk4&&G zwv<#RZ-#w4efDKEN|jnBqK$;#?JWjVFkcIIg?+H6W@m99mSeS}X0J6|b*^P@)c$U; zOm6#_JpHqEdqR2g@)5uux~MrBoXj~Cf@|o0xa%7EYQydpos=_Hd1!xQbATgPU!IpT zZJ*R!ZvHA+KF}#m8(ZP|gM$V&!}KQQ{MX1gjlY<%b=f_hh*LB=_+J0agYusnGZExK zfjJ!I@Ob8u4zqa7*S{VS4E%j@BaKcpGPU|`r>a+r@QYu 
zefn7ohP422Ol!_>@E0LX*N`{HCzU+Q|JezT!rZ=W^jYpkfB#gs=A1DGI{ zPV4^40XnURz26INlPT{G{jOrl&ZG2p7TP{=!FTxSPaE}t97RY3StOzgHCdS22Ksf} zbde8^rtIUwq{I9bqq9&m$ddy|l%ltZaEW9 zO-aP|i%E6AVUNF)MDwGQ_pu?W1^_@wqNT^&QTkL$ZS7Xt|5X93=5HHd!vTH*WjiA_ z2P2m($`tiD-`ldn-JS#V^a6a8P%#R2;^wY%if|Dwy8?qm*|m>0kKxj$(fAY9AI5}y zT75oS8V?tVuj1WqGXd%=gT(0E)F*MS#~?C`&t>AzNdunXtU{j*-&6Z0-aIpCA$-YZ zkKkj`gA{S5beSNPn^+SL^B<7e2Nuvl|v-HWqbntCza%1IlJ^qE<`P0+daY|FA$ z5O?4{G%8l78D-)XU`Wg6Qioq+)eK=8O%|&-DdB3daSGM6IYUahQq^_;XCBLlxin^g zoNB%q6EX1=YbIGqZ>%dfh32L9{VTmpa)MYh!Pvm3+Rrb^=yIcg`G)vMcp9ab4sGst zD=zD!OD;ONZ*li{9JRo2Qr74rm~jN_$KYx+JTfCc?ca`kK?z$~c@ZX)BPjmj5wS}` z+GkbyTL=HAW|IskBmw9Z)z=J|wrUkCGHjsoLE>8b1tZgf~q=)>5wE_mlOY%%AvxLv{Xhhp3n{WpiyV zQ476*r}RCg8=W&BHq+;5u!sm;iuv`U@EAyVe;}WfvH)T|vOuN#f9~ynzb^?uu;j*$ zoct#T|G%G;$pSkb*^bs`|GD%3YZojR*x0YqesiAwCzt)VA6v39pC##mIPJiHwCo?F zOSJ*};EJmAw^EP4H60KNmR49>Zp7{C=Wh+J2-v6DyLRjMzqT+2Iujd>+RCzjW1l7# zU>}<>RPFDbkTKQ(5)lV>q9DL6{=0o_fPG3#`8)sQ=l?S<|Mx?1C@|Qvi{HEcY8>zf zN=$%#g+C90^Kbsz511?wFxV^xQ#${~zCyHYrK=^IGeae%iZA;08^f>`^A4Zq!PwIDnG2QnU(w?yVgJt_mU~+C64x$s;v4(7}T}0N(8QZnGRcSIyg=t>-1?`#5}iiqeZ2>P$sg!xo&j3 zoYa^`Ed$@n;_$cIo~w5tZ_TPuY8%B}^`!sXJEabne2SxWkAS9hSw5?5Z1HJ-+mFqcCBDm)3jHqG zlATwoDonOpACfOPyq<^7H9+h8iY0A5PgjQz>kG9#Mf5`*%A0{}IreM|HXhrnTD>HF zSN;-^ac*tdk--RG7H`eewI6TILQWi2t>|7b#bRxeWK(@FtyK#(ENR*=&p`NI_uklH zIcC2h4U^@7Icxqs7Xc*K_?K*KXDiN=mKVog(w#u>`-km9B>MS1cXt6QEfg9P z4TTm_5lAH0)G~7byw}{D4_l;Pla0S=uG)_M+xnNXD2cQFiU>jbNn_~Nm7w4;zy9ja zFlK{yUtP{ZpLn14Tu4jVG?@G5LX&q~+0x3f;z!xVRZYB(V~{a}AMMP|(fqZ$_=@c3 zw<|&Al`J3aq&!;E>rb{i-sg~UisUbu@j4e3vTnD{oOZmkvI=aE8}w_Vl5miFFV&RZ zf{crkIwn&h-M{>;VES`9%_K&3W)Vy9uKGHy?k60ynKvu?))PamPA=jd!@xaSpQR;* zy|2oh(^{{WzA8b=e6&txy*L?2WmxTBowVb8?pCILAu^t zKi$~cIG7k<;YpIicup`Fur$TaY$QXwb0)kw*)*$!W7y<9hEFdF<`0(a+HvAwg&%lb z9kP-ipMEf<2+YSl?$mq-p~o(7Sjdrwrnhzq;1Z|4+Ycj@g=i#J@Foe+{(vfi|nebm*o!mtnUjJ)le%EyUjOlAbVZ2 z+&8};pYD-tgwII{PEjFnA=cK_M;jCR01NlY`qq&^c~gT6I>eS{UmVDd3kR;(KU!Ky z5ZRg}Fg@nz`${_JdLv)Qab!RANspw;sl^*85!Led*Lw 
z2#&GS?h$8mNt`bIIZIC7RC#^fjT5SpjN9>0d&4Y?moc2zZVeDyGmmVLJr_D%~x~QK&gY#^Bs&zd%oa`*&WK>#j z4L>>aZv)9bOvm9XOgihRD$yA%8MacWhqS~$!m9ko;S<$Rlnz6 z;BuF1;iJ|A@@8)YA|>PK`1r@U>&E8laQpGf{Sw`q7QeKMrR>>SC&-KW!ywF2Dwr}N zQ^l}B7c#5q`==#GW)|jXAoufHhBTI#ZriluYByj?>JG>0EJ~YBdrP-upvP_2%Lnkt z&7~>DELC~JLLj`iI;j*v`(4)$r#tDhqVz)O#AFi=OUVAbb8LuW86F;99i2>TD5Wy{ zEQ@mIG7tG;(is0c+b>FlWj@H7W~Bev4+P@EQ_Cb?T$v$&Ewo7%sh5G1#zb!E4fR}c z-lePrv`IgJ! zB~9m{fbxf+lg5WEK%KOx+uUt)lE}#Uy+3(HD7M{Q#aS9ZN9To>YipDB^{V@Z+uMhq z^9}kbpFSczkuM9q`gy};6~Kv3<|JVhs@1Fg9V2>nH0mhJz27JP*evfQPWjiK$ejfC zU*ut(4Zi)KmYT3{Fs94HO@(}^v5kfB%O{q~jD2HxDRufhA^~^=Ob+Ww#~a_Dz5pP> zU#t$t`u65Fb#7m-{^LvkyGGD{mRbCcHJxO_!PFAOmu4sm{q}*bf2Q7dp7T{xC^%Q% zaJ)EH4rK+HiF>f897L-f4gkfBa+!vtXL$Ih^mH#bT#Job8d+vbB2p^8SQZ<*`KGbH zc|3wBfVKaM>!=2T#5W3HOIAK_lGdRO%b0I)^_r_H`F7;B0p~bgXV;i+c0e3WVIYEk7%cZt=k&;7h36iZgW7i!KXRiJ6{bd45?#$#LGnvxe^HuD`dkIM za!Srhv9`E_@muXmaA3*c@s(Dj3fLM`j(}F2ZNd%^{RznbPhE21-X=Y(wgR5lb*6b4 z{HKUFCM7BmH3?=m7E@~fV&^_i-oHMMJEVsZ%CANJh%1>j$AavM?lsh z2e@eiaNjqg*AOEJ2iHzn$H|G_eh#Nn+E(>ui+i2Q(VuIn7Hr~z`tU+j9Lpna{g5`$ zjnVJt{b@viDLk_pYxE1ZY!T?=L&dRU0_Dbkv6MlDm8$8k^(?s_X)#-=ILKj z#cd~B9xAWN|BK3tL7fz+06+Qb-n*^R|JI_pbLan9w*OC8pJU2=egmSBx$WsQy2Q`; z!RiGf-NPDx_8(9aQ~pnZnf_i<04>LcdO4YHweL?g)=({4JkMC|grOBDDk4}?N=R>e zlNC!7f%IH2_|yNDV&0baC`u^+xCv%N!BXhM)T;j@mT*p&sp)g%ls2mLcTW{~0Z(7Q zCSLy+W0fg>JFrn39}RDD);~ANSrUNbf0C%q{4WOgND2_nKn<9WH2>a<^>m<~{o4Fa zQN12cea2_{yR4V#!N7nU0h z81cBO1H_JDv=1rz9K;b*r;|O|&7ZmcJc3|a7@aIzVug(9H);BWeSPuDsW^bw_=PY; z&P=?CDen;^S7|-?Phr6S5GD7Sz)y<_g};MRl1Bho@=4s8M&ple{#|`g#R3r!)UYXH f{V&2>9Ckx1UnIt21q(U31AHmSs>&2g8~gtkogKiE literal 0 HcmV?d00001 diff --git a/examples/aot/matmul_optimization_guide/matmul_optim_guide.md b/examples/aot/matmul_optimization_guide/matmul_optim_guide.md new file mode 100644 index 00000000..510c3d60 --- /dev/null +++ b/examples/aot/matmul_optimization_guide/matmul_optim_guide.md @@ -0,0 +1,337 @@ +# NPU Matmul kernel from scratch -- reaching CANN library performance using 100 lines of Python
(step-by-step optimization guide using PTO-ISA) + +- Date: 2026/03/12 +- Author: Jiawei Zhuang +- Contributor: Filip Skogh, Mirko De Vita, Hyun Min Chang + +# Outline + +- [Motivation](#motivation) +- [Step 0: NPU programming crash course for CUDA/Triton programmers](#step-0-npu-programming-crash-course-for-cudatriton-programmers) + - [Typical kernel launch syntax](#typical-kernel-launch-syntax) + - [Auto vs manual software pipelining](#auto-vs-manual-software-pipelining) +- [Step 1: Functionally-correct naive version](#step-1-functionally-correct-naive-version) +- [Step 2: Double buffering](#step-2-double-buffering) +- [Step 3: "Swizzling" for L2 cache reuse](#step-3-swizzling-for-l2-cache-reuse) +- [Step 4: (optional) Manual software pipelining](#step-4-optional-manual-software-pipelining) +- [Appendix A: PTO-DSL syntax note](#appendix-a-pto-dsl-syntax-note) +- [Appendix B: Using NPU profiler](#appendix-b-using-npu-profiler) + +**To reproduce all results shown in this guide**, see commands in [README.md](./README.md) + +# Motivation + +This guide is the NPU version of "step-by-step matmul optimization", a popular article style for NVIDIA GPUs (e.g. [for A100](https://siboehm.com/articles/22/CUDA-MMM) and [for H100](https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog)), but never written for our NPUs before. + +I intentionally keep the code samples **minimal, hackable, from-scratch, and without magical templates and wrappers**, to make them easier to follow than the more advanced "Matmul optimization practices" [in catlass](https://gitcode.com/cann/catlass/blob/master/docs/contents/advanced/matmul_template_summary.md) or [in AscendC](https://www.hiascend.com/document/detail/zh/canncommercial/850/opdevg/Ascendcopdevg/atlas_ascendc_best_practices_10_10006.html) (which hide optimization tricks behind templates and wrappers). 
+ +We will compare our custom kernel's performance to `torch.matmul`, which invokes [aclnnMatmul](https://www.hiascend.com/document/detail/zh/canncommercial/850/API/aolapi/context/ops-nn/aclnnMatmul.md) (our "cuBLAS" for NPU), internally implemented by [many thousands of lines of AscendC](https://gitcode.com/cann/ops-nn/tree/v8.5.0/matmul/mat_mul_v3/op_kernel). We show step-by-step how to match the performance of such a carefully optimized library, using **only ~100 lines of Python DSL**. + +# Step 0: NPU programming crash course for CUDA/Triton programmers + +(jump to the next section if you have programmed NPU kernels before) + +## Typical kernel launch syntax + +The [SPMD](https://en.wikipedia.org/wiki/Single_program,_multiple_data)-style kernels on NPU look **deceptively similar** to CUDA/Triton kernel syntax: +- The `block_idx` and `block_num` built-in variables assist offset calculations for each core -- [example here](https://github.com/huawei-csl/pto-dsl/blob/7f8176a648c7c4ca03b09bd75f8b615d4bac0eaf/examples/jit/add_dynamic_multicore/run_add.py#L46-L51) +- The CUDA-style `kernel_name<<<block_dim, nullptr, stream>>>(args)` kernel launch -- [example here](https://github.com/huawei-csl/pto-dsl/blob/7f8176a648c7c4ca03b09bd75f8b615d4bac0eaf/examples/aot/add_dynamic_multicore/caller.cpp#L11) + +However, there is an important difference: all NPU kernels are ["persistent kernels"](https://triton-lang.org/main/getting-started/tutorials/09-persistent-matmul.html) in CUDA terminology, i.e. the `block_dim` is forced to be the number of cores instead of growing with the input data size.
+ +Check this [PTO dynamic-shape vector-add example](https://github.com/huawei-csl/pto-dsl/blob/d923ac2ed3c1a2180475c1d279699ea952022e77/examples/jit/add_dynamic_multicore/run_add.py#L46-L100) -- each core calculates its own global memory offsets, and the required number of iterations [depends dynamically on the input data size](https://github.com/huawei-csl/pto-dsl/blob/d923ac2ed3c1a2180475c1d279699ea952022e77/examples/jit/add_dynamic_multicore/run_add.py#L83). This is **unlike** conventional ("non-persistent") CUDA/Triton kernels, where a data-dependent `block_dim` handles the dynamic input size. For example, unlike [Triton vector add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#compute-kernel) that sets `grid = (ceil_div(n_elements, BLOCK_SIZE),)`, most of our NPU kernels (no matter whether they are written in PTO, AscendC, CCE, or other frameworks) always have `grid = (num_cores,)`. + +(A data-dependent large `block_dim` *might* work for simple cases on NPU, but it can often hit bugs during Cube-Vector synchronization, and can also overflow if `block_dim >= 65536` -- a bug [that we fixed](https://github.com/huawei-csl/pto-kernels/pull/39) by switching to persistent-kernel style.) + +## Auto vs manual software pipelining + +Our NPU uses on-chip [scratchpad memory](https://en.wikipedia.org/wiki/Scratchpad_memory) instead of hardware-managed cache, so [data hazards](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Data_hazards) must be avoided by the programmer or software using [set_flag & wait_flag APIs](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0106.html), essentially a [binary-semaphore](https://en.wikipedia.org/wiki/Semaphore_(programming)#Producer%E2%80%93consumer_problem) synchronization mechanism. 
The closest analogy in CUDA is [all the `cp.async` stuff](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/async-copies.html) that needs manual waits. See this [manually synchronized vector-add example](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/demos/torch_jit/add/add_custom.cpp#L78-L115). For complex fused kernels like [FlashAttention](https://github.com/PTO-ISA/pto-isa/tree/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/kernels/manual/common/flash_atten), it can be hard to reason about manual synchronization, software pipelining, and prefetching. + +To solve this headache, [PTO-DSL](https://github.com/huawei-csl/pto-dsl) offers automatic synchronization, internally achieved by the [InsertSync](https://github.com/zhangstevenunity/PTOAS/tree/8eb9e23fa95e18c3db789e0a171a98df07a8a846/lib/PTO/Transforms/InsertSync) compile pass based on the [PTO MLIR dialect](https://github.com/zhangstevenunity/PTOAS/blob/8eb9e23fa95e18c3db789e0a171a98df07a8a846/docs/PTO_IR_manual.md). The kernel code still looks "sequential" (in the pipelining dimension), similar to writing Triton or CuTile code. + +# Step 1: Functionally-correct naive version + +According to our [NPU hardware architecture](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/opdevg/Ascendcopdevg/atlas_ascendc_10_0008.html), a matmul operation requires this movement across the memory hierarchy: +- `GM` (global memory) -> `L1` -> `L0` (`L0A` or `L0B` for left or right operands) -> `Cube core` -> `L0C` -> `GM` + +The on-chip tile size (an algorithm parameter) is bounded by the L1/L0 SRAM size constraint (a hardware parameter). 
The [NPU hardware spec](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/opdevg/Ascendcopdevg/atlas_ascendc_10_0011.html) can be found in files `${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/*.ini` in any CANN-installed environment: + +```bash +grep -A 9 "AICoreSpec" ${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/Ascend910B2.ini +``` + +gives: + +``` +[AICoreSpec] +... +l0_a_size=65536 # 64 KiB +l0_b_size=65536 # 64 KiB +l0_c_size=131072 # 128 KiB +l1_size=524288 # 512 KiB +``` + +Consider the classic [tiled matrix multiplication](https://en.wikipedia.org/wiki/Loop_nest_optimization#Example:_matrix_multiplication) -- a general-shape matmul `C = A @ B` is implemented by tile-level operations over `A_tile = A[i1:i2,k1:k2]`, `B_tile = B[k1:k2,j1:j2]`, `C_tile = C[i1:i2,j1:j2]`, so that each tile fits into SRAM. Given the above SRAM info, we choose the tile sizes as: +- `[128 x 512]` for `A_tile` on `L1`, taking 128 KiB (fp16) +- `[256 x 256]` for `B_tile` on `L1`, taking 128 KiB (fp16) +- `[128 x 64]` for `A_tile` on `L0A`, taking 16 KiB (fp16) +- `[64 x 256]` for `B_tile` on `L0B`, taking 32 KiB (fp16) +- `[128 x 256]` for `C_tile` on `L0C`, taking 128 KiB (fp32 accumulation) +- The Cube unit performs the [`TMATMUL`](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TMATMUL.md) instruction of size `(M, N, K) = (128, 256, 64)`, taking `L0A` and `L0B` as input and `L0C` as output. + +Why choose these tile sizes: +- This is a common tiling choice [in the ATB library's matmul](https://gitcode.com/cann/ascend-transformer-boost/blob/br_release_cann_8.5.0_20260527/src/kernels/kernels/matmul/pp_matmul_f16_kernel/op_kernel/pp_matmul.cce?init=initTree), but many other choices also work as long as they fit into the buffers. +- The Cube unit prefers larger tile sizes for higher FLOPs utilization. For example, 128 x 128 typically achieves higher FLOPs than 32 x 32. 
For the full set of supported matmul shapes and dtypes, see the [`Mmad` instruction](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/ascendcopapi/atlasascendc_api_07_0249.html). +- We still have >=50% space left for `L1`, `L0A`, `L0B`. They are reserved for double-buffering later. + +See [step1_baseline_numpy_sim.py](./step1_baseline_numpy_sim.py) for the full "NumPy emulation code" that explains the algorithm logic. It's the most basic "split-MN matmul", where each core outputs its own `C_tile = C[i1:i2,j1:j2]`. We leave Split-K and Stream-K matmuls for future posts. The key code components are: +- The top-level loop `for li in range(core_loop):` comes from our "persistent kernel" requirement explained in [Typical kernel launch syntax](#typical-kernel-launch-syntax). Instead of having two-level "row and column loops", we bundle them together into a single-level `core_loop = n_loop * m_loop`, where each iteration can be independently assigned to a different core and completes its own `C_tile` calculation. +- Then we only need to accumulate over the inner K-dimension: + - The second-level loop `for k_idx in range(k_dtile_num)` is for "GM - L1 level" iterations. Once the current tile on `L1` is fully consumed by matmul and no longer needed, we load the next tile from `GM`. + - The third-level loop `for phase in range(8):` is for "L1 - L0 level" iterations. Once the current tile on `L0` is fully consumed by matmul and no longer needed, we load the next tile from `L1`. + - Notice that the third-level loop can be **statically unrolled** because we have a fixed ratio between `L1` and `L0` tile sizes. Because `L0` tiles are smaller than `L1` tiles, more than one "L0-level iteration" is required to match each "L1-level iteration". + +Then, we translate this NumPy emulation code into equivalent PTO-DSL code in [step1_baseline.py](./step1_baseline.py) and [common_utils.py](./common_utils.py). 
The PTO code logic largely follows the NumPy emulation, while using NPU-specific data movement and compute APIs: +- Use `pto.load` ([TLOAD](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TLOAD.md)) for `GM`->`L1` load +- Use `tile.extract` ([TEXTRACT](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TEXTRACT.md)) for `L1`->`L0A`, `L1`->`L0B` loads +- Use `tile.matmul`/`tile.matmul_acc` ([TMATMUL](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TMATMUL.md)/[TMATMUL_ACC](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TMATMUL_ACC.md)) for compute on `L0` +- Use `pto.store` ([TSTORE](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TSTORE.md)) for `L0C`->`GM` store +- Use native Python `for i in range()` for statically unrolled loop, and `for i in pto.range()` for run-time dynamic loop. Similarly for `if`/`else` branching. + +More DSL-specific syntax details are explained in [Appendix A: PTO-DSL syntax note](#appendix-a-pto-dsl-syntax-note). + +This simple 80-line PTO kernel produces numerically correct results on NPU, but the performance is only 50% of the `torch.matmul` reference. We will close the gap in the next section. 
+ +![image info](./fig/flops_step1_baseline.png) + +# Step 2: Double buffering + +Profiling our previous kernel with `msprof op simulator`: + +```bash +msprof op simulator --aic-metrics=PipeUtilization \ + --kernel-name="_Z28matmul_kernel_step1_baselinePDhS_S_iii_mix_aic" \ + --output="msprof_res" --launch-count=5 \ + python ./run_matmul.py --variant step1-baseline +``` + +(see [Appendix B: Using NPU profiler](#appendix-b-using-npu-profiler) for more profiler usage details) + +We see that the Cube core is idle for 50% of the time: + +![image info](./fig/pipeline_N1024_baseline.png) + +Double buffering overlaps compute and data transfer: + +![image info](./fig/pipeline_N1024_doublebuf.png) + + +See full code in [./step2_doublebuffer.py](./step2_doublebuffer.py). + +Profile with: + +```bash +msprof op simulator --aic-metrics=PipeUtilization \ + --kernel-name="_Z26matmul_kernel_ABt_autosyncPDhS_S_iii_mix_aic" \ + --output="msprof_res" --launch-count=5 \ + python ./run_matmul.py --variant step2-doublebuffer +``` + +The only difference is that we allocate 2x local buffers for A and B on both `L1` and `L0`: + +```python +a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] +b_l1 = [pto.alloc_tile(tile_buf_b_l1), pto.alloc_tile(tile_buf_b_l1)] +a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] +b_l0 = [pto.alloc_tile(tile_buf_b_l0), pto.alloc_tile(tile_buf_b_l0)] +``` + +and alternate between the "odd" and "even" buffers across iterations. + +Now the FLOPs are doubled for not-so-large matrices: +![image info](./fig/flops_step2_doublebuf.png) + +For large-enough matrices such as 16384x16384, the FLOPs **suddenly drop** because the NPU L2 cache is not large enough to hold the entire matrix, and the data gets evicted from cache. 
+ +We can check the L2 cache size with: + +```bash +grep -A 8 "SoCInfo" ${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/Ascend910B2.ini +``` + +gives: + +``` +[SoCInfo] +ai_core_cnt=24 +cube_core_cnt=24 +vector_core_cnt=48 +ai_cpu_cnt=6 +memory_type= +memory_size=68719476736 # 64 GiB +l2_type=0 +l2_size=201326592 # 192 MiB +``` + +An 8192x8192 matrix (128 MiB in float16) is smaller than L2, but a 16384x16384 matrix (512 MiB in float16) is larger than L2, so we see worse performance. + +For `910B4`, both HBM size and L2 cache size are smaller by half (thus the cache eviction effect happens for smaller matrices): + +```bash +grep -A 8 "SoCInfo" ${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/Ascend910B4.ini +``` + +``` +[SoCInfo] +ai_core_cnt=20 +cube_core_cnt=20 +vector_core_cnt=40 +ai_cpu_cnt=6 +memory_type= +memory_size=34359738368 # 32 GiB +l2_type=0 +l2_size=100663296 # 96 MiB +``` + +# Step 3: "Swizzling" for L2 cache reuse + +Swizzling improves L2 cache reuse across multiple cores. We borrow this figure [from Triton matmul](https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html#l2-cache-optimizations): +Grouped vs row-major ordering (from Triton) + +To read this figure, assume 9 cores computing a subset of the `C` matrix in the first iteration (the yellow area, each number 0 ~ 8 marks the core id). In the naive "row-major ordering", the full matrix B (assume larger than L2 cache!) needs to be loaded from global memory; while in the "grouped ordering", the data traffic w.r.t. global memory is much less. + +[step3_swizzle.py](./step3_swizzle.py) incorporates a 10-line swizzling function `swizzle_nz`, while keeping the rest of the code same as step2. [step3_swizzle_numpy_sim.py](./step3_swizzle_numpy_sim.py) explains the swizzle scheme intuitively.
The swizzle algorithm is one of the algorithms [from catlass](https://gitcode.com/cann/catlass/blob/v1.4.0/include/catlass/gemm/block/block_swizzle.hpp), which also [has a nice explanation](https://gitcode.com/cann/catlass/blob/v1.4.0/docs/contents/advanced/swizzle_explanation.md) +(for GPU experts -- such index remapping is analogous to [the "scheduler" in DeepGEMM](https://github.com/deepseek-ai/DeepGEMM/blob/v2.1.1/deep_gemm/include/deep_gemm/common/scheduler.cuh), which alters data assignment and loop order for each SM) + +With just this 10-line swizzle function, the FLOPs are much improved, reaching ~90% of `torch.matmul`! + +![image info](./fig/flops_step3_swizzle.png) + +To confirm the L2 cache effect, profile cache hit with `msprof op`: + +```bash +msprof op \ + --aic-metrics=Occupancy,Roofline,Default,L2Cache,PipeUtilization,MemoryL0 \ + --kernel-name="_Z26matmul_kernel_ABt_autosyncPDhS_S_iii_mix_aic" \ + --output="msprof_res" --launch-count=5 \ + python ./run_matmul.py --variant step3-swizzle +``` + +For a small 4096x4096 matrix, L2 cache hit is high (97.88%) even without a swizzled loop order: + +cachehit_N4096 + +For a larger 16384x16384 matrix that exceeds L2, L2 cache hit is low (30.9%) without swizzling: + +cachehit_N16384 + +With swizzling, the 16384x16384 case now gets a high (93.72%) L2 hit rate: + +cachehit_N16384_swizzle + + +# Step 4: (optional) Manual software pipelining + +The last 10% performance gap can be squeezed out by manual software pipelining in [./step4_manual_pipelining.py](./step4_manual_pipelining.py). + +![image info](./fig/flops_step4_manual_pipeline.png) + +Even with manual sync, the code only increases from ~100 lines to ~150 lines of Python, still much shorter than library code. How to manually arrange the sync flags is out of scope for this guide. We are [investigating the compile pass](https://github.com/zhangstevenunity/PTOAS/issues/226) so that compiler-inserted sync can eventually reach manual performance. 
+ +# Appendix A: PTO-DSL syntax note + +The current [PTO-DSL package](https://github.com/huawei-csl/pto-dsl/tree/3f0860b1e750f2c4d26a93c6501a212b60196863/ptodsl) is just a very thin wrapper over the [MLIR Python bindings](https://mlir.llvm.org/docs/Bindings/Python/) of the PTO dialect. The entire package has **only ~1000 lines of Python** (you can check by `cd ptodsl && find . -name "*.py" | xargs wc -l`). + +To keep the framework simple during rapid development, we are NOT using Python AST parsing or AST rewriting. Thus, all Python-native constructs (`if`/`for` control flows, Python classes, iterators, etc.) execute like normal Python code. This is unlike other pure-AST (the case for Triton & CuTile) or hybrid AST+tracing (the case for Tilelang & CuteDSL) frontends that *might or might not* rewrite native `if`/`range` as special IR builders (e.g. see the [complex rules for CuteDSL](https://github.com/Dao-AILab/quack/blob/v0.3.2/docs/dsl_control_flow.rst)). The current PTO-DSL frontend is pure Python tracing, most similar to JAX's approach. + +**Users should keep in mind:** run-time dynamic control flows are only available in the `pto` namespace such as `pto.range` (which creates [MLIR structured control flow](https://mlir.llvm.org/docs/Dialects/SCFDialect/) in the IR module), while Python native control flows are evaluated at build time. + +Common cases: + +- **Python `for ... in range(...)`** + - runs before generating the IR (build-time) + - usually acts like compile-time metaprogramming/unrolling +- **`for ... 
in pto.range(...)`** + - emits an MLIR `scf.for` loop + - executes dynamically at kernel run-time +- **Python `if condition:`** + - condition evaluated at build-time by Python + - branch is selected before generating IR +- **`with pto.if_context(cond):` / `pto.cond(...)`** + - emits runtime `scf.if` + - condition is evaluated when kernel runs + +**Example 1: `pto.range` (runtime loop in IR)** + +From `step1_baseline.py`: + +```python +for li in pto.range(bid, core_loop, num_blocks): + ... +``` + +This is **not** Python iteration over integers. In PTO-DSL, `pto.range` is an IR-builder primitive (see `control_flow.py`) that constructs `scf.ForOp` and yields an induction-variable value. + +Practical effect: +- loop trip count depends on runtime values like `bid`, `core_loop`, `num_blocks` +- loop stays as a loop in generated IR (not unrolled by Python) + +**Example 2: Python `range` (build-time unrolling)** + +From `step1_baseline.py`: + +```python +for phase in range(8): + ... +``` + +This loop is executed by Python while building IR, so it typically creates 8 repeated code regions in IR. + +For readers with C++ background: +- this is conceptually similar to compile-time code generation / metaprogramming +- useful when loop bounds are small constants + +**Example 3: Python `if` vs `pto.if_context`** + +From `step1_baseline.py`: + +```python +if phase == 0: + with pto.if_context(is_first_k_tile, has_else=True) as branch: + tile.matmul(a_l0, b_l0, c_l0) + with branch.else_context(): + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) +else: + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) +``` + +How to read this correctly: +- `if phase == 0` is a **Python** branch (build-time), because `phase` is a Python integer from `range(8)`. +- `pto.if_context(is_first_k_tile, ...)` emits a **runtime** branch in IR, because `is_first_k_tile` is a kernel scalar value. 
+ +In plain words: +- first, Python decides which code shape to generate for each unrolled `phase` +- inside that shape, PTO-DSL inserts dynamic control flow for runtime conditions + +# Appendix B: Using NPU profiler + +How to find the kernel name for the `--kernel-name=` argument: first run `msprof op` without `--kernel-name=`, then it will print the kernel name. + +See the [full official doc for msProf](https://www.hiascend.com/document/detail/zh/canncommercial/850/devaids/optool/atlasopdev_16_0082.html). + +For the UI to inspect profiler traces, download with: + +```bash +# Windows x86 +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_win.exe + +# Mac arm and x86 +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_darwin-aarch64.dmg +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_darwin-x86_64.dmg + +# Linux arm and x86 +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_linux-aarch64.zip +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_linux-x86_64.zip +``` + +Those links are obtained from [this CANN download page](https://www.hiascend.com/developer/download/community/result?module=sto). 
diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/run_simple_matmul.py b/examples/aot/matmul_optimization_guide/run_matmul.py similarity index 100% rename from examples/aot/matmul_swizzle/step_by_step_guide/run_simple_matmul.py rename to examples/aot/matmul_optimization_guide/run_matmul.py diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step1_baseline.py b/examples/aot/matmul_optimization_guide/step1_baseline.py similarity index 94% rename from examples/aot/matmul_swizzle/step_by_step_guide/step1_baseline.py rename to examples/aot/matmul_optimization_guide/step1_baseline.py index e16bb91d..34c5541c 100644 --- a/examples/aot/matmul_swizzle/step_by_step_guide/step1_baseline.py +++ b/examples/aot/matmul_optimization_guide/step1_baseline.py @@ -41,10 +41,10 @@ def matmul_kernel_step1_baseline( tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) a_l1 = pto.alloc_tile(tile_buf_a_l1) - b_l1 = pto.alloc_tile(tile_buf_b_l1_256) + b_l1 = pto.alloc_tile(tile_buf_b_l1) a_l0 = pto.alloc_tile(tile_buf_a_l0) - b_l0 = pto.alloc_tile(tile_buf_b_l0_256) - c_l0 = pto.alloc_tile(tile_buf_c_256) + b_l0 = pto.alloc_tile(tile_buf_b_l0) + c_l0 = pto.alloc_tile(tile_buf_c) for li in pto.range(bid, core_loop, num_blocks): m_idx = li // n_loop @@ -72,7 +72,7 @@ def matmul_kernel_step1_baseline( b_half = phase // 4 h_off = const(b_half * K_TILE) sv_b = pto.slice_view( - tile_view_b_256, + tile_view_b, source=tv_b, offsets=[k_offset + h_off, n_offset], sizes=[c_kt, c_nt], @@ -102,7 +102,7 @@ def matmul_kernel_step1_baseline( pto.load(sv_a_next, a_l1) sv_c = pto.slice_view( - tile_view_c_256, + tile_view_c, source=tv_c, offsets=[m_offset, n_offset], sizes=[const(M_TILE), c_nt], diff --git a/examples/aot/matmul_optimization_guide/step1_baseline_numpy_sim.py b/examples/aot/matmul_optimization_guide/step1_baseline_numpy_sim.py new file mode 100644 index 00000000..83be2721 --- /dev/null +++ 
b/examples/aot/matmul_optimization_guide/step1_baseline_numpy_sim.py @@ -0,0 +1,111 @@ +import numpy as np + +M_TILE = 128 +K_QTILE = 64 +K_TILE = 256 +K_DTILE = 512 +N_FULL = 256 + + +def _print_tile_memory(name, arr): + kib = arr.nbytes / 1024 + print(f"[tile-mem] {name}: shape={arr.shape}, dtype={arr.dtype}, bytes={arr.nbytes} ({kib:.1f} KiB)") + + +def step1_numpy_sim(a, b): + """ + a: [m, k] float16/float32 + b: [n, k] float16/float32 + returns c: [m, n], equivalent to a @ b.T + """ + m_total, k_total = a.shape + n_total, k_total_b = b.shape + assert k_total == k_total_b + assert m_total % M_TILE == 0, "Step1 kernel uses full M tiles in this demo." + assert k_total % K_DTILE == 0, "Step1 kernel uses full K_DTILE tiles." + assert n_total % N_FULL == 0, "Tutorial simulation assumes full N tiles." + + # Corresponds to: n_loop, m_loop, core_loop, k_dtile_num + n_loop = (n_total + N_FULL - 1) // N_FULL + m_loop = m_total // M_TILE + core_loop = n_loop * m_loop + k_dtile_num = k_total // K_DTILE + + c = np.zeros((m_total, n_total), dtype=np.float32) + + # Explicit tile-buffer allocation (mirrors pto.alloc_tile in step1_baseline.py). + # Keep shapes fixed to tutorial constants for easy hardware-memory cross-checks. 
+ # a_l1: M_TILE * K_DTILE * sizeof(float16) = 128 * 512 * 2 = 131072 B = 128 KiB + a_l1 = np.empty((M_TILE, K_DTILE), dtype=np.float16) + # b_l1: K_TILE * N_FULL * sizeof(float16) = 256 * 256 * 2 = 131072 B = 128 KiB + b_l1 = np.empty((K_TILE, N_FULL), dtype=np.float16) + # a_l0: M_TILE * K_QTILE * sizeof(float16) = 128 * 64 * 2 = 16384 B = 16 KiB + a_l0 = np.empty((M_TILE, K_QTILE), dtype=np.float16) + # b_l0: K_QTILE * N_FULL * sizeof(float16) = 64 * 256 * 2 = 32768 B = 32 KiB + b_l0 = np.empty((K_QTILE, N_FULL), dtype=np.float16) + # c_tile: M_TILE * N_FULL * sizeof(float32) = 128 * 256 * 4 = 131072 B = 128 KiB + c_tile = np.empty((M_TILE, N_FULL), dtype=np.float32) + + _print_tile_memory("a_l1", a_l1) + _print_tile_memory("b_l1", b_l1) + _print_tile_memory("a_l0", a_l0) + _print_tile_memory("b_l0", b_l0) + _print_tile_memory("c_tile", c_tile) + + # Corresponds to: for li in pto.range(...) + for li in range(core_loop): + # Corresponds to: m_idx = li // n_loop; n_idx = li % n_loop + m_idx = li // n_loop + n_idx = li % n_loop + m_offset = m_idx * M_TILE + n_offset = n_idx * N_FULL + + # Corresponds to tile accumulator c_l0 (reused buffer, reset per output tile). + c_tile.fill(0.0) + + for k_idx in range(k_dtile_num): + k_offset = k_idx * K_DTILE + + # Prefetch A tile for current K chunk (equivalent to pto.load into a_l1). + a_l1[:, :] = a[m_offset : m_offset + M_TILE, k_offset : k_offset + K_DTILE] + + # Corresponds to: for phase in range(8) + for phase in range(8): + # Corresponds to loading one B half tile every 4 phases + if phase % 4 == 0: + b_half = phase // 4 + h_off = b_half * K_TILE + # b_l1 layout is [K_TILE, N_FULL], matching tile_buf_b_l1. 
+ b_l1[:, :] = b[n_offset : n_offset + N_FULL, k_offset + h_off : k_offset + h_off + K_TILE].T + + # Corresponds to extract A/B quarter tiles + a_col = phase * K_QTILE + b_row = (phase % 4) * K_QTILE + a_l0[:, :] = a_l1[:, a_col : a_col + K_QTILE] + b_l0[:, :] = b_l1[b_row : b_row + K_QTILE, :] + + # Emulated tile matmul instruction: + # lhs a_l0: [M_TILE, K_QTILE] = [128, 64], fp16 source + # rhs b_l0: [K_QTILE, N_FULL] = [64, 256], fp16 source + # out c_tile: [M_TILE, N_FULL] = [128, 256], fp32 accumulate + # Keep tile storage in fp16; cast only right at matmul for fp16->fp32 accumulate. + c_tile += a_l0.astype(np.float32) @ b_l0.astype(np.float32) + + c[m_offset : m_offset + M_TILE, n_offset : n_offset + N_FULL] = c_tile + + return c + + +def test_step1_numpy_sim(): + np.random.seed(0) + for m, n, k in [(256, 512, 512), (384, 768, 1024)]: + a = np.random.randn(m, k).astype(np.float16) + b = np.random.randn(n, k).astype(np.float16) + c_ref = a.astype(np.float32) @ b.astype(np.float32).T + c_sim = step1_numpy_sim(a, b) + np.testing.assert_allclose(c_sim, c_ref, rtol=1e-4, atol=1e-3) + print("step1_numpy_sim unit test passed") + + +if __name__ == "__main__": + test_step1_numpy_sim() diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step2_doublebuffer.py b/examples/aot/matmul_optimization_guide/step2_doublebuffer.py similarity index 93% rename from examples/aot/matmul_swizzle/step_by_step_guide/step2_doublebuffer.py rename to examples/aot/matmul_optimization_guide/step2_doublebuffer.py index d7b9ba66..87bd4c03 100644 --- a/examples/aot/matmul_swizzle/step_by_step_guide/step2_doublebuffer.py +++ b/examples/aot/matmul_optimization_guide/step2_doublebuffer.py @@ -50,10 +50,10 @@ def matmul_kernel_ABt_autosync( tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] - b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] + b_l1 = 
[pto.alloc_tile(tile_buf_b_l1), pto.alloc_tile(tile_buf_b_l1)] a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] - b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] - c_l0 = pto.alloc_tile(tile_buf_c_256) + b_l0 = [pto.alloc_tile(tile_buf_b_l0), pto.alloc_tile(tile_buf_b_l0)] + c_l0 = pto.alloc_tile(tile_buf_c) for li in pto.range(bid, core_loop, num_blocks): m_idx = li // n_loop @@ -82,7 +82,7 @@ def run_loop_k(a_curr, a_next): for h in range(2): h_off = const(h * K_TILE) sv_b = pto.slice_view( - tile_view_b_256, + tile_view_b, source=tv_b, offsets=[k_offset + h_off, n_offset], sizes=[c_kt, c_nt], @@ -123,7 +123,7 @@ def run_loop_k(a_curr, a_next): run_loop_k(a_l1[1], a_l1[0]) sv_c = pto.slice_view( - tile_view_c_256, + tile_view_c, source=tv_c, offsets=[m_offset, n_offset], sizes=[const(M_TILE), c_nt], diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle.py b/examples/aot/matmul_optimization_guide/step3_swizzle.py similarity index 94% rename from examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle.py rename to examples/aot/matmul_optimization_guide/step3_swizzle.py index c8f9e102..47d7d598 100644 --- a/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle.py +++ b/examples/aot/matmul_optimization_guide/step3_swizzle.py @@ -53,10 +53,10 @@ def matmul_kernel_ABt_autosync( tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] - b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] + b_l1 = [pto.alloc_tile(tile_buf_b_l1), pto.alloc_tile(tile_buf_b_l1)] a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] - b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] - c_l0 = pto.alloc_tile(tile_buf_c_256) + b_l0 = [pto.alloc_tile(tile_buf_b_l0), pto.alloc_tile(tile_buf_b_l0)] + c_l0 = pto.alloc_tile(tile_buf_c) for li in 
pto.range(bid, core_loop, num_blocks): m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2) @@ -83,7 +83,7 @@ def run_loop_k(a_curr, a_next): for h in range(2): h_off = const(h * K_TILE) sv_b = pto.slice_view( - tile_view_b_256, + tile_view_b, source=tv_b, offsets=[k_offset + h_off, n_offset], sizes=[c_kt, c_nt], @@ -124,7 +124,7 @@ def run_loop_k(a_curr, a_next): run_loop_k(a_l1[1], a_l1[0]) sv_c = pto.slice_view( - tile_view_c_256, + tile_view_c, source=tv_c, offsets=[m_offset, n_offset], sizes=[const(M_TILE), c_nt], diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle_numpy_sim.py b/examples/aot/matmul_optimization_guide/step3_swizzle_numpy_sim.py similarity index 100% rename from examples/aot/matmul_swizzle/step_by_step_guide/step3_swizzle_numpy_sim.py rename to examples/aot/matmul_optimization_guide/step3_swizzle_numpy_sim.py diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/step4_manual_pipelining.py b/examples/aot/matmul_optimization_guide/step4_manual_pipelining.py similarity index 95% rename from examples/aot/matmul_swizzle/step_by_step_guide/step4_manual_pipelining.py rename to examples/aot/matmul_optimization_guide/step4_manual_pipelining.py index 38033523..22d5f6e3 100644 --- a/examples/aot/matmul_swizzle/step_by_step_guide/step4_manual_pipelining.py +++ b/examples/aot/matmul_optimization_guide/step4_manual_pipelining.py @@ -53,10 +53,10 @@ def matmul_kernel_ABt( tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] - b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] + b_l1 = [pto.alloc_tile(tile_buf_b_l1), pto.alloc_tile(tile_buf_b_l1)] a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] - b_l0 = [pto.alloc_tile(tile_buf_b_l0_256), pto.alloc_tile(tile_buf_b_l0_256)] - c_l0 = pto.alloc_tile(tile_buf_c_256) + b_l0 = [pto.alloc_tile(tile_buf_b_l0), 
pto.alloc_tile(tile_buf_b_l0)] + c_l0 = pto.alloc_tile(tile_buf_c) pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1, 2, 3]) @@ -93,7 +93,7 @@ def run_loop_k(curr_id, next_id, a_curr, a_next): b_evt = 2 + h h_off = const(h * K_TILE) sv_b = pto.slice_view( - tile_view_b_256, + tile_view_b, source=tv_b, offsets=[k_offset + h_off, n_offset], sizes=[c_kt, c_nt], @@ -156,7 +156,7 @@ def run_loop_k(curr_id, next_id, a_curr, a_next): run_loop_k(1, 0, a_l1[1], a_l1[0]) sv_c = pto.slice_view( - tile_view_c_256, + tile_view_c, source=tv_c, offsets=[m_offset, n_offset], sizes=[const(M_TILE), c_nt], diff --git a/examples/aot/matmul_swizzle/.gitignore b/examples/aot/matmul_swizzle/.gitignore deleted file mode 100644 index baa7820a..00000000 --- a/examples/aot/matmul_swizzle/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -matmul.cpp -matmul.pto -outputs diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/.gitignore b/examples/aot/matmul_swizzle/step_by_step_guide/.gitignore deleted file mode 100644 index 52af4f6e..00000000 --- a/examples/aot/matmul_swizzle/step_by_step_guide/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -simple_matmul_manual_sync.pto -simple_matmul_auto_sync.pto -simple_matmul_manual_sync.cpp -simple_matmul_auto_sync.cpp diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/optimization_guide.md b/examples/aot/matmul_swizzle/step_by_step_guide/optimization_guide.md deleted file mode 100644 index 34ba6996..00000000 --- a/examples/aot/matmul_swizzle/step_by_step_guide/optimization_guide.md +++ /dev/null @@ -1,489 +0,0 @@ -# PTO DSL Matmul Optimization Guide (4 Steps) - -This tutorial walks through a practical optimization path for dynamic-shape matmul on NPU using the ptodsl framework. - -The key idea: **keep correctness fixed**, then change only one optimization dimension at a time so each speedup is easy to understand and measure. 
- ---- - -## 1) Mental Model: What each step changes - -- **Step1 (`step1_baseline.py`)**: functionally correct baseline; simple tile order; single L1 buffers. -- **Step2 (`step2_doublebuffer.py`)**: add double-buffering for A/B tiles (overlap data movement with compute), still linear tile order. -- **Step3 (`step3_swizzle.py`)**: keep double-buffering and add swizzled tile traversal to improve access/balance patterns. -- **Step4 (`step4_manual_pipelining.py`)**: keep step3 algorithm but replace compiler auto-sync with explicit event-driven software pipeline. - ---- - -## 2) Shared Building Blocks (`common_utils.py`) - -All steps reuse the same tile sizes, metadata, and swizzle helper. - -### Why shared utilities matter - -- Keeps step diffs focused on optimization logic. -- Reduces accidental config drift across kernels. -- Makes benchmarking comparisons fair. - -### Key shared code - -```python -M_TILE = 128 -K_QTILE = 64 -K_TILE = 256 -K_DTILE = 512 -N_FULL = 256 -SWIZZLE_COUNT = 5 -``` - -```python -def build_meta_data(): - def meta_data(): - dtype = pto.float16 - acc_dtype = pto.float32 - ptr_type = pto.PtrType(dtype) - i32 = pto.int32 - tv_2d = pto.TensorType(rank=2, dtype=dtype) - ... -``` - -```python -def swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2): - tile_block_loop = (n_loop + c_swizzle_m1) // c_swizzle - tile_block_span = c_swizzle * m_loop - tile_block_idx = li // tile_block_span - ... - m_idx = s.select(odd_block, flipped_m_idx, m_idx) - return m_idx, n_idx -``` - -If `swizzle_nz` looks confusing: think of it as remapping linear tile index `li` into a 2D `(m_idx, n_idx)` traversal order that improves behavior compared with pure row-major tile walking. - ---- - -## 3) Step1 Baseline: Correctness-first kernel - -File: `step1_baseline.py` - -### Algorithm behavior - -- Dynamic shape support from runtime `(m, n, k)` parameters. 
-- Tiles are visited in plain linear order: - - `m_idx = li // n_loop` - - `n_idx = li % n_loop` -- One L1 tile for A and one L1 tile for B (no ping-pong buffers). -- No explicit pipeline/event synchronization. - -### Important code - -```python -for li in pto.range(bid, core_loop, num_blocks): - m_idx = li // n_loop - n_idx = li % n_loop - m_offset = m_idx * c128 - n_offset = n_idx * c256 -``` - -```python -a_l1 = pto.alloc_tile(tile_buf_a_l1) -b_l1 = pto.alloc_tile(tile_buf_b_l1_256) -... -pto.load(sv_a0, a_l1) -... -pto.load(sv_b, b_l1) -``` - -```python -if phase == 0: - with pto.if_context(is_first_k_tile, has_else=True) as branch: - tile.matmul(a_l0, b_l0, c_l0) - with branch.else_context(): - tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) -else: - tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) -``` - -### Why this is the baseline - -It is easy to reason about and debug. Every later step should preserve this numerical result. - -### NumPy simulation of Step1 (algorithm teaching version) - -The full code is in `step1_numpy_sim.py`. 
- -Run it directly: - -```bash -python ./step1_numpy_sim.py -``` - -### Line-by-line mapping to `step1_baseline.py` - -- **Loop space construction** - - NumPy: `n_loop`, `m_loop`, `core_loop`, `k_dtile_num` - - ptodsl: same scalar setup in `step1_baseline.py` -- **Core tile traversal** - - NumPy: `for li in range(core_loop)` - - ptodsl: `for li in pto.range(bid, core_loop, num_blocks)` -- **Tile index mapping** - - NumPy: `m_idx = li // n_loop`, `n_idx = li % n_loop` - - ptodsl: same formulas -- **K loop** - - NumPy: `for k_idx in range(k_dtile_num)` - - ptodsl: `for k_idx in pto.range(c0, k_dtile_num, c1)` -- **Phase loop (build-time unrolled in ptodsl)** - - NumPy: `for phase in range(8)` - - ptodsl: same Python loop, used for static unrolling in IR build -- **First-accumulate logic** - - NumPy: `if phase == 0 and is_first_k_tile: c_tile = prod else: c_tile += prod` - - ptodsl: `if phase == 0` + `pto.if_context(is_first_k_tile, has_else=True)` with `matmul` / `matmul_acc` - -### Why `b_l0.T` is needed (and how it maps to ptodsl) - -In this tutorial, `b` is stored as shape `[n, k]`, while `a` is `[m, k]`. - -- NumPy quarter tile: - - `a_l0` shape is `[M_TILE, K_QTILE]` - - `b_l0` shape is `[N_FULL, K_QTILE]` -- To compute output tile `[M_TILE, N_FULL]`, we need: - - `[M_TILE, K_QTILE] @ [K_QTILE, N_FULL]` - - therefore NumPy uses `a_l0 @ b_l0.T` - -In ptodsl, this transpose handling is embedded by the tensor/view layout settings and tile ops: -- `tv_b` is created with `layout="DN"` in `step1_baseline.py` -- `tile.extract(...)` and `tile.matmul(...)` then consume B in the expected orientation for GEMM - -So `b_l0.T` in NumPy is the explicit equivalent of what ptodsl layout + tile pipeline already encodes implicitly. - -### Why accumulate in `float32` - -The original kernel metadata sets: -- input dtype: `float16` -- accumulator dtype: `float32` - -That is why the NumPy simulation casts tile loads to `float32` and keeps `c_tile`/`c` as `float32`. 
This mirrors: -- `acc_dtype = pto.float32` -- `tile_buf_c_256` using `acc_dtype` - -Using float32 accumulation is important for numerical stability across many partial products (especially large K). - ---- - -## 4) Step2 Double-buffer: overlap movement and compute - -File: `step2_doublebuffer.py` - -### Algorithm delta from Step1 - -- Change single buffers into ping-pong buffers: - - `a_l1 = [buf0, buf1]` - - `b_l1 = [buf0, buf1]` -- Keep tile traversal **non-swizzled** (same simple `m_idx/n_idx` as baseline). -- Keep autosync flow (no explicit manual event schedule in source). - -### Important code - -```python -a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] -b_l1 = [pto.alloc_tile(tile_buf_b_l1_256), pto.alloc_tile(tile_buf_b_l1_256)] -... -is_curr0 = (k_idx % c2) == c0 -with pto.if_context(is_curr0, has_else=True) as branch: - run_loop_k(a_l1[0], a_l1[1]) -with branch.else_context(): - run_loop_k(a_l1[1], a_l1[0]) -``` - -```python -def run_loop_k(a_curr, a_next): - ... - tile.extract(a_curr, c0, a_col, a_l0[ping]) - ... - with pto.if_context(k_idx + c1 < k_dtile_num): - ... - pto.load(sv_a_next, a_next) -``` - -### Why this tends to speed up - -While compute is consuming `a_curr`, the next tile can be prepared into `a_next`, reducing pipeline bubbles. - -Tiny timeline sketch (conceptual): - -```text -Step2 (double-buffer, auto-sync) -time ----> -Load A/B buf0: [====] -Compute buf0: [========] -Load A/B buf1: [====] -Compute buf1: [========] -``` - ---- - -## 5) Step3 Swizzle: improve tile traversal pattern - -File: `step3_swizzle.py` - -### Algorithm delta from Step2 - -- Keep same double-buffer kernel structure. -- Only change the mapping from linear loop index `li` to tile coordinates `(m_idx, n_idx)`: - - from linear mapping - - to `swizzle_nz(...)` mapping - -### Important code - -```python -c_swizzle = const(SWIZZLE_COUNT) -c_swizzle_m1 = c_swizzle - c1 -... 
-m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2) -``` - -Everything else (double-buffer loop body) stays essentially the same as Step2, which makes Step2 -> Step3 comparison clean. - -### Intuition for new users - -Swizzling is **not changing math**, only **work scheduling order**. On NPUs, scheduling order can strongly affect memory traffic and utilization. - -### NumPy swizzle mapping demo - -To make swizzle behavior concrete, use `step3_swizzle_numpy_sim.py`. -It prints tile index mapping before/after swizzle for several swizzle factors. - -```bash -python ./step3_swizzle_numpy_sim.py -``` - -Example output format: - -```text -=== swizzle=5, m_loop=4, n_loop=7, core_loop=28 === -li | linear(m,n) -> swizzle(m,n) - 0 | ( 0, 0) -> ( 0, 0) - 1 | ( 0, 1) -> ( 0, 1) - 2 | ( 0, 2) -> ( 0, 2) - ... -``` - -Interpretation: -- `linear(m,n)` is the baseline order (`m_idx = li // n_loop`, `n_idx = li % n_loop`). -- `swizzle(m,n)` is the remapped order used by `swizzle_nz(...)`. -- As you vary `c_swizzle` (2, 3, 5), you can see how traversal shape and direction change, especially near N-tail blocks. -- The script also prints 2D order grids: - - `linear_order_grid[m, n] = li` in baseline traversal - - `swizzle_order_grid[m, n] = li` in swizzled traversal - This gives an intuitive “heatmap-like” view of where each tile is visited in time. - ---- - -## 6) Step4 Manual Pipelining: explicit software schedule - -File: `step4_manual_pipelining.py` - -### Algorithm delta from Step3 - -- Keep swizzled traversal and double-buffer dataflow. 
-- Switch from autosync-style source to explicit event orchestration: - - `record_event(...)` - - `wait_event(...)` - - `record_wait_pair(...)` - -### Important code - -```python -pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) -pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1, 2, 3]) -``` - -```python -pto.wait_event("MOV_M2L", "LOAD", event_id=b_evt) -pto.load(sv_b, b_l1[h]) -pto.record_event("LOAD", "MOV_M2L", event_id=b_evt) -``` - -```python -pto.wait_event("MOV_M2L", "MATMUL", event_id=0) -... -pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) -pto.store(c_l0, sv_c) -``` - -### Why this can help - -Manual scheduling gives tighter control over producer-consumer ordering and overlap. It often improves tail behavior and removes conservative compiler sync points. - -Tiny timeline sketch (conceptual): - -```text -Step4 (manual pipeline with explicit events) -time ----> -LOAD ----record----> MOV_M2L ----record----> MATMUL ----record----> STORE - ^ | | | - |------ wait ----------+------ wait -----------+------ wait --------+ -``` - ---- - -## 7) Build and Run - -### Build all 4 steps - -```bash -bash ./compile.sh -``` - -Artifacts are generated in `build_artifacts/`: -- `step1_baseline_kernel.so` -- `step2_doublebuffer_kernel.so` -- `step3_swizzle_kernel.so` -- `step4_manual_pipelining_kernel.so` - -### Validate correctness - -```bash -python ./run_simple_matmul.py -``` - -Run one step only: - -```bash -python ./run_simple_matmul.py --variant step1-baseline -python ./run_simple_matmul.py --variant step2-doublebuffer -python ./run_simple_matmul.py --variant step3-swizzle -python ./run_simple_matmul.py --variant step4-manual-pipelining -``` - -### Run stepwise benchmark - -```bash -python ./bench_matmul.py -``` - ---- - -## 8) Interpreting benchmark ratios - -The benchmark prints three ratio groups: - -1. **Step1 ratio**: `step2 / step1` - - isolates gain from double-buffering. -2. **Step2 ratio**: `step3 / step2` - - isolates gain from swizzle. -3. 
**Step3 ratio**: `step4 / step3` - - isolates gain from manual software pipelining. - -Reference result: - -```text -=== Summary === -Step1 (double-buffer speedup, both non-swizzle auto-sync): -avg FLOP ratio(double_noswizzle_auto/single_noswizzle): 1.607x -min FLOP ratio(double_noswizzle_auto/single_noswizzle): 0.943x -max FLOP ratio(double_noswizzle_auto/single_noswizzle): 1.826x -Step2 (swizzle speedup, both double-buffer auto-sync): -avg FLOP ratio(double_swizzle_auto/double_noswizzle_auto): 1.227x -min FLOP ratio(double_swizzle_auto/double_noswizzle_auto): 0.863x -max FLOP ratio(double_swizzle_auto/double_noswizzle_auto): 1.871x -Step3 (manual-sync speedup, both double-buffer swizzle): -avg FLOP ratio(double_swizzle_manual/double_swizzle_auto): 1.100x -min FLOP ratio(double_swizzle_manual/double_swizzle_auto): 1.001x -max FLOP ratio(double_swizzle_manual/double_swizzle_auto): 1.173x -``` - ---- - -## 9) Suggested learning path - -- First, run `step1` only and inspect correctness outputs. -- Next, compare `step1` vs `step2` source side by side, focusing on buffer allocation and `run_loop_k`. -- Then inspect only the index mapping change from `step2` to `step3`. -- Finally, study `step4` event dependencies as a timeline (LOAD -> MOV_M2L -> MATMUL -> STORE). - -If you keep this one-change-per-step mindset, it becomes much easier to learn NPU kernel optimization systematically. - ---- - -## Appendix A) ptodsl Syntax for Python Users - -If you are new to ptodsl, the biggest source of confusion is: -- some syntax is **Python control flow** -- some syntax is **IR-builder control flow** - -They look similar, but they execute at different times. - -### Build-time vs run-time cheat sheet - -- **Python `for ... in range(...)`** - - runs when generating the IR (build-time) - - usually acts like compile-time metaprogramming/unrolling -- **`for ... 
in pto.range(...)`** - - emits an MLIR `scf.for` loop - - executes dynamically at kernel run-time -- **Python `if condition:`** - - condition evaluated at build-time by Python - - branch is selected while generating IR -- **`with pto.if_context(cond):` / `pto.cond(...)`** - - emits runtime `scf.if` - - condition is evaluated when kernel runs - -### Example 1: `pto.range` (runtime loop in IR) - -From `step1_baseline.py`: - -```python -for li in pto.range(bid, core_loop, num_blocks): - ... -``` - -This is **not** Python iteration over integers. In ptodsl, `pto.range` is an IR-builder primitive (see `control_flow.py`) that constructs `scf.ForOp` and yields an induction variable value. - -Practical effect: -- loop trip count depends on runtime values like `bid`, `core_loop`, `num_blocks` -- loop stays as a loop in generated IR (not unrolled by Python) - -### Example 2: Python `range` (build-time unrolling) - -From `step1_baseline.py`: - -```python -for phase in range(8): - ... -``` - -This loop is executed by Python while building IR, so it typically creates 8 repeated code regions in IR. - -For readers with C++ background: -- this is conceptually similar to compile-time code generation / metaprogramming -- useful when loop bounds are small constants - -### Example 3: Python `if` vs `pto.if_context` - -From `step1_baseline.py`: - -```python -if phase == 0: - with pto.if_context(is_first_k_tile, has_else=True) as branch: - tile.matmul(a_l0, b_l0, c_l0) - with branch.else_context(): - tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) -else: - tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) -``` - -How to read this correctly: -- `if phase == 0` is a **Python** branch (build-time), because `phase` is a Python integer from `range(8)`. -- `pto.if_context(is_first_k_tile, ...)` emits a **runtime** branch in IR, because `is_first_k_tile` is a kernel scalar value. 
- -In plain words: -- first, Python decides which code shape to generate for each unrolled `phase` -- inside that shape, ptodsl inserts dynamic control flow for runtime conditions - -### Rule of thumb - -When in doubt, ask: -1. Is this condition/index a Python value (`int`, `bool`)? - - then it is build-time. -2. Is this a ptodsl scalar/value (`s.*`, kernel arg-derived)? - - then use ptodsl control flow (`pto.range`, `pto.if_context`, `pto.cond`) for runtime behavior. diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/simple_matmul_builder.py b/examples/aot/matmul_swizzle/step_by_step_guide/simple_matmul_builder.py deleted file mode 100644 index f3f2fe97..00000000 --- a/examples/aot/matmul_swizzle/step_by_step_guide/simple_matmul_builder.py +++ /dev/null @@ -1,26 +0,0 @@ -import argparse - -from step2_doublebuffer import build as build_step2 -from step3_swizzle import build as build_step3 -from step4_manual_pipelining import build as build_step4 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--manual-sync", - action="store_true", - help="Emit explicit record/wait events instead of relying on auto sync insertion.", - ) - parser.add_argument( - "--disable-swizzle", - action="store_true", - help="Emit step2 (double-buffer only) instead of swizzled versions.", - ) - args = parser.parse_args() - if args.manual_sync: - print(build_step4()) - elif args.disable_swizzle: - print(build_step2()) - else: - print(build_step3()) diff --git a/examples/aot/matmul_swizzle/step_by_step_guide/single_buffer_matmul.py b/examples/aot/matmul_swizzle/step_by_step_guide/single_buffer_matmul.py deleted file mode 100644 index ef496abb..00000000 --- a/examples/aot/matmul_swizzle/step_by_step_guide/single_buffer_matmul.py +++ /dev/null @@ -1,9 +0,0 @@ -import argparse - -from step1_baseline import build - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - _ = parser.parse_args() - print(build()) diff --git 
a/examples/aot/matmul_swizzle/step_by_step_guide/step1_numpy_sim.py b/examples/aot/matmul_swizzle/step_by_step_guide/step1_numpy_sim.py deleted file mode 100644 index 2b5d02ab..00000000 --- a/examples/aot/matmul_swizzle/step_by_step_guide/step1_numpy_sim.py +++ /dev/null @@ -1,92 +0,0 @@ -import numpy as np - -M_TILE = 128 -K_QTILE = 64 -K_TILE = 256 -K_DTILE = 512 -N_FULL = 256 - - -def step1_numpy_sim(a, b): - """ - a: [m, k] float16/float32 - b: [n, k] float16/float32 - returns c: [m, n], equivalent to a @ b.T - """ - m_total, k_total = a.shape - n_total, k_total_b = b.shape - assert k_total == k_total_b - assert m_total % M_TILE == 0, "Step1 kernel uses full M tiles in this demo." - assert k_total % K_DTILE == 0, "Step1 kernel uses full K_DTILE tiles." - assert n_total % N_FULL == 0, "Tutorial simulation assumes full N tiles." - - # Corresponds to: n_loop, m_loop, core_loop, k_dtile_num - n_loop = (n_total + N_FULL - 1) // N_FULL - m_loop = m_total // M_TILE - core_loop = n_loop * m_loop - k_dtile_num = k_total // K_DTILE - - c = np.zeros((m_total, n_total), dtype=np.float32) - - # Corresponds to: for li in pto.range(...) 
- for li in range(core_loop): - # Corresponds to: m_idx = li // n_loop; n_idx = li % n_loop - m_idx = li // n_loop - n_idx = li % n_loop - m_offset = m_idx * M_TILE - n_offset = n_idx * N_FULL - - # Corresponds to tile accumulator c_l0 - c_tile = np.zeros((M_TILE, N_FULL), dtype=np.float32) - - # Corresponds to: load A tile once before k_idx loop - a_l1 = a[m_offset : m_offset + M_TILE, 0:K_DTILE].astype(np.float32) - - for k_idx in range(k_dtile_num): - k_offset = k_idx * K_DTILE - is_first_k_tile = k_idx == 0 - - # prefetch A tile for current k chunk (equivalent to pto.load) - a_l1 = a[m_offset : m_offset + M_TILE, k_offset : k_offset + K_DTILE].astype(np.float32) - - # Corresponds to: for phase in range(8) - for phase in range(8): - # Corresponds to loading one B half tile every 4 phases - if phase % 4 == 0: - b_half = phase // 4 - h_off = b_half * K_TILE - b_l1 = b[n_offset : n_offset + N_FULL, k_offset + h_off : k_offset + h_off + K_TILE].astype( - np.float32 - ) - - # Corresponds to extract A/B quarter tiles - a_col = phase * K_QTILE - b_row = (phase % 4) * K_QTILE - a_l0 = a_l1[:, a_col : a_col + K_QTILE] - b_l0 = b_l1[:, b_row : b_row + K_QTILE] # [N_FULL, K_QTILE] - - # Corresponds to matmul vs matmul_acc - prod = a_l0 @ b_l0.T - if phase == 0 and is_first_k_tile: - c_tile = prod - else: - c_tile += prod - - c[m_offset : m_offset + M_TILE, n_offset : n_offset + N_FULL] = c_tile - - return c - - -def test_step1_numpy_sim(): - np.random.seed(0) - for m, n, k in [(256, 512, 512), (384, 768, 1024)]: - a = np.random.randn(m, k).astype(np.float16) - b = np.random.randn(n, k).astype(np.float16) - c_ref = a.astype(np.float32) @ b.astype(np.float32).T - c_sim = step1_numpy_sim(a, b) - np.testing.assert_allclose(c_sim, c_ref, rtol=1e-4, atol=1e-3) - print("step1_numpy_sim unit test passed") - - -if __name__ == "__main__": - test_step1_numpy_sim() From 45427a465cb3d828eadba9d87dbd2cea32218ac9 Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 
07:26:58 +0100 Subject: [PATCH 27/53] add Chinese version of matmul optim guide --- .../mamtul_optim_guide_zh.md | 344 ++++++++++++++++++ .../matmul_optim_guide.md | 12 +- 2 files changed, 352 insertions(+), 4 deletions(-) create mode 100644 examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md diff --git a/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md b/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md new file mode 100644 index 00000000..496d8785 --- /dev/null +++ b/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md @@ -0,0 +1,344 @@ +# 从零手搓昇腾Matmul,用100行Python追平CANN主线库性能
(基于 PTO-ISA 的逐步优化指南) + +For the English version, see [matmul_optim_guide.md](./matmul_optim_guide.md) + +- 日期:2026/03/12 + +# 目录 + +- [写作动机](#motivation) +- [第 0 步:给CUDA/Triton用户的NPU编程速通](#step-0-npu-programming-crash-course-for-cudatriton-programmers) + - [NPU kernel launch行为](#typical-kernel-launch-syntax) + - [Software pipelining,自动vs手动](#auto-vs-manual-software-pipelining) +- [第 1 步:功能正确的基础版本](#step-1-functionally-correct-naive-version) +- [第 2 步:Double buffering](#step-2-double-buffering) +- [第 3 步:通过 "Swizzling" 提升 L2 cache 复用](#step-3-swizzling-for-l2-cache-reuse) +- [第 4 步:(可选)手动 software pipelining](#step-4-optional-manual-software-pipelining) +- [附录 A:PTO-DSL 语法说明](#appendix-a-pto-dsl-syntax-note) +- [附录 B:NPU profiler 使用方法](#appendix-b-using-npu-profiler) + +**复现本文全部结果**,见 [README.md](./README.md) 里的命令。 + +
+# 写作动机 + +本文是NPU版本的“Matmul算子逐步优化实录”。这类文章在友商GPU十分流行(比如[这篇A100的](https://siboehm.com/articles/22/CUDA-MMM)和[这篇H100的](https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog)),但在我司的NPU上似乎还没有过公开的“从零手搓”教程。 + +我们会逐步把一个基于**约100行Python DSL**的算子优化到持平主线库的性能。对照的性能基线是NPU上的`torch.matmul`,内部调用[aclnnMatmul](https://www.hiascend.com/document/detail/zh/canncommercial/850/API/aolapi/context/ops-nn/aclnnMatmul.md)(NPU的“cuBLAS平替”),实现方式为[上万行的AscendC代码](https://gitcode.com/cann/ops-nn/tree/v8.5.0/matmul/mat_mul_v3/op_kernel)。 + +本教程的代码坚持:**极简、易于魔改、不带黑盒模板封装**,只提炼**少数最关键的**性能优化点。还有些更全面的、对corner case考虑更细致的Matmul实现例如[Catlass的矩阵乘模板总结](https://gitcode.com/cann/catlass/blob/master/docs/contents/advanced/matmul_template_summary.md)和[AscendC的Matmul性能优化策略总览](https://www.hiascend.com/document/detail/zh/canncommercial/850/opdevg/Ascendcopdevg/atlas_ascendc_best_practices_10_10006.html),把大量优化都藏在了模板和封装里,适合作为后续进阶材料。 + + +# 第 0 步:给 CUDA/Triton 用户的 NPU 编程速通 + +(如果你已经写过NPU算子,可快速略过本节) + + +## NPU kernel launch行为 + +NPU上[SPMD](https://en.wikipedia.org/wiki/Single_program,_multiple_data)风格的kernel看起来和CUDA/Triton语法**似乎很像**: +- 内置变量`block_idx`和`block_num`用于每个core的地址offset计算 -- [示例](https://github.com/huawei-csl/pto-dsl/blob/7f8176a648c7c4ca03b09bd75f8b615d4bac0eaf/examples/jit/add_dynamic_multicore/run_add.py#L46-L51) +- CUDA画风的`kernel_name<<>>(args)`kernel launch方式 -- [示例](https://github.com/huawei-csl/pto-dsl/blob/7f8176a648c7c4ca03b09bd75f8b615d4bac0eaf/examples/aot/add_dynamic_multicore/caller.cpp#L11) + +其实二者有个关键区别:NPU算子的写法基本都属于CUDA术语里的["persistent kernels"](https://triton-lang.org/main/getting-started/tutorials/09-persistent-matmul.html),也就是`block_dim`等于硬件的核数,而不是随着输入数据size增长。 + +例如这个[基于PTO的动态shape向量相加](https://github.com/huawei-csl/pto-dsl/blob/d923ac2ed3c1a2180475c1d279699ea952022e77/examples/jit/add_dynamic_multicore/run_add.py#L46-L100):每个core不仅自己算好global memory 
offset,计算的循环迭代次数也会[随着动态的输入数据size而增加](https://github.com/huawei-csl/pto-dsl/blob/d923ac2ed3c1a2180475c1d279699ea952022e77/examples/jit/add_dynamic_multicore/run_add.py#L83)。这和常规的(非“persistent”)CUDA/Triton kernel 不一样。比如 [Triton vector add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#compute-kernel) 设定 `grid = (ceil_div(n_elements, BLOCK_SIZE),)`,用launch时动态计算的`block_dim`匹配动态input size;而我们大多数的NPU kernel(不管基于PTO、AscendC、CCE 还是其他框架)通常都是 `grid = (num_cores,)`。 + +(在NPU上,大于核数的`block_dim`在简单场景能跑通,但Cube-Vector核间同步容易出bug。而且`block_dim >= 65536`会溢出,远小于CUDA的`maxGridSize`。我们遇过这个bug,通过切回“persistent-kernel”写法[修好了](https://github.com/huawei-csl/pto-kernels/pull/39)) + + +## Software pipelining,自动vs手动 + +NPU的片上缓存为[scratchpad memory](https://en.wikipedia.org/wiki/Scratchpad_memory),而非硬件管理的cache。所以要避免[data hazards](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Data_hazards)需要开发者或编译器正确地使用[set_flag & wait_flag 接口](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0106.html),本质上是基于 [binary semaphore](https://en.wikipedia.org/wiki/Semaphore_(programming)#Producer%E2%80%93consumer_problem) 的同步机制。CUDA里最接近的是[`cp.async`+`wait`那一套](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/async-copies.html)。可以参考这个[基于PTO-ISA手动同步的vector add示例](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/demos/torch_jit/add/add_custom.cpp#L78-L115)。对更复杂的融合算子如[FlashAttention](https://github.com/PTO-ISA/pto-isa/tree/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/kernels/manual/common/flash_atten),思考手动同步、software pipelining 和 prefetching, 对算子开发人员过于烧脑。 + +为了解决这个痛点,[PTO-DSL](https://github.com/huawei-csl/pto-dsl) 提供了自动同步,内部由基于[PTO MLIR dialect](https://github.com/zhangstevenunity/PTOAS/blob/8eb9e23fa95e18c3db789e0a171a98df07a8a846/docs/PTO_IR_manual.md)的[InsertSync 
pass](https://github.com/zhangstevenunity/PTOAS/tree/8eb9e23fa95e18c3db789e0a171a98df07a8a846/lib/PTO/Transforms/InsertSync)实现。对用户而言,算子代码看起来还是“串行的”(在pipelining意义上),写起来更接近Triton/CuTile的手感。 + + +# 第 1 步:功能正确的基础版本 + +根据[NPU硬件架构](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/opdevg/Ascendcopdevg/atlas_ascendc_10_0008.html),要完成matmul需要的数据搬运路径是: +- `GM`(global memory)-> `L1` -> `L0`(左/右操作数对应`L0A`/`L0B`)-> `Cube core` -> `L0C` -> `GM` + +读取到片上的tile 大小(算法参数)受到 L1/L0 SRAM容量(硬件参数)的约束。要查询[硬件参数规格](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/opdevg/Ascendcopdevg/atlas_ascendc_10_0011.html),可以在任意安装了CANN的环境里看文件 `${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/*.ini`: + +```bash +grep -A 9 "AICoreSpec" ${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/Ascend910B2.ini +``` + +输出: + +``` +[AICoreSpec] +... +l0_a_size=65536 # 64 KiB +l0_b_size=65536 # 64 KiB +l0_c_size=131072 # 128 KiB +l1_size=524288 # 512 KiB +``` + +考虑经典的[分块矩阵乘法](https://en.wikipedia.org/wiki/Loop_nest_optimization#Example:_matrix_multiplication)。任意shape的`C = A @ B`运算会被分解为tile级别操作:`A_tile = A[i1:i2,k1:k2]`、`B_tile = B[k1:k2,j1:j2]`、`C_tile = C[i1:i2,j1:j2]`,保证每个 tile 能放进 SRAM。结合上面的 SRAM 信息,这里选择: +- `A_tile` 在 `L1` 上为 `[128 x 512]`,占 128 KiB(fp16) +- `B_tile` 在 `L1` 上为 `[256 x 256]`,占 128 KiB(fp16) +- `A_tile` 在 `L0A` 上为 `[128 x 64]`,占 16 KiB(fp16) +- `B_tile` 在 `L0B` 上为 `[64 x 256]`,占 32 KiB(fp16) +- `C_tile` 在 `L0C` 上为 `[128 x 256]`,占 128 KiB(fp32 accumulation) +- Cube unit执行size为`(M, N, K) = (128, 256, 64)` 的 [`TMATMUL`](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TMATMUL.md) 指令,输入为 `L0A` 和 `L0B`,输出为 `L0C`。 + +为啥选这组参数: +- 这是[ATB 库的matmul](https://gitcode.com/cann/ascend-transformer-boost/blob/br_release_cann_8.5.0_20260527/src/kernels/kernels/matmul/pp_matmul_f16_kernel/op_kernel/pp_matmul.cce?init=initTree) 的常用tiling方案之一。也有其他很多可行组合,只要 buffer 能装下。 +- L0上更大的tile有利于Cube unit达到更高的FLOPS。比如128 x 128比32 x 
32的FLOPs高好几倍。完整支持的 matmul shape 和 dtype 参见[`Mmad`指令](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/ascendcopapi/atlasascendc_api_07_0249.html)。 +- `L1`、`L0A`、`L0B` 都预留了 >=50% 空间没用,留给下一步double-buffering用。 + +[step1_baseline_numpy_sim.py](./step1_baseline_numpy_sim.py) 提供了“NumPy 仿真代码”帮助理解算法逻辑。这里用的算法是最基础的 “split-MN matmul”,每个 core 输出自己的 `C_tile = C[i1:i2,j1:j2]`。(Split-K 和 Stream-K等变种留到以后再说)。算法核心逻辑如下: +- 顶层循环 `for li in range(core_loop):` 来自前文的“persistent kernel”要求。我们不做双层“行列循环”,而是把它们合并成单层 `core_loop = n_loop * m_loop`。这样每次迭代都可以独立分配给不同 core,并独立完成一个 `C_tile`。 +- 然后只需沿内层 K 维做累加: + - 第二层 `for k_idx in range(k_dtile_num)` 对应 “GM - L1 级”迭代:当前 `L1` tile 被 matmul 用完后,再从 `GM` 加载下一个。 + - 第三层 `for phase in range(8):` 对应 “L1 - L0 级”迭代:当前 `L0` tile 被 matmul 用完后,再从 `L1` 加载下一个。 + - 由于 `L1` tile 和 `L0` tile 的尺寸比固定,第三层循环可以**静态展开**。因为 `L0` tile 小于 `L1` tile,每次 “L1 级”迭代会对应多个 “L0 级”迭代。 + +接着把NumPy翻译成等价的PTO-DSL,见 [step1_baseline.py](./step1_baseline.py) 和 [common_utils.py](./common_utils.py)。代码结构几乎一一对应,只是把NumPy API换成了NPU特有的API: +- `pto.load`([`TLOAD`](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TLOAD.md))做 `GM`->`L1` +- `tile.extract`([`TEXTRACT`](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TEXTRACT.md))做 `L1`->`L0A`、`L1`->`L0B` +- `tile.matmul`/`tile.matmul_acc`([`TMATMUL`](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TMATMUL.md)/[`TMATMUL_ACC`](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TMATMUL_ACC.md))做 `L0` 上的计算 +- `pto.store`([`TSTORE`](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/docs/isa/TSTORE.md))做 `L0C`->`GM` +- 静态loop unrolling用 Python 原生 `for i in range()`;run-time动态循环用 `for i in pto.range()`。`if`/`else` 也同理类似。 + +更详细的DSL语法说明见 [附录 A:PTO-DSL 语法说明](#appendix-a-pto-dsl-syntax-note)。 + +这个80行的算子实现可以在NPU跑出正确的数值结果,但性能只有 `torch.matmul` 的 50% 
左右。下一节追上性能差距。 + +![image info](./fig/flops_step1_baseline.png) + + +# 第 2 步:Double buffering + +先用 `msprof op simulator` 测试前一版 kernel: + +```bash +msprof op simulator --aic-metrics=PipeUtilization \ + --kernel-name="_Z28matmul_kernel_step1_baselinePDhS_S_iii_mix_aic" \ + --output="msprof_res" --launch-count=5 \ + python ./run_matmul.py --variant step1-baseline +``` + +(更多 profiler 用法见 [附录 B:NPU profiler 使用方法](#appendix-b-using-npu-profiler)) + +可以看到 Cube core 有 50% 时间在空转: + +![image info](./fig/pipeline_N1024_baseline.png) + +做了Double buffering(本质是用空间换时间),可以把计算和数据传输尽量重叠: + +![image info](./fig/pipeline_N1024_doublebuf.png) + +完整代码见 [./step2_doublebuffer.py](./step2_doublebuffer.py)。 + +Profile改进后的算子: + +
+ +```bash +msprof op simulator --aic-metrics=PipeUtilization \ + --kernel-name="_Z26matmul_kernel_ABt_autosyncPDhS_S_iii_mix_aic" \ + --output="msprof_res" --launch-count=5 \ + python ./run_matmul.py --variant step2-doublebuffer +``` + +
+ +唯一的代码改动是在 `L1` 和 `L0` 上给 `A_tile`、`B_tile` 各开 2 份 buffer: + +```python +a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] +b_l1 = [pto.alloc_tile(tile_buf_b_l1), pto.alloc_tile(tile_buf_b_l1)] +a_l0 = [pto.alloc_tile(tile_buf_a_l0), pto.alloc_tile(tile_buf_a_l0)] +b_l0 = [pto.alloc_tile(tile_buf_b_l0), pto.alloc_tile(tile_buf_b_l0)] +``` + +然后在迭代之间交替使用 "odd" / "even" 两块 buffer。 + +优化效果显著,对于中小规模的矩阵,FLOPs 基本翻倍: +![image info](./fig/flops_step2_doublebuf.png) + +但矩阵一旦变大(比如 16384x16384),FLOPs 会**突然跌落**。原因是 NPU 的 L2 cache 装不下整块矩阵,开始出现 cache eviction。 + +查看 L2 cache 大小: + +```bash +grep -A 8 "SoCInfo" ${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/Ascend910B2.ini +``` + +输出: + +``` +[SoCInfo] +ai_core_cnt=24 +cube_core_cnt=24 +vector_core_cnt=48 +ai_cpu_cnt=6 +memory_type= +memory_size=68719476736 # 64 GiB +l2_type=0 +l2_size=201326592 # 192 MiB +``` + +8192x8192 矩阵(float16 下 64 MiB)小于 L2;而16384x16384(float16 下 256 MiB)大于 L2,所以后者的性能显著更差。 + +`910B4` 的 HBM 和 L2 都是 910B2 的一半(因此更小矩阵就会触发cache eviction): + +```bash +grep -A 8 "SoCInfo" ${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/Ascend910B4.ini +``` + +``` +[SoCInfo] +ai_core_cnt=20 +cube_core_cnt=20 +vector_core_cnt=40 +ai_cpu_cnt=6 +memory_type= +memory_size=34359738368 # 32 GiB +l2_type=0 +l2_size=100663296 # 96 MiB +``` + + +# 第 3 步:通过 "Swizzling" 提升 L2 cache 复用 + +提高多核之间的L2 cache复用,“swizzling”是最常用的技巧,对NPU和GPU都适用。下图借自 [Triton matmul讲解](https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html#l2-cache-optimizations): +Grouped vs row-major ordering (from Triton) + +这张图可以这样理解:假设第一轮迭代有9个核各自在算 `C` 的一个子块(黄色标记区域,0~8 是 core id)。在朴素的 "row-major ordering" 下,完整 B 矩阵(假设大于 L2)要频繁从 global memory 读取;而用 "grouped ordering" 后,global memory traffic大幅下降。 + +[step3_swizzle.py](./step3_swizzle.py) 在 step2 基础上只加了一个 10 行 swizzle 函数 `swizzle_nz`,其余代码完全不动。[step3_swizzle_numpy_sim.py](./step3_swizzle_numpy_sim.py) 直观解释了swizzle对循环下标的影响。这个具体swizzle方案来自[catlass的block 
swizzle](https://gitcode.com/cann/catlass/blob/v1.4.0/include/catlass/gemm/block/block_swizzle.hpp)([讲解文档](https://gitcode.com/cann/catlass/blob/v1.4.0/docs/contents/advanced/swizzle_explanation.md))。 + +(For GPU熟练工: 这个下标重映射类似[DeepGEMM的scheduler](https://github.com/deepseek-ai/DeepGEMM/blob/v2.1.1/deep_gemm/include/deep_gemm/common/scheduler.cuh),重排每个SM的数据分配和循环顺序) + +只加这 10 行 swizzle,FLOPs 就有明显提升,达到了`torch.matmul`的90%! + +![image info](./fig/flops_step3_swizzle.png) + +为了确认是L2 cache在起作用,用`msprof op`检查cache hit: + +```bash +msprof op \ + --aic-metrics=Occupancy,Roofline,Default,L2Cache,PipeUtilization,MemoryL0 \ + --kernel-name="_Z26matmul_kernel_ABt_autosyncPDhS_S_iii_mix_aic" \ + --output="msprof_res" --launch-count=5 \ + python ./run_matmul.py --variant step3-swizzle +``` + +对4096x4096小矩阵,即使不用swizzled loop order,L2 hit就很高(97.88%): + +cachehit_N4096 + +对16384x16384大矩阵,由于超过了L2 size,不swizzle的话L2 hit低到了30.9%: + +cachehit_N16384 + +加了swizzling后,16384x16384场景的L2 hit 提升到93.72%了: + +cachehit_N16384_swizzle + + +# 第 4 步:(可选)手动 software pipelining + +最后这 10% 的性能差距,可以通过 [./step4_manual_pipelining.py](./step4_manual_pipelining.py) 里的手动排流水压榨出来。 + +![image info](./fig/flops_step4_manual_pipeline.png) + +即便做了手动同步,代码也只是从 ~100 行增长到 ~150 行 Python,仍然比CANN算子库的代码短很多。如何手工排流水超出了本文的讲解范围。我们正在 [推进相关 compile pass](https://github.com/zhangstevenunity/PTOAS/issues/226),争取让编译器自动同步性能持平手排。 + + +# 附录 A:PTO-DSL 语法说明 + +当前的 [PTO-DSL package](https://github.com/huawei-csl/pto-dsl/tree/3f0860b1e750f2c4d26a93c6501a212b60196863/ptodsl) 只是在 PTO dialect 的 [MLIR Python bindings](https://mlir.llvm.org/docs/Bindings/Python/)上做了很薄的封装。整个DSL包只有 **约1000行Python**(可以用 `cd ptodsl && find . 
-name "*.py" | xargs wc -l` 自行确认) + +为了在开发阶段维持一个简单好改的框架,我们目前**不**做Python AST parsing / AST rewriting。因此,所有 Python 原生语法(包括`if`/`for` 控制流、Python class、iterator 等)都按普通Python代码执行。这点和其他Python DSL的做法不太相同:有的是纯 AST 路线(如 Triton、CuTile),有的是 AST+tracing 混合路线(如 Tilelang、CuteDSL),它们 *可能会,也可能不会* 把原生 `if`/`range` rewrite成特殊 IR builder(可参考 [CuteDSL 的复杂规则](https://github.com/Dao-AILab/quack/blob/v0.3.2/docs/dsl_control_flow.rst))。当前 PTO-DSL frontend 是纯 Python tracing,更接近 JAX 的思路。 + +**用户只要记住:** run-time动态控制流全在 `pto` 命名空间里(例如 `pto.range`,会在 IR 中生成 [MLIR structured control flow](https://mlir.llvm.org/docs/Dialects/SCFDialect/));而 Python 原生控制流是在 build-time 就求值完成的。 + +常见场景: + +- **Python `for ... in range(...)`** + - 在生成 IR 前执行(build-time) + - 常用于编译期 metaprogramming / unrolling +- **`for ... in pto.range(...)`** + - 生成 MLIR `scf.for` loop + - 在 kernel run-time 动态执行 +- **Python `if condition:`** + - condition 在 build-time 由 Python 求值 + - 分支在生成 IR 前就被选定 +- **`with pto.if_context(cond):` / `pto.cond(...)`** + - 在 IR 中生成 runtime `scf.if` + - condition 在 kernel 运行时求值 + +**示例 1:`pto.range`(IR 里的 runtime loop)** + +来自 `step1_baseline.py`: + +```python +for li in pto.range(bid, core_loop, num_blocks): + ... +``` + +这**不是**普通Python循环。在 PTO-DSL 里,`pto.range` 是一个 IR-builder primitive(见 `control_flow.py`),会创建 `scf.ForOp` 并返回 induction-variable。 + +实际效果:会以 loop 形式保留在 IR 里(不会被 Python 展开) + +**示例 2:Python `range`(build-time unrolling)** + +来自 `step1_baseline.py`: + +```python +for phase in range(8): + ... 
+``` + +这个 loop 在构建 IR 时由 Python 执行,所以通常会在 IR 中生成 8 份重复代码区域。 + +类比C++编程: +- 概念上接近 compile-time codegen / metaprogramming +- 当 loop bound 是小常量时非常实用 + +**示例 3:Python `if` vs `pto.if_context`** + +来自 `step1_baseline.py`: + +```python +if phase == 0: + with pto.if_context(is_first_k_tile, has_else=True) as branch: + tile.matmul(a_l0, b_l0, c_l0) + with branch.else_context(): + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) +else: + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) +``` + +理解方式: +- `if phase == 0` 是 **普通Python** 分支(build-time) +- `pto.if_context(is_first_k_tile, ...)` 在 IR 中生成 **runtime** 分支 + + +# 附录 B:NPU profiler 使用方法 + +`--kernel-name=` 参数里的 kernel 名字怎么找:先不带 `--kernel-name=` 跑一次 `msprof op`,输出里会直接打印 kernel 名。 + +完整官方文档见 [msProf](https://www.hiascend.com/document/detail/zh/canncommercial/850/devaids/optool/atlasopdev_16_0082.html)。 + +查看 profiler trace 的 UI 工具下载: + +```bash +# Windows x86 +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_win.exe + +# Mac arm and x86 +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_darwin-aarch64.dmg +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_darwin-x86_64.dmg + +# Linux arm and x86 +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_linux-aarch64.zip +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindStudio/MindStudio%208.3.0/MindStudio-Insight_8.3.0_linux-x86_64.zip +``` + +以上链接来自 [CANN 下载页](https://www.hiascend.com/developer/download/community/result?module=sto)。 diff --git a/examples/aot/matmul_optimization_guide/matmul_optim_guide.md b/examples/aot/matmul_optimization_guide/matmul_optim_guide.md index 510c3d60..13639881 100644 --- a/examples/aot/matmul_optimization_guide/matmul_optim_guide.md +++ b/examples/aot/matmul_optimization_guide/matmul_optim_guide.md @@ 
-23,9 +23,9 @@ This guide is the NPU version of "step-by-step matmul optimization", a popular article style for NVIDIA GPUs (e.g. [for A100](https://siboehm.com/articles/22/CUDA-MMM) and [for H100](https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog)), but never written for our NPUs before. -I intentionally keep the code samples **minimal, hackable, from-scratch, and without magical templates and wrappers**, to make them easier to follow than the more advanced "Matmul optimization practices" [in catlass](https://gitcode.com/cann/catlass/blob/master/docs/contents/advanced/matmul_template_summary.md) or [in AscendC](https://www.hiascend.com/document/detail/zh/canncommercial/850/opdevg/Ascendcopdevg/atlas_ascendc_best_practices_10_10006.html) (which hide optimization tricks behind templates and wrappers). +We show step-by-step how to match the performance of a carefully optimized official library, using **only ~100 lines of Python DSL**. The target to compare is `torch.matmul`, which invokes [aclnnMatmul](https://www.hiascend.com/document/detail/zh/canncommercial/850/API/aolapi/context/ops-nn/aclnnMatmul.md) (our "cuBLAS" for NPU), internally implemented by [many thousands of lines of AscendC](https://gitcode.com/cann/ops-nn/tree/v8.5.0/matmul/mat_mul_v3/op_kernel). -We will compare our custom kernel's performance to `torch.matmul`, which invokes [aclnnMatmul](https://www.hiascend.com/document/detail/zh/canncommercial/850/API/aolapi/context/ops-nn/aclnnMatmul.md) (our "cuBLAS" for NPU), internally implemented by [many thousands of lines of AscendC](https://gitcode.com/cann/ops-nn/tree/v8.5.0/matmul/mat_mul_v3/op_kernel). We show step-by-step how to match the performance of such a carefully optimized library, using **only ~100 lines of Python DSL**. +I intentionally keep the code samples **minimal, hackable, from-scratch, and without magical templates and wrappers**, to highlight the few key optimizations. 
There are more comprehensive "Matmul optimization lists" [in catlass](https://gitcode.com/cann/catlass/blob/master/docs/contents/advanced/matmul_template_summary.md) or [in AscendC](https://www.hiascend.com/document/detail/zh/canncommercial/850/opdevg/Ascendcopdevg/atlas_ascendc_best_practices_10_10006.html), which hide optimization tricks behind templates and wrappers. They are better suited for later, more advanced study. # Step 0: NPU programming crash course for CUDA/Triton programmers @@ -125,11 +125,12 @@ Double buffering overlaps compute and data transfer: ![image info](./fig/pipeline_N1024_doublebuf.png) - See full code in [./step2_doublebuffer.py](./step2_doublebuffer.py). Profile with: +
+ ```bash msprof op simulator --aic-metrics=PipeUtilization \ --kernel-name="_Z26matmul_kernel_ABt_autosyncPDhS_S_iii_mix_aic" \ @@ -137,7 +138,9 @@ msprof op simulator --aic-metrics=PipeUtilization \ python ./run_matmul.py --variant step2-doublebuffer ``` -The only difference is that we allocate 2x local buffers for A and B on both `L1` and `L0`: +
+ +The only difference is that we allocate 2x local buffers for `A_tile` and `B_tile` on both `L1` and `L0`: ```python a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] @@ -201,6 +204,7 @@ Swizzling improves L2 cache reuse across multiple cores. We borrow this figure [ To read this figure, assume 9 cores computing a subset `C` matrix in the first iteration (the yellow area, each number 0 ~ 8 marks the core id). In the naive "row-major ordering", the full matrix B (assume larger than L2 cache!) needs to be loaded from global memory; while in the "grouped ordering", the data traffic w.r.t. global memory is much less. [step3_swizzle.py](./step3_swizzle.py) incorporates a 10-line swizzling function `swizzle_nz`, while keeping the rest of the code same as step2. [step3_swizzle_numpy_sim.py](./step3_swizzle_numpy_sim.py) explains the swizzle scheme intuitively. The swizzle algorithm is one of the algorithms [from catlass](https://gitcode.com/cann/catlass/blob/v1.4.0/include/catlass/gemm/block/block_swizzle.hpp), which also [has a nice explanation](https://gitcode.com/cann/catlass/blob/v1.4.0/docs/contents/advanced/swizzle_explanation.md) + (for GPU experts -- such index remapping is analogous to [the "scheduler" in DeepGEMM](https://github.com/deepseek-ai/DeepGEMM/blob/v2.1.1/deep_gemm/include/deep_gemm/common/scheduler.cuh), which alters data assignment and loop order for each SM) With just this 10-line swizzle function, the FLOPs are much improved, reaching ~90% of `torch.matmul`! 
From 867f5f2a2ac27447bf6f6741ad3c292cb79b6def Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 07:30:34 +0100 Subject: [PATCH 28/53] re-org example dir to multi-level --- examples/aot/{ => activations}/geglu_dynamic_multicore/.gitignore | 0 examples/aot/{ => activations}/geglu_dynamic_multicore/README.md | 0 .../aot/{ => activations}/geglu_dynamic_multicore/bench_geglu.py | 0 examples/aot/{ => activations}/geglu_dynamic_multicore/caller.cpp | 0 examples/aot/{ => activations}/geglu_dynamic_multicore/compile.sh | 0 .../{ => activations}/geglu_dynamic_multicore/geglu_builder.py | 0 .../aot/{ => activations}/geglu_dynamic_multicore/run_geglu.py | 0 examples/aot/{ => activations}/relu_dynamic_multicore/.gitignore | 0 examples/aot/{ => activations}/relu_dynamic_multicore/README.md | 0 examples/aot/{ => activations}/relu_dynamic_multicore/caller.cpp | 0 examples/aot/{ => activations}/relu_dynamic_multicore/compile.sh | 0 .../aot/{ => activations}/relu_dynamic_multicore/relu_builder.py | 0 examples/aot/{ => activations}/relu_dynamic_multicore/run_relu.py | 0 .../aot/{ => batch_matmul}/matmul_dynbatch_multicore/.gitignore | 0 .../aot/{ => batch_matmul}/matmul_dynbatch_multicore/README.md | 0 .../aot/{ => batch_matmul}/matmul_dynbatch_multicore/caller.cpp | 0 .../aot/{ => batch_matmul}/matmul_dynbatch_multicore/compile.sh | 0 .../matmul_dynbatch_multicore/matmul_builder.py | 0 .../{ => batch_matmul}/matmul_dynbatch_multicore/run_matmul.py | 0 .../{ => batch_matmul}/matmul_dynbatch_multicore_2buf/README.md | 0 .../{ => batch_matmul}/matmul_dynbatch_multicore_2buf/caller.cpp | 0 .../{ => batch_matmul}/matmul_dynbatch_multicore_2buf/compile.sh | 0 .../matmul_dynbatch_multicore_2buf/matmul_dsl.py | 0 .../matmul_dynbatch_multicore_2buf/matmul_ref.cpp | 0 .../matmul_dynbatch_multicore_2buf/run_matmul.py | 0 .../{ => batch_matmul}/matmul_dynbatch_multicore_opt/.gitignore | 0 .../{ => batch_matmul}/matmul_dynbatch_multicore_opt/README.md | 0 .../{ => 
batch_matmul}/matmul_dynbatch_multicore_opt/caller.cpp | 0 .../{ => batch_matmul}/matmul_dynbatch_multicore_opt/compile.sh | 0 .../matmul_dynbatch_multicore_opt/matmul_builder.py | 0 .../matmul_dynbatch_multicore_opt/run_matmul.py | 0 examples/aot/{ => elementwise}/add_dynamic_multicore/.gitignore | 0 examples/aot/{ => elementwise}/add_dynamic_multicore/README.md | 0 .../aot/{ => elementwise}/add_dynamic_multicore/add_builder.py | 0 .../{ => elementwise}/add_dynamic_multicore/add_double_builder.py | 0 examples/aot/{ => elementwise}/add_dynamic_multicore/bench_add.py | 0 examples/aot/{ => elementwise}/add_dynamic_multicore/caller.cpp | 0 examples/aot/{ => elementwise}/add_dynamic_multicore/compile.sh | 0 .../aot/{ => elementwise}/add_dynamic_multicore/compile_double.sh | 0 examples/aot/{ => elementwise}/add_dynamic_multicore/run_add.py | 0 examples/aot/{ => simple_static}/add_static_multicore/.gitignore | 0 examples/aot/{ => simple_static}/add_static_multicore/README.md | 0 .../aot/{ => simple_static}/add_static_multicore/add_builder.py | 0 examples/aot/{ => simple_static}/add_static_multicore/caller.cpp | 0 examples/aot/{ => simple_static}/add_static_multicore/compile.sh | 0 examples/aot/{ => simple_static}/add_static_multicore/run_add.py | 0 .../aot/{ => simple_static}/matmul_static_singlecore/.gitignore | 0 .../aot/{ => simple_static}/matmul_static_singlecore/README.md | 0 .../aot/{ => simple_static}/matmul_static_singlecore/caller.cpp | 0 .../aot/{ => simple_static}/matmul_static_singlecore/compile.sh | 0 .../matmul_static_singlecore/matmul_builder.py | 0 .../{ => simple_static}/matmul_static_singlecore/run_matmul.py | 0 52 files changed, 0 insertions(+), 0 deletions(-) rename examples/aot/{ => activations}/geglu_dynamic_multicore/.gitignore (100%) rename examples/aot/{ => activations}/geglu_dynamic_multicore/README.md (100%) rename examples/aot/{ => activations}/geglu_dynamic_multicore/bench_geglu.py (100%) rename examples/aot/{ => 
activations}/geglu_dynamic_multicore/caller.cpp (100%) rename examples/aot/{ => activations}/geglu_dynamic_multicore/compile.sh (100%) rename examples/aot/{ => activations}/geglu_dynamic_multicore/geglu_builder.py (100%) rename examples/aot/{ => activations}/geglu_dynamic_multicore/run_geglu.py (100%) rename examples/aot/{ => activations}/relu_dynamic_multicore/.gitignore (100%) rename examples/aot/{ => activations}/relu_dynamic_multicore/README.md (100%) rename examples/aot/{ => activations}/relu_dynamic_multicore/caller.cpp (100%) rename examples/aot/{ => activations}/relu_dynamic_multicore/compile.sh (100%) rename examples/aot/{ => activations}/relu_dynamic_multicore/relu_builder.py (100%) rename examples/aot/{ => activations}/relu_dynamic_multicore/run_relu.py (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore/.gitignore (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore/README.md (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore/caller.cpp (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore/compile.sh (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore/matmul_builder.py (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore/run_matmul.py (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_2buf/README.md (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_2buf/caller.cpp (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_2buf/compile.sh (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_2buf/matmul_dsl.py (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_2buf/matmul_ref.cpp (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_2buf/run_matmul.py (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_opt/.gitignore (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_opt/README.md (100%) 
rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_opt/caller.cpp (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_opt/compile.sh (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_opt/matmul_builder.py (100%) rename examples/aot/{ => batch_matmul}/matmul_dynbatch_multicore_opt/run_matmul.py (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/.gitignore (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/README.md (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/add_builder.py (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/add_double_builder.py (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/bench_add.py (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/caller.cpp (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/compile.sh (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/compile_double.sh (100%) rename examples/aot/{ => elementwise}/add_dynamic_multicore/run_add.py (100%) rename examples/aot/{ => simple_static}/add_static_multicore/.gitignore (100%) rename examples/aot/{ => simple_static}/add_static_multicore/README.md (100%) rename examples/aot/{ => simple_static}/add_static_multicore/add_builder.py (100%) rename examples/aot/{ => simple_static}/add_static_multicore/caller.cpp (100%) rename examples/aot/{ => simple_static}/add_static_multicore/compile.sh (100%) rename examples/aot/{ => simple_static}/add_static_multicore/run_add.py (100%) rename examples/aot/{ => simple_static}/matmul_static_singlecore/.gitignore (100%) rename examples/aot/{ => simple_static}/matmul_static_singlecore/README.md (100%) rename examples/aot/{ => simple_static}/matmul_static_singlecore/caller.cpp (100%) rename examples/aot/{ => simple_static}/matmul_static_singlecore/compile.sh (100%) rename examples/aot/{ => simple_static}/matmul_static_singlecore/matmul_builder.py (100%) rename 
examples/aot/{ => simple_static}/matmul_static_singlecore/run_matmul.py (100%) diff --git a/examples/aot/geglu_dynamic_multicore/.gitignore b/examples/aot/activations/geglu_dynamic_multicore/.gitignore similarity index 100% rename from examples/aot/geglu_dynamic_multicore/.gitignore rename to examples/aot/activations/geglu_dynamic_multicore/.gitignore diff --git a/examples/aot/geglu_dynamic_multicore/README.md b/examples/aot/activations/geglu_dynamic_multicore/README.md similarity index 100% rename from examples/aot/geglu_dynamic_multicore/README.md rename to examples/aot/activations/geglu_dynamic_multicore/README.md diff --git a/examples/aot/geglu_dynamic_multicore/bench_geglu.py b/examples/aot/activations/geglu_dynamic_multicore/bench_geglu.py similarity index 100% rename from examples/aot/geglu_dynamic_multicore/bench_geglu.py rename to examples/aot/activations/geglu_dynamic_multicore/bench_geglu.py diff --git a/examples/aot/geglu_dynamic_multicore/caller.cpp b/examples/aot/activations/geglu_dynamic_multicore/caller.cpp similarity index 100% rename from examples/aot/geglu_dynamic_multicore/caller.cpp rename to examples/aot/activations/geglu_dynamic_multicore/caller.cpp diff --git a/examples/aot/geglu_dynamic_multicore/compile.sh b/examples/aot/activations/geglu_dynamic_multicore/compile.sh similarity index 100% rename from examples/aot/geglu_dynamic_multicore/compile.sh rename to examples/aot/activations/geglu_dynamic_multicore/compile.sh diff --git a/examples/aot/geglu_dynamic_multicore/geglu_builder.py b/examples/aot/activations/geglu_dynamic_multicore/geglu_builder.py similarity index 100% rename from examples/aot/geglu_dynamic_multicore/geglu_builder.py rename to examples/aot/activations/geglu_dynamic_multicore/geglu_builder.py diff --git a/examples/aot/geglu_dynamic_multicore/run_geglu.py b/examples/aot/activations/geglu_dynamic_multicore/run_geglu.py similarity index 100% rename from examples/aot/geglu_dynamic_multicore/run_geglu.py rename to 
examples/aot/activations/geglu_dynamic_multicore/run_geglu.py diff --git a/examples/aot/relu_dynamic_multicore/.gitignore b/examples/aot/activations/relu_dynamic_multicore/.gitignore similarity index 100% rename from examples/aot/relu_dynamic_multicore/.gitignore rename to examples/aot/activations/relu_dynamic_multicore/.gitignore diff --git a/examples/aot/relu_dynamic_multicore/README.md b/examples/aot/activations/relu_dynamic_multicore/README.md similarity index 100% rename from examples/aot/relu_dynamic_multicore/README.md rename to examples/aot/activations/relu_dynamic_multicore/README.md diff --git a/examples/aot/relu_dynamic_multicore/caller.cpp b/examples/aot/activations/relu_dynamic_multicore/caller.cpp similarity index 100% rename from examples/aot/relu_dynamic_multicore/caller.cpp rename to examples/aot/activations/relu_dynamic_multicore/caller.cpp diff --git a/examples/aot/relu_dynamic_multicore/compile.sh b/examples/aot/activations/relu_dynamic_multicore/compile.sh similarity index 100% rename from examples/aot/relu_dynamic_multicore/compile.sh rename to examples/aot/activations/relu_dynamic_multicore/compile.sh diff --git a/examples/aot/relu_dynamic_multicore/relu_builder.py b/examples/aot/activations/relu_dynamic_multicore/relu_builder.py similarity index 100% rename from examples/aot/relu_dynamic_multicore/relu_builder.py rename to examples/aot/activations/relu_dynamic_multicore/relu_builder.py diff --git a/examples/aot/relu_dynamic_multicore/run_relu.py b/examples/aot/activations/relu_dynamic_multicore/run_relu.py similarity index 100% rename from examples/aot/relu_dynamic_multicore/run_relu.py rename to examples/aot/activations/relu_dynamic_multicore/run_relu.py diff --git a/examples/aot/matmul_dynbatch_multicore/.gitignore b/examples/aot/batch_matmul/matmul_dynbatch_multicore/.gitignore similarity index 100% rename from examples/aot/matmul_dynbatch_multicore/.gitignore rename to examples/aot/batch_matmul/matmul_dynbatch_multicore/.gitignore diff 
--git a/examples/aot/matmul_dynbatch_multicore/README.md b/examples/aot/batch_matmul/matmul_dynbatch_multicore/README.md similarity index 100% rename from examples/aot/matmul_dynbatch_multicore/README.md rename to examples/aot/batch_matmul/matmul_dynbatch_multicore/README.md diff --git a/examples/aot/matmul_dynbatch_multicore/caller.cpp b/examples/aot/batch_matmul/matmul_dynbatch_multicore/caller.cpp similarity index 100% rename from examples/aot/matmul_dynbatch_multicore/caller.cpp rename to examples/aot/batch_matmul/matmul_dynbatch_multicore/caller.cpp diff --git a/examples/aot/matmul_dynbatch_multicore/compile.sh b/examples/aot/batch_matmul/matmul_dynbatch_multicore/compile.sh similarity index 100% rename from examples/aot/matmul_dynbatch_multicore/compile.sh rename to examples/aot/batch_matmul/matmul_dynbatch_multicore/compile.sh diff --git a/examples/aot/matmul_dynbatch_multicore/matmul_builder.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore/matmul_builder.py similarity index 100% rename from examples/aot/matmul_dynbatch_multicore/matmul_builder.py rename to examples/aot/batch_matmul/matmul_dynbatch_multicore/matmul_builder.py diff --git a/examples/aot/matmul_dynbatch_multicore/run_matmul.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore/run_matmul.py similarity index 100% rename from examples/aot/matmul_dynbatch_multicore/run_matmul.py rename to examples/aot/batch_matmul/matmul_dynbatch_multicore/run_matmul.py diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/README.md b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/README.md similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_2buf/README.md rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/README.md diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/caller.cpp b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/caller.cpp similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_2buf/caller.cpp rename to 
examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/caller.cpp diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/compile.sh b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/compile.sh similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_2buf/compile.sh rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/compile.sh diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/matmul_dsl.py similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_2buf/matmul_dsl.py rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/matmul_dsl.py diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/matmul_ref.cpp b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/matmul_ref.cpp similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_2buf/matmul_ref.cpp rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/matmul_ref.cpp diff --git a/examples/aot/matmul_dynbatch_multicore_2buf/run_matmul.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/run_matmul.py similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_2buf/run_matmul.py rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/run_matmul.py diff --git a/examples/aot/matmul_dynbatch_multicore_opt/.gitignore b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/.gitignore similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_opt/.gitignore rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/.gitignore diff --git a/examples/aot/matmul_dynbatch_multicore_opt/README.md b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/README.md similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_opt/README.md rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/README.md diff --git a/examples/aot/matmul_dynbatch_multicore_opt/caller.cpp 
b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/caller.cpp similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_opt/caller.cpp rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/caller.cpp diff --git a/examples/aot/matmul_dynbatch_multicore_opt/compile.sh b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/compile.sh similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_opt/compile.sh rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/compile.sh diff --git a/examples/aot/matmul_dynbatch_multicore_opt/matmul_builder.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/matmul_builder.py similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_opt/matmul_builder.py rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/matmul_builder.py diff --git a/examples/aot/matmul_dynbatch_multicore_opt/run_matmul.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/run_matmul.py similarity index 100% rename from examples/aot/matmul_dynbatch_multicore_opt/run_matmul.py rename to examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/run_matmul.py diff --git a/examples/aot/add_dynamic_multicore/.gitignore b/examples/aot/elementwise/add_dynamic_multicore/.gitignore similarity index 100% rename from examples/aot/add_dynamic_multicore/.gitignore rename to examples/aot/elementwise/add_dynamic_multicore/.gitignore diff --git a/examples/aot/add_dynamic_multicore/README.md b/examples/aot/elementwise/add_dynamic_multicore/README.md similarity index 100% rename from examples/aot/add_dynamic_multicore/README.md rename to examples/aot/elementwise/add_dynamic_multicore/README.md diff --git a/examples/aot/add_dynamic_multicore/add_builder.py b/examples/aot/elementwise/add_dynamic_multicore/add_builder.py similarity index 100% rename from examples/aot/add_dynamic_multicore/add_builder.py rename to examples/aot/elementwise/add_dynamic_multicore/add_builder.py diff 
--git a/examples/aot/add_dynamic_multicore/add_double_builder.py b/examples/aot/elementwise/add_dynamic_multicore/add_double_builder.py similarity index 100% rename from examples/aot/add_dynamic_multicore/add_double_builder.py rename to examples/aot/elementwise/add_dynamic_multicore/add_double_builder.py diff --git a/examples/aot/add_dynamic_multicore/bench_add.py b/examples/aot/elementwise/add_dynamic_multicore/bench_add.py similarity index 100% rename from examples/aot/add_dynamic_multicore/bench_add.py rename to examples/aot/elementwise/add_dynamic_multicore/bench_add.py diff --git a/examples/aot/add_dynamic_multicore/caller.cpp b/examples/aot/elementwise/add_dynamic_multicore/caller.cpp similarity index 100% rename from examples/aot/add_dynamic_multicore/caller.cpp rename to examples/aot/elementwise/add_dynamic_multicore/caller.cpp diff --git a/examples/aot/add_dynamic_multicore/compile.sh b/examples/aot/elementwise/add_dynamic_multicore/compile.sh similarity index 100% rename from examples/aot/add_dynamic_multicore/compile.sh rename to examples/aot/elementwise/add_dynamic_multicore/compile.sh diff --git a/examples/aot/add_dynamic_multicore/compile_double.sh b/examples/aot/elementwise/add_dynamic_multicore/compile_double.sh similarity index 100% rename from examples/aot/add_dynamic_multicore/compile_double.sh rename to examples/aot/elementwise/add_dynamic_multicore/compile_double.sh diff --git a/examples/aot/add_dynamic_multicore/run_add.py b/examples/aot/elementwise/add_dynamic_multicore/run_add.py similarity index 100% rename from examples/aot/add_dynamic_multicore/run_add.py rename to examples/aot/elementwise/add_dynamic_multicore/run_add.py diff --git a/examples/aot/add_static_multicore/.gitignore b/examples/aot/simple_static/add_static_multicore/.gitignore similarity index 100% rename from examples/aot/add_static_multicore/.gitignore rename to examples/aot/simple_static/add_static_multicore/.gitignore diff --git 
a/examples/aot/add_static_multicore/README.md b/examples/aot/simple_static/add_static_multicore/README.md similarity index 100% rename from examples/aot/add_static_multicore/README.md rename to examples/aot/simple_static/add_static_multicore/README.md diff --git a/examples/aot/add_static_multicore/add_builder.py b/examples/aot/simple_static/add_static_multicore/add_builder.py similarity index 100% rename from examples/aot/add_static_multicore/add_builder.py rename to examples/aot/simple_static/add_static_multicore/add_builder.py diff --git a/examples/aot/add_static_multicore/caller.cpp b/examples/aot/simple_static/add_static_multicore/caller.cpp similarity index 100% rename from examples/aot/add_static_multicore/caller.cpp rename to examples/aot/simple_static/add_static_multicore/caller.cpp diff --git a/examples/aot/add_static_multicore/compile.sh b/examples/aot/simple_static/add_static_multicore/compile.sh similarity index 100% rename from examples/aot/add_static_multicore/compile.sh rename to examples/aot/simple_static/add_static_multicore/compile.sh diff --git a/examples/aot/add_static_multicore/run_add.py b/examples/aot/simple_static/add_static_multicore/run_add.py similarity index 100% rename from examples/aot/add_static_multicore/run_add.py rename to examples/aot/simple_static/add_static_multicore/run_add.py diff --git a/examples/aot/matmul_static_singlecore/.gitignore b/examples/aot/simple_static/matmul_static_singlecore/.gitignore similarity index 100% rename from examples/aot/matmul_static_singlecore/.gitignore rename to examples/aot/simple_static/matmul_static_singlecore/.gitignore diff --git a/examples/aot/matmul_static_singlecore/README.md b/examples/aot/simple_static/matmul_static_singlecore/README.md similarity index 100% rename from examples/aot/matmul_static_singlecore/README.md rename to examples/aot/simple_static/matmul_static_singlecore/README.md diff --git a/examples/aot/matmul_static_singlecore/caller.cpp 
b/examples/aot/simple_static/matmul_static_singlecore/caller.cpp similarity index 100% rename from examples/aot/matmul_static_singlecore/caller.cpp rename to examples/aot/simple_static/matmul_static_singlecore/caller.cpp diff --git a/examples/aot/matmul_static_singlecore/compile.sh b/examples/aot/simple_static/matmul_static_singlecore/compile.sh similarity index 100% rename from examples/aot/matmul_static_singlecore/compile.sh rename to examples/aot/simple_static/matmul_static_singlecore/compile.sh diff --git a/examples/aot/matmul_static_singlecore/matmul_builder.py b/examples/aot/simple_static/matmul_static_singlecore/matmul_builder.py similarity index 100% rename from examples/aot/matmul_static_singlecore/matmul_builder.py rename to examples/aot/simple_static/matmul_static_singlecore/matmul_builder.py diff --git a/examples/aot/matmul_static_singlecore/run_matmul.py b/examples/aot/simple_static/matmul_static_singlecore/run_matmul.py similarity index 100% rename from examples/aot/matmul_static_singlecore/run_matmul.py rename to examples/aot/simple_static/matmul_static_singlecore/run_matmul.py From 710a552b7901079f1fe9309567f67fcd10d678c3 Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 07:35:23 +0100 Subject: [PATCH 29/53] run `pre-commit run --all-files` --- .pre-commit-config.yaml | 18 +++ .../geglu_dynamic_multicore/geglu_builder.py | 4 +- .../relu_dynamic_multicore/relu_builder.py | 13 ++- .../relu_dynamic_multicore/run_relu.py | 26 +++-- .../matmul_builder.py | 72 +++++++++--- .../matmul_dynbatch_multicore/run_matmul.py | 6 +- .../matmul_dsl.py | 67 +++++++---- .../run_matmul.py | 91 ++++++++------- .../matmul_dynbatch_multicore_opt/.gitignore | 2 +- .../matmul_builder.py | 18 ++- .../run_matmul.py | 60 +++++----- .../add_dynamic_multicore/.gitignore | 2 +- .../add_dynamic_multicore/add_builder.py | 15 ++- .../add_double_builder.py | 15 ++- .../add_dynamic_multicore/bench_add.py | 4 +- .../add_dynamic_multicore/run_add.py | 24 ++-- 
.../aot/fast_hadamard/hadamard_builder.py | 22 ++-- examples/aot/fast_hadamard/plot_perf.py | 1 + .../matmul_optimization_guide/bench_matmul.py | 108 ++++++++++++++---- .../matmul_optimization_guide/common_utils.py | 20 +++- .../experimental/bench_matmul.py | 26 +++-- .../experimental/matmul_builder.py | 106 +++++++++++++---- .../mamtul_optim_guide_zh.md | 2 +- .../matmul_optim_guide.md | 4 +- .../matmul_optimization_guide/run_matmul.py | 18 +-- .../step1_baseline.py | 30 ++++- .../step1_baseline_numpy_sim.py | 9 +- .../step2_doublebuffer.py | 24 +++- .../step3_swizzle.py | 29 ++++- .../step3_swizzle_numpy_sim.py | 4 +- .../step4_manual_pipelining.py | 33 ++++-- examples/aot/print_tile/print_builder.py | 27 +++-- examples/aot/print_tile/run_print.py | 16 ++- .../add_static_multicore/.gitignore | 2 +- .../add_static_multicore/add_builder.py | 27 +++-- .../add_static_multicore/run_add.py | 12 +- .../matmul_builder.py | 38 ++++-- .../matmul_static_singlecore/run_matmul.py | 12 +- examples/jit/add_dynamic_multicore/run_add.py | 24 +++- .../jit/add_static_multicore/run_add_1d.py | 28 +++-- .../jit/add_static_multicore/run_add_2d.py | 28 +++-- .../run_batch_matmul.py | 42 +++++-- examples/validate_all_examples.py | 4 +- ptodsl/api/pto_general.py | 14 +-- ptodsl/api/scalar.py | 4 +- ptodsl/api/tile.py | 14 ++- ptodsl/api/type_def.py | 12 +- ptodsl/compiler/ir.py | 4 +- ptodsl/compiler/jit.py | 12 +- ptodsl/utils/bench.py | 4 +- tests/frontend/test_add_dynamic_ir.py | 39 +++++-- tests/frontend/test_add_ir.py | 30 +++-- tests/frontend/test_caller_gen.py | 5 +- tests/frontend/test_matmul_dynamic_ir.py | 100 ++++++++++++---- .../README.md | 2 +- .../binary_builder.py | 4 +- .../test_binary_builder.py | 4 +- tests/npu/expand_dynamic_multicore/caller.py | 1 - .../expand_builder.py | 12 +- tests/npu/expand_dynamic_multicore/gen_ir.py | 4 +- .../expand_dynamic_multicore/test_expand.py | 5 +- .../test_gather_dynamic.py | 10 +- tests/npu/reduce_dynamic_multicore/caller.py | 2 +- 
tests/npu/reduce_dynamic_multicore/compile.sh | 2 +- tests/npu/reduce_dynamic_multicore/gen_ir.py | 2 +- .../reduce_builder.py | 14 +-- .../reduce_dynamic_multicore/test_reduce.py | 11 +- 67 files changed, 997 insertions(+), 447 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..91bda22c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-yaml + - id: check-json + - id: check-merge-conflict + - id: check-added-large-files + - id: check-toml + - id: detect-private-key + - id: check-ast +- repo: https://github.com/psf/black + rev: 25.12.0 + hooks: + - id: black diff --git a/examples/aot/activations/geglu_dynamic_multicore/geglu_builder.py b/examples/aot/activations/geglu_dynamic_multicore/geglu_builder.py index e5e862ce..7d1e88b3 100644 --- a/examples/aot/activations/geglu_dynamic_multicore/geglu_builder.py +++ b/examples/aot/activations/geglu_dynamic_multicore/geglu_builder.py @@ -86,7 +86,9 @@ def _kernel( num_blocks = pto.get_block_num() vid = s.index_cast(cid * sub_bnum + sub_bid) # vector core index - num_cores = s.index_cast(num_blocks * sub_bnum) # number of vector cores + num_cores = s.index_cast( + num_blocks * sub_bnum + ) # number of vector cores # Distribute rows across cores (row-level parallelism). 
rows_per_core = s.ceil_div(batch, num_cores) diff --git a/examples/aot/activations/relu_dynamic_multicore/relu_builder.py b/examples/aot/activations/relu_dynamic_multicore/relu_builder.py index a797d06f..3e84659b 100644 --- a/examples/aot/activations/relu_dynamic_multicore/relu_builder.py +++ b/examples/aot/activations/relu_dynamic_multicore/relu_builder.py @@ -32,7 +32,9 @@ def meta_data(): const = s.const @to_ir_module(meta_data=meta_data) - def sync_kernel_dyn(arg0: "ptr_type", arg1: "ptr_type", argN: "index_dtype") -> None: + def sync_kernel_dyn( + arg0: "ptr_type", arg1: "ptr_type", argN: "index_dtype" + ) -> None: with pto.vector_section(): c0 = const(0) c1 = const(1) @@ -53,8 +55,12 @@ def sync_kernel_dyn(arg0: "ptr_type", arg1: "ptr_type", argN: "index_dtype") -> num_tiles = s.ceil_div(core_len, c_tile_w) # GM tensors shape N with stride 1. - tv0 = pto.as_tensor(tensor_type, ptr=arg0, shape=[total_elements], strides=[c1]) - tv1 = pto.as_tensor(tensor_type, ptr=arg1, shape=[total_elements], strides=[c1]) + tv0 = pto.as_tensor( + tensor_type, ptr=arg0, shape=[total_elements], strides=[c1] + ) + tv1 = pto.as_tensor( + tensor_type, ptr=arg1, shape=[total_elements], strides=[c1] + ) for i in pto.range(c0, num_tiles, c1): offset_tile = i * c_tile_w @@ -87,5 +93,6 @@ def sync_kernel_dyn(arg0: "ptr_type", arg1: "ptr_type", argN: "index_dtype") -> return sync_kernel_dyn + if __name__ == "__main__": print(build()) diff --git a/examples/aot/activations/relu_dynamic_multicore/run_relu.py b/examples/aot/activations/relu_dynamic_multicore/run_relu.py index 99465bb2..9281ea50 100644 --- a/examples/aot/activations/relu_dynamic_multicore/run_relu.py +++ b/examples/aot/activations/relu_dynamic_multicore/run_relu.py @@ -35,7 +35,7 @@ def load_lib(lib_path, block_dim, check_type=True): def relu_func(x, y, n, block_dim=block_dim, stream_ptr=None): if stream_ptr is None: - stream_ptr= torch.npu.current_stream()._as_parameter_ + stream_ptr = 
torch.npu.current_stream()._as_parameter_ lib.call_kernel( block_dim, @@ -54,13 +54,12 @@ def test_relu(verbose=True): torch.npu.set_device(device) dtype = torch.float32 - # allocate a bigger buffer than the actual number of elements to test the padding behavior shape = [1, 2 * 128] for BLOCK_DIM in range(1, 21): relu_kernel = load_lib("relu_lib.so", block_dim=BLOCK_DIM) - print(BLOCK_DIM) - for num_elements in [3,7,13,97,143, 2*128]: + print(BLOCK_DIM) + for num_elements in [3, 7, 13, 97, 143, 2 * 128]: x = torch.rand(shape, device=device, dtype=dtype) - 0.5 y = torch.full(shape, -10, device=device, dtype=dtype) relu_kernel(x, y, n=num_elements) @@ -73,17 +72,22 @@ def test_relu(verbose=True): step = 1 for i in range(0, shape[0]): for j in range(0, shape[1], step): - if correct[i, j:j+step].all(): - print('X', end='') + if correct[i, j : j + step].all(): + print("X", end="") else: - print('.', end='') + print(".", end="") if j == num_elements - 1: - print('|', end='') - print('|') + print("|", end="") + print("|") - torch.testing.assert_close(y.flatten()[:num_elements], y_ref.flatten()[:num_elements]) + torch.testing.assert_close( + y.flatten()[:num_elements], y_ref.flatten()[:num_elements] + ) # Make sure we didn't write past the end of the buffer - torch.testing.assert_close(y.flatten()[num_elements:], torch.full_like(y.flatten()[num_elements:], -10)) + torch.testing.assert_close( + y.flatten()[num_elements:], + torch.full_like(y.flatten()[num_elements:], -10), + ) print(f"RELU test pass for shape {shape}! 
using {BLOCK_DIM} cores") diff --git a/examples/aot/batch_matmul/matmul_dynbatch_multicore/matmul_builder.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore/matmul_builder.py index 730190d7..28015228 100644 --- a/examples/aot/batch_matmul/matmul_dynbatch_multicore/matmul_builder.py +++ b/examples/aot/batch_matmul/matmul_dynbatch_multicore/matmul_builder.py @@ -29,14 +29,26 @@ def meta_data(): tile_view_out = pto.SubTensorType(shape=[M, N], dtype=dtype) tile_view_bias = pto.SubTensorType(shape=[1, N], dtype=dtype) - tile_buf_aMat = pto.TileBufType(shape=[M, BASEK], dtype=dtype, memory_space="MAT") - tile_buf_bMat = pto.TileBufType(shape=[BASEK, N], dtype=dtype, memory_space="MAT") - tile_buf_biasData = pto.TileBufType(shape=[1, N], dtype=dtype, memory_space="MAT") - - tile_buf_aTile = pto.TileBufType(shape=[M, BASEK], dtype=dtype, memory_space="LEFT") - tile_buf_bTile = pto.TileBufType(shape=[BASEK, N], dtype=dtype, memory_space="RIGHT") + tile_buf_aMat = pto.TileBufType( + shape=[M, BASEK], dtype=dtype, memory_space="MAT" + ) + tile_buf_bMat = pto.TileBufType( + shape=[BASEK, N], dtype=dtype, memory_space="MAT" + ) + tile_buf_biasData = pto.TileBufType( + shape=[1, N], dtype=dtype, memory_space="MAT" + ) + + tile_buf_aTile = pto.TileBufType( + shape=[M, BASEK], dtype=dtype, memory_space="LEFT" + ) + tile_buf_bTile = pto.TileBufType( + shape=[BASEK, N], dtype=dtype, memory_space="RIGHT" + ) tile_buf_cTile = pto.TileBufType(shape=[M, N], dtype=dtype, memory_space="ACC") - tile_buf_biasTile = pto.TileBufType(shape=[1, N], dtype=dtype, memory_space="BIAS") + tile_buf_biasTile = pto.TileBufType( + shape=[1, N], dtype=dtype, memory_space="BIAS" + ) return { "ptr_type": ptr_dtype, @@ -88,10 +100,18 @@ def RunTMATMULSplitK( b_end_unclamped = b_start + batches_per_core b_end = s.min_u(b_end_unclamped, batch) - tvA = pto.as_tensor(tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1]) - tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) - 
tvOut = pto.as_tensor(tensor_type, ptr=out_ptr, shape=[cBM, cN], strides=[cN, c1]) - tvBias = pto.as_tensor(tensor_type, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1]) + tvA = pto.as_tensor( + tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1] + ) + tvB = pto.as_tensor( + tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1] + ) + tvOut = pto.as_tensor( + tensor_type, ptr=out_ptr, shape=[cBM, cN], strides=[cN, c1] + ) + tvBias = pto.as_tensor( + tensor_type, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1] + ) aMatTile = pto.alloc_tile(tile_buf_aMat) bMatTile = pto.alloc_tile(tile_buf_bMat) @@ -106,9 +126,24 @@ def RunTMATMULSplitK( for i in pto.range(c0, cIter, c1): kOff = i * cBASEK - svA = pto.slice_view(tile_view_a, source=tvA, offsets=[row_off, kOff], sizes=[cTileM, cBASEK]) - svB = pto.slice_view(tile_view_b, source=tvB, offsets=[kOff, c0], sizes=[cBASEK, cTileN]) - svBias = pto.slice_view(tile_view_bias, source=tvBias, offsets=[c0, c0], sizes=[c1, cTileN]) + svA = pto.slice_view( + tile_view_a, + source=tvA, + offsets=[row_off, kOff], + sizes=[cTileM, cBASEK], + ) + svB = pto.slice_view( + tile_view_b, + source=tvB, + offsets=[kOff, c0], + sizes=[cBASEK, cTileN], + ) + svBias = pto.slice_view( + tile_view_bias, + source=tvBias, + offsets=[c0, c0], + sizes=[c1, cTileN], + ) pto.load(svA, aMatTile) pto.load(svB, bMatTile) @@ -142,7 +177,12 @@ def _first_iter(): pto.record_wait_pair("MATMUL", "LOAD", event_id=0) pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) - svOut = pto.slice_view(tile_view_out, source=tvOut, offsets=[row_off, c0], sizes=[cTileM, cTileN]) + svOut = pto.slice_view( + tile_view_out, + source=tvOut, + offsets=[row_off, c0], + sizes=[cTileM, cTileN], + ) pto.store(cTile, svOut) pto.record_wait_pair("STORE_ACC", "MATMUL", event_id=0) @@ -150,4 +190,4 @@ def _first_iter(): if __name__ == "__main__": - print(build()) \ No newline at end of file + print(build()) diff --git 
a/examples/aot/batch_matmul/matmul_dynbatch_multicore/run_matmul.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore/run_matmul.py index b4c8d79e..197087a1 100644 --- a/examples/aot/batch_matmul/matmul_dynbatch_multicore/run_matmul.py +++ b/examples/aot/batch_matmul/matmul_dynbatch_multicore/run_matmul.py @@ -11,11 +11,7 @@ def torch_to_ctypes(tensor): def load_lib(lib_path): lib = ctypes.CDLL(lib_path) - def matmul_func( - c, a, b, batch_size, - block_dim, - stream_ptr=None - ): + def matmul_func(c, a, b, batch_size, block_dim, stream_ptr=None): if stream_ptr is None: stream_ptr = torch.npu.current_stream()._as_parameter_ lib.call_kernel( diff --git a/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/matmul_dsl.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/matmul_dsl.py index 363637b7..8ec0dfc9 100644 --- a/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/matmul_dsl.py +++ b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/matmul_dsl.py @@ -21,14 +21,17 @@ def meta_data(): tile_buf_aMat = pto.TileBufType(shape=[M, K], dtype=dtype, memory_space="MAT") tile_buf_bMat = pto.TileBufType(shape=[K, N], dtype=dtype, memory_space="MAT") tile_buf_aTile = pto.TileBufType(shape=[M, K], dtype=dtype, memory_space="LEFT") - tile_buf_bTile = pto.TileBufType(shape=[K, N], dtype=dtype, memory_space="RIGHT") - tile_buf_cTile = pto.TileBufType(shape=[M, N], dtype=dtype_acc_tile, memory_space="ACC") + tile_buf_bTile = pto.TileBufType( + shape=[K, N], dtype=dtype, memory_space="RIGHT" + ) + tile_buf_cTile = pto.TileBufType( + shape=[M, N], dtype=dtype_acc_tile, memory_space="ACC" + ) # TODO: Get rid of this? 
return locals() const = s.const - # Until we have set_dyn_flag with event_id as SSA values # event_id can be dynamic SSA value # https://github.com/zhangstevenunity/PTOAS/pull/176 @@ -36,15 +39,15 @@ def record_event(src, dst, event_id): pto.cond( event_id == const(0), lambda: pto.record_event(src, dst, event_id=0), - lambda: pto.record_event(src, dst, event_id=1) + lambda: pto.record_event(src, dst, event_id=1), ) - + def wait_event(src, dst, event_id): pto.cond( event_id == const(0), lambda: pto.wait_event(src, dst, event_id=0), - lambda: pto.wait_event(src, dst, event_id=1) - ) + lambda: pto.wait_event(src, dst, event_id=1), + ) @to_ir_module(meta_data=meta_data) def RunTMATMULSplitK( @@ -74,9 +77,21 @@ def RunTMATMULSplitK( # TODO: if no batched assigned to this core, early return - tvA = pto.as_tensor(tensor_type3d, ptr=a_ptr, shape=[batch, cM, cK], strides=[cK*cM, cK, c1]) - tvC = pto.as_tensor(tensor_type3d, ptr=out_ptr, shape=[batch, cM, cN], strides=[cM*cN, cN, c1]) - tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) + tvA = pto.as_tensor( + tensor_type3d, + ptr=a_ptr, + shape=[batch, cM, cK], + strides=[cK * cM, cK, c1], + ) + tvC = pto.as_tensor( + tensor_type3d, + ptr=out_ptr, + shape=[batch, cM, cN], + strides=[cM * cN, cN, c1], + ) + tvB = pto.as_tensor( + tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1] + ) # TODO: pre-fetch more than two tiles into L1 NUM_BUFFERS = 2 @@ -88,15 +103,18 @@ def RunTMATMULSplitK( bTile = pto.alloc_tile(tile_buf_bTile) # Put B in L0B - svB = pto.slice_view(tile_view_b, source=tvB, offsets=[c0, c0], sizes=[cK, cN]) + svB = pto.slice_view( + tile_view_b, source=tvB, offsets=[c0, c0], sizes=[cK, cN] + ) pto.load(svB, bMatTile) pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) tile.mov(bMatTile, bTile) # TODO: wait here so we can use full l1 memory later for A. 
- # load in the first tile from GM->L1 - svA = pto.slice_view(tile_view_a, source=tvA, offsets=[b_start, c0, c0], sizes=[c1, cM, cK]) + svA = pto.slice_view( + tile_view_a, source=tvA, offsets=[b_start, c0, c0], sizes=[c1, cM, cK] + ) curr = c1 - (b_start % c2) pto.cond( curr == c1, @@ -104,7 +122,7 @@ def RunTMATMULSplitK( lambda: pto.load(svA, aMatTiles[1]), ) record_event("LOAD", "MOV_M2L", event_id=curr) - + # TODO: fix wait events if batch size is 1/2 # signal to LOAD that L1 can be overwritten pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1]) @@ -115,8 +133,15 @@ def RunTMATMULSplitK( for b_idx in pto.range(b_start, b_end, c1): curr = b_idx % c2 - svA = pto.slice_view(tile_view_a, source=tvA, offsets=[b_idx+c1, c0, c0], sizes=[c1, cM, cK]) - svC = pto.slice_view(tile_view_c, source=tvC, offsets=[b_idx, c0, c0], sizes=[c1, cM, cN]) + svA = pto.slice_view( + tile_view_a, + source=tvA, + offsets=[b_idx + c1, c0, c0], + sizes=[c1, cM, cK], + ) + svC = pto.slice_view( + tile_view_c, source=tvC, offsets=[b_idx, c0, c0], sizes=[c1, cM, cN] + ) ########## Load tile A for iteration i+1 from GM -> L1 wait_event("MOV_M2L", "LOAD", event_id=curr) @@ -124,24 +149,22 @@ def RunTMATMULSplitK( pto.cond( curr == c1, lambda: pto.load(svA, aMatTiles[0]), - lambda: pto.load(svA, aMatTiles[1]) + lambda: pto.load(svA, aMatTiles[1]), ) record_event("LOAD", "MOV_M2L", event_id=curr) - ########## Move A1 and A2 into L0A wait_event("LOAD", "MOV_M2L", event_id=c1 - curr) wait_event("MATMUL", "MOV_M2L", event_id=curr) pto.cond( curr == c0, lambda: tile.mov(aMatTiles[0], aTiles[0]), - lambda: tile.mov(aMatTiles[1], aTiles[1]) + lambda: tile.mov(aMatTiles[1], aTiles[1]), ) with pto.if_context(b_idx + c2 < b_end): record_event("MOV_M2L", "LOAD", event_id=curr) record_event("MOV_M2L", "MATMUL", event_id=curr) - ########## Perform matmul wait_event("MOV_M2L", "MATMUL", event_id=curr) wait_event("STORE_ACC", "MATMUL", event_id=curr) @@ -154,7 +177,6 @@ def RunTMATMULSplitK( with 
pto.if_context(b_idx + c2 < b_end): record_event("MATMUL", "MOV_M2L", event_id=curr) - ######### Store wait_event("MATMUL", "STORE_ACC", event_id=curr) pto.cond( @@ -164,9 +186,8 @@ def RunTMATMULSplitK( ) with pto.if_context(b_idx + c2 < b_end): record_event("STORE_ACC", "MATMUL", event_id=curr) - - pto.barrier('LOAD') + pto.barrier("LOAD") return RunTMATMULSplitK diff --git a/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/run_matmul.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/run_matmul.py index 0b5be8a9..cdb0929a 100644 --- a/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/run_matmul.py +++ b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/run_matmul.py @@ -34,7 +34,7 @@ def benchmark( flops: int | None = None, io_bytes: int | None = None, ) -> dict: - avg_s = do_bench(fn, unit='s', flush_cache=True) + avg_s = do_bench(fn, unit="s", flush_cache=True) stats = {"avg_ms": avg_s * 1e3} if flops is not None: stats["tflops"] = (flops / avg_s) / 1e12 @@ -55,11 +55,7 @@ def print_benchmark(stats: dict) -> None: def load_lib(lib_path): lib = ctypes.CDLL(lib_path) - def matmul_func( - c, a, b, batch_size, - block_dim, - stream_ptr=None - ): + def matmul_func(c, a, b, batch_size, block_dim, stream_ptr=None): if stream_ptr is None: stream_ptr = torch.npu.current_stream()._as_parameter_ lib.call_kernel( @@ -76,6 +72,7 @@ def matmul_func( def plot_benchmark(): import matplotlib.pyplot as plt + device = get_test_device() torch.set_default_device(device) torch.npu.set_device(device) @@ -86,7 +83,7 @@ def plot_benchmark(): pto_results, torch_results, pto2_results, pto3_results = [], [], [], [] m, k, n = 128, 128, 128 - batches = list(range(24*2, 8000, 24*2)) + batches = list(range(24 * 2, 8000, 24 * 2)) blk = [24, 1, 6] for i in batches: bs = i @@ -99,50 +96,60 @@ def plot_benchmark(): torch.npu.synchronize() c_ref = torch.matmul(a, b) diff = (c - c_ref).abs().max() - #assert diff <= 1e-5, diff + # assert diff <= 1e-5, diff if diff < 
1e-5: - print('.', end='') + print(".", end="") else: - print(f'failed at shape: {a.shape} with {diff}') - + print(f"failed at shape: {a.shape} with {diff}") + flops = matmul_flops(bs, m, k, n) io_bytes = matmul_io_bytes(a, b, c) # run a benchmark for warmup (else first iterations are off) benchmark(lambda: torch.matmul(a, b, out=c)) - torch_b = benchmark( lambda: torch.matmul(a, b, out=c), - flops=flops, io_bytes=io_bytes)['gbps'] - pto2 = benchmark( lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[1]), - flops=flops, io_bytes=io_bytes)['gbps'] - pto3 = benchmark( lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[2]), - flops=flops, io_bytes=io_bytes)['gbps'] - pto = benchmark( lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[0]), - flops=flops, io_bytes=io_bytes)['gbps'] + torch_b = benchmark( + lambda: torch.matmul(a, b, out=c), flops=flops, io_bytes=io_bytes + )["gbps"] + pto2 = benchmark( + lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[1]), + flops=flops, + io_bytes=io_bytes, + )["gbps"] + pto3 = benchmark( + lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[2]), + flops=flops, + io_bytes=io_bytes, + )["gbps"] + pto = benchmark( + lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk[0]), + flops=flops, + io_bytes=io_bytes, + )["gbps"] pto_results.append(pto) pto2_results.append(pto2) pto3_results.append(pto3) torch_results.append(torch_b) print() - rel_diff = [our/their for our, their in zip(pto_results, torch_results)] + rel_diff = [our / their for our, their in zip(pto_results, torch_results)] - fig, ax1 = plt.subplots(figsize=(8,5)) + fig, ax1 = plt.subplots(figsize=(8, 5)) - ax1.plot(batches, pto_results, '-', label=f'pto-dsl ({blk[0]} cores)') - ax1.plot(batches, pto2_results, '-', label=f'pto-dsl ({blk[1]} cores)') - ax1.plot(batches, pto3_results, '-', label=f'pto-dsl ({blk[2]} cores)') - ax1.plot(batches, torch_results, '-', label='torch.matmul (24 cores)') - ax1.set_xlabel('Batch size') - 
ax1.set_ylabel('Bandwidth (Read A+B write C) (GB/s)') - ax1.grid(True, linestyle='--', alpha=0.6) + ax1.plot(batches, pto_results, "-", label=f"pto-dsl ({blk[0]} cores)") + ax1.plot(batches, pto2_results, "-", label=f"pto-dsl ({blk[1]} cores)") + ax1.plot(batches, pto3_results, "-", label=f"pto-dsl ({blk[2]} cores)") + ax1.plot(batches, torch_results, "-", label="torch.matmul (24 cores)") + ax1.set_xlabel("Batch size") + ax1.set_ylabel("Bandwidth (Read A+B write C) (GB/s)") + ax1.grid(True, linestyle="--", alpha=0.6) ax2 = ax1.twinx() - ax2.plot(batches, rel_diff, '-', color='purple', label='pto-dsl / torch') - ax2.set_ylabel('Relative Performance (pto-dsl / torch)') - ax2.set_ylim(0.95*min(rel_diff),1.05*max(rel_diff)) - ax2.axhline(y=1, linestyle='--', linewidth=1.0) + ax2.plot(batches, rel_diff, "-", color="purple", label="pto-dsl / torch") + ax2.set_ylabel("Relative Performance (pto-dsl / torch)") + ax2.set_ylim(0.95 * min(rel_diff), 1.05 * max(rel_diff)) + ax2.axhline(y=1, linestyle="--", linewidth=1.0) - dt_str = {torch.float16: 'fp16', torch.float32: 'fp32'}[dtype] + dt_str = {torch.float16: "fp16", torch.float32: "fp32"}[dtype] plt.title( f"""pto-dsl kernel vs torch.matmul\n @<{b.shape[0]}, {b.shape[1]}, {dt_str}>=""" @@ -150,9 +157,9 @@ def plot_benchmark(): lines1, labels1 = ax1.get_legend_handles_labels() lines2, labels2 = ax2.get_legend_handles_labels() - ax1.legend(lines1 + lines2, labels1 + labels2, loc='best') + ax1.legend(lines1 + lines2, labels1 + labels2, loc="best") plt.tight_layout() - plt.savefig('dsl.png') + plt.savefig("dsl.png") def correctness_verify(): @@ -175,23 +182,21 @@ def correctness_verify(): torch.npu.synchronize() c_ref = torch.matmul(a, b) - diff = (c - c_ref).abs().max() - #assert diff <= 1e-5, diff + # assert diff <= 1e-5, diff if diff < 1e-5: - print('.', end='', flush=True) + print(".", end="", flush=True) else: - print(f'#cores={blk} failed at shape: {list(a.shape)} with error:{diff}') + print( + f"#cores={blk} failed at 
shape: {list(a.shape)} with error:{diff}" + ) print() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--benchmark", - dest="benchmark", - action="store_true", - help="Enable benchmarking" + "--benchmark", dest="benchmark", action="store_true", help="Enable benchmarking" ) args = parser.parse_args() correctness_verify() diff --git a/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/.gitignore b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/.gitignore index b9455c5b..7eac319a 100644 --- a/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/.gitignore +++ b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/.gitignore @@ -2,4 +2,4 @@ matmul.pto matmul.cpp matmul_kernel.so -*.png \ No newline at end of file +*.png diff --git a/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/matmul_builder.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/matmul_builder.py index 6dbfd8cd..b711f4d6 100644 --- a/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/matmul_builder.py +++ b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/matmul_builder.py @@ -29,7 +29,9 @@ def meta_data(): tile_buf_aMat = pto.TileBufType(shape=[M, K], dtype=dtype, memory_space="MAT") tile_buf_bMat = pto.TileBufType(shape=[K, N], dtype=dtype, memory_space="MAT") tile_buf_aTile = pto.TileBufType(shape=[M, K], dtype=dtype, memory_space="LEFT") - tile_buf_bTile = pto.TileBufType(shape=[K, N], dtype=dtype, memory_space="RIGHT") + tile_buf_bTile = pto.TileBufType( + shape=[K, N], dtype=dtype, memory_space="RIGHT" + ) tile_buf_cTile = pto.TileBufType(shape=[M, N], dtype=dtype, memory_space="ACC") return { @@ -89,9 +91,13 @@ def RunTMATMULSplitK( length = base + s.select(lt_rem, c1, c0) b_end = s.min_u(b_start + length, batch) - tvA = pto.as_tensor(tv_a, ptr=a_ptr, shape=[batch, cM, cK], strides=[cKM, cK, c1]) + tvA = pto.as_tensor( + tv_a, ptr=a_ptr, shape=[batch, cM, cK], strides=[cKM, cK, c1] + ) tvB = pto.as_tensor(tv_b, 
ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) - tvOut = pto.as_tensor(tv_out, ptr=out_ptr, shape=[batch, cM, cN], strides=[cMN, cN, c1]) + tvOut = pto.as_tensor( + tv_out, ptr=out_ptr, shape=[batch, cM, cN], strides=[cMN, cN, c1] + ) aMatTile = pto.alloc_tile(tile_buf_aMat) bMatTile = pto.alloc_tile(tile_buf_bMat) @@ -100,7 +106,9 @@ def RunTMATMULSplitK( cTile = pto.alloc_tile(tile_buf_cTile) # B is shared across batches: load once GM->L1->L0B. - svB = pto.slice_view(tile_view_b, source=tvB, offsets=[c0, c0], sizes=[cK, cTileN]) + svB = pto.slice_view( + tile_view_b, source=tvB, offsets=[c0, c0], sizes=[cK, cTileN] + ) pto.load(svB, bMatTile) pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) tile.mov(bMatTile, bTile) @@ -136,4 +144,4 @@ def RunTMATMULSplitK( if __name__ == "__main__": m = build() - print(m) \ No newline at end of file + print(m) diff --git a/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/run_matmul.py b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/run_matmul.py index 31c8ba78..58f8cb55 100644 --- a/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/run_matmul.py +++ b/examples/aot/batch_matmul/matmul_dynbatch_multicore_opt/run_matmul.py @@ -3,6 +3,7 @@ import torch import torch_npu from ptodsl.test_util import get_test_device + try: import matplotlib.pyplot as plt except ImportError: @@ -29,9 +30,7 @@ def matmul_io_bytes(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor) -> int: # Does not include cache effects or intermediate buffers. 
elt = _dtype_nbytes(a.dtype) return (a.numel() + b.numel() + c.numel()) * elt - #return (a.numel() + b.numel()) * elt - - + # return (a.numel() + b.numel()) * elt def benchmark( @@ -44,7 +43,7 @@ def benchmark( flops: int | None = None, io_bytes: int | None = None, ) -> dict: - avg_s = do_bench(fn, warmup_iters=warmup, benchmark_iters=iters, unit='s') + avg_s = do_bench(fn, warmup_iters=warmup, benchmark_iters=iters, unit="s") stats = {"name": name, "iters": iters, "avg_ms": avg_s * 1e3} if flops is not None: stats["tflops"] = (flops / avg_s) / 1e12 @@ -65,11 +64,7 @@ def print_benchmark(stats: dict) -> None: def load_lib(lib_path): lib = ctypes.CDLL(lib_path) - def matmul_func( - c, a, b, batch_size, - block_dim, - stream_ptr=None - ): + def matmul_func(c, a, b, batch_size, block_dim, stream_ptr=None): if stream_ptr is None: stream_ptr = torch.npu.current_stream()._as_parameter_ lib.call_kernel( @@ -84,8 +79,6 @@ def matmul_func( return matmul_func - - def plot_benchmark(): device = get_test_device() torch.set_default_device(device) @@ -98,7 +91,7 @@ def plot_benchmark(): matmul_func = load_lib("./matmul_kernel.so") # assume defined torch.manual_seed(0) - bs, m, k, n = 24*200, 128, 128, 128 + bs, m, k, n = 24 * 200, 128, 128, 128 for blk in blk_values: a = torch.rand((bs, m, k), device=device, dtype=dtype) b = torch.rand((k, n), device=device, dtype=dtype) @@ -109,19 +102,29 @@ def plot_benchmark(): torch.npu.synchronize() c_ref = torch.matmul(a, b) diff = (c - c_ref).abs().max() - assert diff <= 1e-5, diff + assert diff <= 1e-5, diff flops = matmul_flops(bs, m, k, n) io_bytes = matmul_io_bytes(a, b, c) - torch_b = benchmark("torch.matmul", - lambda: torch.matmul(a, b, out=c), - device=device, warmup=20, iters=20, - flops=flops, io_bytes=io_bytes)['gbps'] - pto = benchmark("custom_kernel", - lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk), - device=device, warmup=20, iters=20, - flops=flops, io_bytes=io_bytes)['gbps'] + torch_b = benchmark( + 
"torch.matmul", + lambda: torch.matmul(a, b, out=c), + device=device, + warmup=20, + iters=20, + flops=flops, + io_bytes=io_bytes, + )["gbps"] + pto = benchmark( + "custom_kernel", + lambda: matmul_func(c, a, b, batch_size=bs, block_dim=blk), + device=device, + warmup=20, + iters=20, + flops=flops, + io_bytes=io_bytes, + )["gbps"] pto_results.append(pto) torch_results.append(torch_b) @@ -136,20 +139,21 @@ def plot_benchmark(): return # plot results - plt.figure(figsize=(8,5)) - plt.plot(blk_values, pto_results, 'o-', label='mlir') - plt.plot(blk_values, torch_results, 's-', label='torch.matmul (all cores)') - plt.xlabel('Number of cores') - plt.ylabel('Bandwidth (Read A+B write C) (GB/s)') + plt.figure(figsize=(8, 5)) + plt.plot(blk_values, pto_results, "o-", label="mlir") + plt.plot(blk_values, torch_results, "s-", label="torch.matmul (all cores)") + plt.xlabel("Number of cores") + plt.ylabel("Bandwidth (Read A+B write C) (GB/s)") plt.title( f"""Benchmark: Custom Kernel vs torch.matmul\n A: {tuple(a.shape)} B: {tuple(b.shape)}, C: {tuple(c.shape)} \n A+B+C size: {total_mb:.1f} MB""" ) - plt.grid(True, linestyle='--', alpha=0.6) + plt.grid(True, linestyle="--", alpha=0.6) plt.legend() plt.tight_layout() - plt.savefig('our.png') + plt.savefig("our.png") + if __name__ == "__main__": plot_benchmark() diff --git a/examples/aot/elementwise/add_dynamic_multicore/.gitignore b/examples/aot/elementwise/add_dynamic_multicore/.gitignore index 34d860d5..1dde81a9 100644 --- a/examples/aot/elementwise/add_dynamic_multicore/.gitignore +++ b/examples/aot/elementwise/add_dynamic_multicore/.gitignore @@ -3,4 +3,4 @@ add.pto add_lib.so add_double.cpp add_double.pto -add_double_lib.so \ No newline at end of file +add_double_lib.so diff --git a/examples/aot/elementwise/add_dynamic_multicore/add_builder.py b/examples/aot/elementwise/add_dynamic_multicore/add_builder.py index c6e0d9e5..c4a67865 100644 --- a/examples/aot/elementwise/add_dynamic_multicore/add_builder.py +++ 
b/examples/aot/elementwise/add_dynamic_multicore/add_builder.py @@ -80,13 +80,22 @@ def vec_add_1d_dynamic( offset_global = tile_offset_global * c_tile sv0 = pto.slice_view( - subtensor_type, source=tv0, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv0, + offsets=[offset_global], + sizes=[c_tile], ) sv1 = pto.slice_view( - subtensor_type, source=tv1, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv1, + offsets=[offset_global], + sizes=[c_tile], ) sv2 = pto.slice_view( - subtensor_type, source=tv2, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv2, + offsets=[offset_global], + sizes=[c_tile], ) pto.load(sv0, tb0) diff --git a/examples/aot/elementwise/add_dynamic_multicore/add_double_builder.py b/examples/aot/elementwise/add_dynamic_multicore/add_double_builder.py index a2da0b60..022a18bd 100644 --- a/examples/aot/elementwise/add_dynamic_multicore/add_double_builder.py +++ b/examples/aot/elementwise/add_dynamic_multicore/add_double_builder.py @@ -85,13 +85,22 @@ def vec_add_1d_dynamic( offset_global = tile_offset_global * c_tile sv0 = pto.slice_view( - subtensor_type, source=tv0, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv0, + offsets=[offset_global], + sizes=[c_tile], ) sv1 = pto.slice_view( - subtensor_type, source=tv1, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv1, + offsets=[offset_global], + sizes=[c_tile], ) sv2 = pto.slice_view( - subtensor_type, source=tv2, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv2, + offsets=[offset_global], + sizes=[c_tile], ) with pto.if_context((i % c2) == c0, has_else=True) as branch: pto.load(sv0, tb0_ping) diff --git a/examples/aot/elementwise/add_dynamic_multicore/bench_add.py b/examples/aot/elementwise/add_dynamic_multicore/bench_add.py index 618cbca9..54f60dd4 100644 --- a/examples/aot/elementwise/add_dynamic_multicore/bench_add.py +++ 
b/examples/aot/elementwise/add_dynamic_multicore/bench_add.py @@ -24,7 +24,9 @@ def add_func(x, y, z, stream_ptr=None): return add_func -def bench_add(add_func, x, y, z, kernel_name="add_func", warmup_iters=5, benchmark_iters=50): +def bench_add( + add_func, x, y, z, kernel_name="add_func", warmup_iters=5, benchmark_iters=50 +): io_bytes = x.numel() * x.element_size() * 3 # Overwrite a large buffer between launches to reduce L2 cache reuse. cache = torch.empty((256 * 1024 * 1024,), dtype=torch.int8, device=x.device) diff --git a/examples/aot/elementwise/add_dynamic_multicore/run_add.py b/examples/aot/elementwise/add_dynamic_multicore/run_add.py index a18dbaaa..e2a3ab53 100644 --- a/examples/aot/elementwise/add_dynamic_multicore/run_add.py +++ b/examples/aot/elementwise/add_dynamic_multicore/run_add.py @@ -9,24 +9,16 @@ def torch_to_ctypes(tensor): def lib_to_func(lib): - def add_func( - x, - y, - z, - stream_ptr=None - ): + def add_func(x, y, z, stream_ptr=None): if stream_ptr is None: stream_ptr = torch.npu.current_stream()._as_parameter_ N = x.numel() lib.call_kernel( - stream_ptr, - torch_to_ctypes(x), - torch_to_ctypes(y), - torch_to_ctypes(z), - N + stream_ptr, torch_to_ctypes(x), torch_to_ctypes(y), torch_to_ctypes(z), N ) + return add_func @@ -42,7 +34,14 @@ def test_add(lib_path="./add_lib.so"): tile_size = 1024 # Keep shapes aligned to tile size, but vary tile counts so they are not # required to be multiples of `num_cores`. 
- tile_counts = [1, 7, num_cores - 1, num_cores + 3, 2 * num_cores + 7, 5 * num_cores - 5] + tile_counts = [ + 1, + 7, + num_cores - 1, + num_cores + 3, + 2 * num_cores + 7, + 5 * num_cores - 5, + ] shape_list = [tile_size * tiles for tiles in tile_counts] torch.manual_seed(0) @@ -60,6 +59,7 @@ def test_add(lib_path="./add_lib.so"): torch.testing.assert_close(z, z_ref) print(f"result equal for shape {shape}") + if __name__ == "__main__": test_add() test_add("./add_double_lib.so") diff --git a/examples/aot/fast_hadamard/hadamard_builder.py b/examples/aot/fast_hadamard/hadamard_builder.py index 032ff728..6d641a26 100644 --- a/examples/aot/fast_hadamard/hadamard_builder.py +++ b/examples/aot/fast_hadamard/hadamard_builder.py @@ -110,8 +110,12 @@ def process_rows(tb_row, tb_even, tb_odd, gm_offset, cur_samples): ) # Alias row halves inside UB row tile (no GM round-trip # per Hadamard iteration). - tb_first = tile.subset(tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE]) - tb_second = tile.subset(tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE]) + tb_first = tile.subset( + tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE] + ) + tb_second = tile.subset( + tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE] + ) pto.load(sv_row, tb_row) for _ in pto.range(c0, log2_n, c1): @@ -231,9 +235,7 @@ def process_rows( tile.sub(tb_even, tb_odd, tb_second) pto.barrier("VEC") - pto.record_wait_pair( - "VEC", "STORE_VEC", event_id=event_id - ) + pto.record_wait_pair("VEC", "STORE_VEC", event_id=event_id) pto.store(tb_row, sv_row) pto.record_event("STORE_VEC", "VEC", event_id=event_id) pto.record_event("VEC", "LOAD", event_id=event_id) @@ -254,18 +256,22 @@ def process_rows( use_ev0 = (chunk_i % c2) == c0 with pto.if_context(use_ev0, has_else=True) as branch: - process_rows(tb_row_0, tb_even_0, tb_odd_0, 0, gm_offset, cur_samples) + process_rows( + tb_row_0, tb_even_0, tb_odd_0, 0, gm_offset, cur_samples + ) with branch.else_context(): - process_rows(tb_row_1, tb_even_1, tb_odd_1, 1, gm_offset, 
cur_samples) + process_rows( + tb_row_1, tb_even_1, tb_odd_1, 1, gm_offset, cur_samples + ) for event_id in (0, 1): pto.wait_event("VEC", "LOAD", event_id=event_id) pto.wait_event("STORE_VEC", "VEC", event_id=event_id) - if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser() parser.add_argument( "--manual-sync", diff --git a/examples/aot/fast_hadamard/plot_perf.py b/examples/aot/fast_hadamard/plot_perf.py index 6a894fd7..3b66bc50 100644 --- a/examples/aot/fast_hadamard/plot_perf.py +++ b/examples/aot/fast_hadamard/plot_perf.py @@ -1,5 +1,6 @@ import os import csv + try: import matplotlib.pyplot as plt except ImportError: diff --git a/examples/aot/matmul_optimization_guide/bench_matmul.py b/examples/aot/matmul_optimization_guide/bench_matmul.py index 82fdc0ba..476ddb8f 100644 --- a/examples/aot/matmul_optimization_guide/bench_matmul.py +++ b/examples/aot/matmul_optimization_guide/bench_matmul.py @@ -108,10 +108,30 @@ def _maybe_plot(rows, plot_dir): legend_scale = 2.0 step_defs = [ - ("step1", "single_auto_noswizzle_tflops", "Step1 Kernel", "flops_step1_baseline.png"), - ("step2", "double_auto_noswizzle_tflops", "Step2 Kernel", "flops_step2_doublebuf.png"), - ("step3", "double_auto_swizzle_tflops", "Step3 Kernel", "flops_step3_swizzle.png"), - ("step4", "double_manual_swizzle_tflops", "Step4 Kernel", "flops_step4_manual_pipeline.png"), + ( + "step1", + "single_auto_noswizzle_tflops", + "Step1 Kernel", + "flops_step1_baseline.png", + ), + ( + "step2", + "double_auto_noswizzle_tflops", + "Step2 Kernel", + "flops_step2_doublebuf.png", + ), + ( + "step3", + "double_auto_swizzle_tflops", + "Step3 Kernel", + "flops_step3_swizzle.png", + ), + ( + "step4", + "double_manual_swizzle_tflops", + "Step4 Kernel", + "flops_step4_manual_pipeline.png", + ), ] for _, custom_key, custom_label, out_name in step_defs: @@ -121,8 +141,18 @@ def _maybe_plot(rows, plot_dir): base_label_size = ax.xaxis.label.get_size() chunk = [r for r in rows if r["n"] == n and 
r["k"] == k] if not chunk: - ax.set_title(f"TFLOPS vs M (N={n}, K={k})", fontsize=base_title_size * title_scale) - ax.text(0.5, 0.5, "No data", transform=ax.transAxes, ha="center", va="center") + ax.set_title( + f"TFLOPS vs M (N={n}, K={k})", + fontsize=base_title_size * title_scale, + ) + ax.text( + 0.5, + 0.5, + "No data", + transform=ax.transAxes, + ha="center", + va="center", + ) ax.set_xlabel("M", fontsize=base_label_size * axis_label_scale) ax.set_ylabel("TFLOPS", fontsize=base_label_size * axis_label_scale) ax.grid(alpha=0.25) @@ -149,7 +179,9 @@ def _maybe_plot(rows, plot_dir): color="#1f77b4", label=custom_label, ) - ax.set_title(f"TFLOPS vs M (N={n}, K={k})", fontsize=base_title_size * title_scale) + ax.set_title( + f"TFLOPS vs M (N={n}, K={k})", fontsize=base_title_size * title_scale + ) ax.set_xlabel("M", fontsize=base_label_size * axis_label_scale) ax.set_ylabel("TFLOPS", fontsize=base_label_size * axis_label_scale) ax.set_xlim(left=0) @@ -239,7 +271,9 @@ def main(): if not single_auto_noswizzle_lib.is_absolute(): single_auto_noswizzle_lib = base_dir / single_auto_noswizzle_lib if not double_auto_swizzle_lib.exists(): - raise FileNotFoundError(f"Double-buffer auto-sync swizzle library not found: {double_auto_swizzle_lib}") + raise FileNotFoundError( + f"Double-buffer auto-sync swizzle library not found: {double_auto_swizzle_lib}" + ) if not double_auto_noswizzle_lib.exists(): raise FileNotFoundError( f"Double-buffer auto-sync non-swizzle library not found: {double_auto_noswizzle_lib}" @@ -280,8 +314,14 @@ def main(): print(f"=== N={n}, K={k} ===") for m in m_list: alloc = args.warmup + args.repeat - a_list = [torch.randn(m, k, dtype=torch.float16, device=device) for _ in range(alloc)] - b_list = [torch.randn(n, k, dtype=torch.float16, device=device) for _ in range(alloc)] + a_list = [ + torch.randn(m, k, dtype=torch.float16, device=device) + for _ in range(alloc) + ] + b_list = [ + torch.randn(n, k, dtype=torch.float16, device=device) + for _ in 
range(alloc) + ] double_auto_swizzle_us = _time_us( double_auto_swizzle_mm, a_list, b_list, args.warmup, args.repeat @@ -311,11 +351,17 @@ def main(): torch_matmul_tflops = flops / torch_matmul_us / 1e6 # Step 1: buffering effect (double-buffer vs single-buffer, both non-swizzle auto-sync). - step1_double_vs_single = double_auto_noswizzle_tflops / single_auto_noswizzle_tflops + step1_double_vs_single = ( + double_auto_noswizzle_tflops / single_auto_noswizzle_tflops + ) # Step 2: swizzle effect (double-buffer auto-sync swizzle vs non-swizzle). - step2_swizzle_vs_noswizzle = double_auto_swizzle_tflops / double_auto_noswizzle_tflops + step2_swizzle_vs_noswizzle = ( + double_auto_swizzle_tflops / double_auto_noswizzle_tflops + ) # Step 3: manual-sync effect (double-buffer swizzle manual-sync vs auto-sync). - step3_manual_vs_auto = double_manual_swizzle_tflops / double_auto_swizzle_tflops + step3_manual_vs_auto = ( + double_manual_swizzle_tflops / double_auto_swizzle_tflops + ) ratios_step1_double_vs_single_noswizzle.append(step1_double_vs_single) ratios_step2_swizzle_vs_noswizzle.append(step2_swizzle_vs_noswizzle) @@ -346,13 +392,19 @@ def main(): ) print("") - avg_step1 = sum(ratios_step1_double_vs_single_noswizzle) / len(ratios_step1_double_vs_single_noswizzle) + avg_step1 = sum(ratios_step1_double_vs_single_noswizzle) / len( + ratios_step1_double_vs_single_noswizzle + ) min_step1 = min(ratios_step1_double_vs_single_noswizzle) max_step1 = max(ratios_step1_double_vs_single_noswizzle) - avg_step2 = sum(ratios_step2_swizzle_vs_noswizzle) / len(ratios_step2_swizzle_vs_noswizzle) + avg_step2 = sum(ratios_step2_swizzle_vs_noswizzle) / len( + ratios_step2_swizzle_vs_noswizzle + ) min_step2 = min(ratios_step2_swizzle_vs_noswizzle) max_step2 = max(ratios_step2_swizzle_vs_noswizzle) - avg_step3 = sum(ratios_step3_manual_vs_auto_swizzle) / len(ratios_step3_manual_vs_auto_swizzle) + avg_step3 = sum(ratios_step3_manual_vs_auto_swizzle) / len( + ratios_step3_manual_vs_auto_swizzle 
+ ) min_step3 = min(ratios_step3_manual_vs_auto_swizzle) max_step3 = max(ratios_step3_manual_vs_auto_swizzle) @@ -362,13 +414,25 @@ def main(): print(f"min FLOP ratio(double_noswizzle_auto/single_noswizzle): {min_step1:.3f}x") print(f"max FLOP ratio(double_noswizzle_auto/single_noswizzle): {max_step1:.3f}x") print("Step2 (swizzle speedup, both double-buffer auto-sync):") - print(f"avg FLOP ratio(double_swizzle_auto/double_noswizzle_auto): {avg_step2:.3f}x") - print(f"min FLOP ratio(double_swizzle_auto/double_noswizzle_auto): {min_step2:.3f}x") - print(f"max FLOP ratio(double_swizzle_auto/double_noswizzle_auto): {max_step2:.3f}x") + print( + f"avg FLOP ratio(double_swizzle_auto/double_noswizzle_auto): {avg_step2:.3f}x" + ) + print( + f"min FLOP ratio(double_swizzle_auto/double_noswizzle_auto): {min_step2:.3f}x" + ) + print( + f"max FLOP ratio(double_swizzle_auto/double_noswizzle_auto): {max_step2:.3f}x" + ) print("Step3 (manual-sync speedup, both double-buffer swizzle):") - print(f"avg FLOP ratio(double_swizzle_manual/double_swizzle_auto): {avg_step3:.3f}x") - print(f"min FLOP ratio(double_swizzle_manual/double_swizzle_auto): {min_step3:.3f}x") - print(f"max FLOP ratio(double_swizzle_manual/double_swizzle_auto): {max_step3:.3f}x") + print( + f"avg FLOP ratio(double_swizzle_manual/double_swizzle_auto): {avg_step3:.3f}x" + ) + print( + f"min FLOP ratio(double_swizzle_manual/double_swizzle_auto): {min_step3:.3f}x" + ) + print( + f"max FLOP ratio(double_swizzle_manual/double_swizzle_auto): {max_step3:.3f}x" + ) _maybe_plot(plot_rows, plot_dir) diff --git a/examples/aot/matmul_optimization_guide/common_utils.py b/examples/aot/matmul_optimization_guide/common_utils.py index 0b152cae..58d8b801 100644 --- a/examples/aot/matmul_optimization_guide/common_utils.py +++ b/examples/aot/matmul_optimization_guide/common_utils.py @@ -23,15 +23,25 @@ def meta_data(): tile_view_b = pto.SubTensorType(shape=[K_TILE, N_FULL], dtype=dtype) tile_view_c = pto.SubTensorType(shape=[M_TILE, 
N_FULL], dtype=dtype) - b_l1_cfg = pto.TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=512) + b_l1_cfg = pto.TileBufConfig( + blayout="RowMajor", slayout="ColMajor", s_fractal_size=512 + ) - tile_buf_a_l1 = pto.TileBufType(shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT") + tile_buf_a_l1 = pto.TileBufType( + shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT" + ) tile_buf_b_l1 = pto.TileBufType( shape=[K_TILE, N_FULL], dtype=dtype, memory_space="MAT", config=b_l1_cfg ) - tile_buf_a_l0 = pto.TileBufType(shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT") - tile_buf_b_l0 = pto.TileBufType(shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT") - tile_buf_c = pto.TileBufType(shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC") + tile_buf_a_l0 = pto.TileBufType( + shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT" + ) + tile_buf_b_l0 = pto.TileBufType( + shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT" + ) + tile_buf_c = pto.TileBufType( + shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC" + ) return { "ptr_type": ptr_type, diff --git a/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py b/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py index 695cc971..1c021b5e 100644 --- a/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py +++ b/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py @@ -208,7 +208,9 @@ def _maybe_plot(rows, plot_dir): linear_by_m = {} for m in m_values: candidates = [r for r in chunk if r["m"] == m] - linear_by_m[m] = sum(r["linear_tflops"] for r in candidates) / len(candidates) + linear_by_m[m] = sum(r["linear_tflops"] for r in candidates) / len( + candidates + ) plt.figure(figsize=(9, 5)) plt.plot( @@ -292,9 +294,7 @@ def _maybe_plot(rows, plot_dir): alpha = 1.0 if is_baseline else 0.7 color = cmap(idx % 10) base_label = ( - "no-swizzle baseline" - if is_baseline - else f"d={direction}, c={count}" + 
"no-swizzle baseline" if is_baseline else f"d={direction}, c={count}" ) speedup_label = f"speedup {base_label}" @@ -358,12 +358,20 @@ def main(): for n, k in SHAPES_NK: for m in m_list: alloc = args.warmup + args.repeat - a_list = [torch.randn(m, k, dtype=torch.float16, device=device) for _ in range(alloc)] - b_list = [torch.randn(n, k, dtype=torch.float16, device=device) for _ in range(alloc)] + a_list = [ + torch.randn(m, k, dtype=torch.float16, device=device) + for _ in range(alloc) + ] + b_list = [ + torch.randn(n, k, dtype=torch.float16, device=device) + for _ in range(alloc) + ] c_ref = F.linear(a_list[0], b_list[0]) torch.npu.synchronize() - linear_time_us = _time_fn(F.linear, a_list, b_list, args.warmup, args.repeat) + linear_time_us = _time_fn( + F.linear, a_list, b_list, args.warmup, args.repeat + ) flops = 2.0 * m * n * k linear_tflops = flops / linear_time_us / 1e6 @@ -389,7 +397,9 @@ def _custom(a, b, _d=swizzle_direction, _c=swizzle_count): torch.npu.synchronize() max_absdiff = float((c - c_ref).abs().max().item()) mean_absdiff = float((c - c_ref).abs().mean().item()) - custom_time_us = _time_fn(_custom, a_list, b_list, args.warmup, args.repeat) + custom_time_us = _time_fn( + _custom, a_list, b_list, args.warmup, args.repeat + ) custom_tflops = flops / custom_time_us / 1e6 flops_fraction_vs_linear = custom_tflops / linear_tflops diff --git a/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py b/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py index a3c1eb70..b6aff785 100644 --- a/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py +++ b/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py @@ -3,6 +3,7 @@ const = s.const + def build(): M_TILE = 128 K_QTILE = 64 @@ -26,16 +27,34 @@ def meta_data(): tile_view_c_256 = pto.SubTensorType(shape=[M_TILE, N_FULL], dtype=dtype) tile_view_c_128 = pto.SubTensorType(shape=[M_TILE, N_HALF], dtype=dtype) - b_l1_cfg = 
pto.TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=512) + b_l1_cfg = pto.TileBufConfig( + blayout="RowMajor", slayout="ColMajor", s_fractal_size=512 + ) - tile_buf_a_l1 = pto.TileBufType(shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT") - tile_buf_b_l1_256 = pto.TileBufType(shape=[K_TILE, N_FULL], dtype=dtype, memory_space="MAT", config=b_l1_cfg) - tile_buf_b_l1_128 = pto.TileBufType(shape=[K_TILE, N_HALF], dtype=dtype, memory_space="MAT", config=b_l1_cfg) - tile_buf_a_l0 = pto.TileBufType(shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT") - tile_buf_b_l0_256 = pto.TileBufType(shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT") - tile_buf_b_l0_128 = pto.TileBufType(shape=[K_QTILE, N_HALF], dtype=dtype, memory_space="RIGHT") - tile_buf_c_256 = pto.TileBufType(shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC") - tile_buf_c_128 = pto.TileBufType(shape=[M_TILE, N_HALF], dtype=acc_dtype, memory_space="ACC") + tile_buf_a_l1 = pto.TileBufType( + shape=[M_TILE, K_DTILE], dtype=dtype, memory_space="MAT" + ) + tile_buf_b_l1_256 = pto.TileBufType( + shape=[K_TILE, N_FULL], dtype=dtype, memory_space="MAT", config=b_l1_cfg + ) + tile_buf_b_l1_128 = pto.TileBufType( + shape=[K_TILE, N_HALF], dtype=dtype, memory_space="MAT", config=b_l1_cfg + ) + tile_buf_a_l0 = pto.TileBufType( + shape=[M_TILE, K_QTILE], dtype=dtype, memory_space="LEFT" + ) + tile_buf_b_l0_256 = pto.TileBufType( + shape=[K_QTILE, N_FULL], dtype=dtype, memory_space="RIGHT" + ) + tile_buf_b_l0_128 = pto.TileBufType( + shape=[K_QTILE, N_HALF], dtype=dtype, memory_space="RIGHT" + ) + tile_buf_c_256 = pto.TileBufType( + shape=[M_TILE, N_FULL], dtype=acc_dtype, memory_space="ACC" + ) + tile_buf_c_128 = pto.TileBufType( + shape=[M_TILE, N_HALF], dtype=acc_dtype, memory_space="ACC" + ) return { "ptr_type": ptr_type, @@ -182,7 +201,9 @@ def level2_loop_k(curr_id, next_id, a_curr, a_next): pto.cond( is_first_k_tile, lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), 
- lambda: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + lambda: tile.matmul_acc( + c_l0, a_l0[ping], b_l0[ping], c_l0 + ), ) else: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) @@ -252,9 +273,19 @@ def matmul_kernel_ABt( core_loop = n_loop * m_loop k_dtile_num = k_total // c512 - tvA = pto.as_tensor(tv_a, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) - tvB = pto.as_tensor(tv_b, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") - tvC = pto.as_tensor(tv_c, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + tvA = pto.as_tensor( + tv_a, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1] + ) + tvB = pto.as_tensor( + tv_b, + ptr=b_ptr, + shape=[k_total, n_total], + strides=[c1, k_total], + layout="DN", + ) + tvC = pto.as_tensor( + tv_c, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1] + ) pto.record_event("MATMUL", "MOV_M2L", event_id=[0, 1]) pto.record_event("MOV_M2L", "LOAD", event_id=[0, 1, 2, 3]) @@ -262,22 +293,55 @@ def matmul_kernel_ABt( def level1_loop_mn(m_offset, n_offset, li): # TODO: make a simpler version that only uses full-tile (256) branch, and reduce the types needed in meta_data n_tile_size = s.select(n_offset + c256 > n_total, c128n, c256) - shared_args = [m_offset, n_offset, k_dtile_num, li, core_loop, bid, num_blocks, tvA, tvB, tvC] + shared_args = [ + m_offset, + n_offset, + k_dtile_num, + li, + core_loop, + bid, + num_blocks, + tvA, + tvB, + tvC, + ] with pto.if_context(n_tile_size == c256, has_else=True) as branch: level1_loop_mn_dynamic_tilesize( - N_FULL, tile_view_b_256, tile_view_c_256, tile_buf_b_l1_256, tile_buf_b_l0_256, tile_buf_c_256, *shared_args) + N_FULL, + tile_view_b_256, + tile_view_c_256, + tile_buf_b_l1_256, + tile_buf_b_l0_256, + tile_buf_c_256, + *shared_args + ) with branch.else_context(): level1_loop_mn_dynamic_tilesize( - N_HALF, tile_view_b_128, tile_view_c_128, tile_buf_b_l1_128, tile_buf_b_l0_128, tile_buf_c_128, *shared_args) + N_HALF, + 
tile_view_b_128, + tile_view_c_128, + tile_buf_b_l1_128, + tile_buf_b_l0_128, + tile_buf_c_128, + *shared_args + ) for li in pto.range(bid, core_loop, num_blocks): - with pto.if_context(swizzle_direction == c0, has_else=True) as c0_branch: - m_idx, n_idx = swizzle_zn(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2) + with pto.if_context( + swizzle_direction == c0, has_else=True + ) as c0_branch: + m_idx, n_idx = swizzle_zn( + li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2 + ) level1_loop_mn(m_idx * c128, n_idx * c256, li) with c0_branch.else_context(): - with pto.if_context(swizzle_direction == c1, has_else=True) as c1_branch: - m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2) + with pto.if_context( + swizzle_direction == c1, has_else=True + ) as c1_branch: + m_idx, n_idx = swizzle_nz( + li, m_loop, n_loop, cSwizzle, cSwizzleM1, c1, c2 + ) level1_loop_mn(m_idx * c128, n_idx * c256, li) with c1_branch.else_context(): @@ -297,4 +361,4 @@ def level1_loop_mn(m_offset, n_offset, li): if __name__ == "__main__": - print(build()) \ No newline at end of file + print(build()) diff --git a/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md b/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md index 496d8785..2e7e48de 100644 --- a/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md +++ b/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md @@ -229,7 +229,7 @@ msprof op \ ``` 对4096x4096小矩阵,即使不用swizzled loop order,L2 hit就很高(97.88%): - + cachehit_N4096 对16384x16384大矩阵,由于超过了L2 size,不swizzle的话L2 hit低到了30.9%: diff --git a/examples/aot/matmul_optimization_guide/matmul_optim_guide.md b/examples/aot/matmul_optimization_guide/matmul_optim_guide.md index 13639881..b36adf36 100644 --- a/examples/aot/matmul_optimization_guide/matmul_optim_guide.md +++ b/examples/aot/matmul_optimization_guide/matmul_optim_guide.md @@ -201,7 +201,7 @@ l2_size=100663296 # 96 MiB Swizzling improves L2 cache reuse across multiple 
cores. We borrow this figure [from Triton matmul](https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html#l2-cache-optimizations): Grouped vs row-major ordering (from Triton) -To read this figure, assume 9 cores computing a subset `C` matrix in the first iteration (the yellow area, each number 0 ~ 8 marks the core id). In the naive "row-major ordering", the full matrix B (assume larger than L2 cache!) needs to be loaded from global memory; while in the "grouped ordering", the data traffic w.r.t. global memory is much less. +To read this figure, assume 9 cores computing a subset `C` matrix in the first iteration (the yellow area, each number 0 ~ 8 marks the core id). In the naive "row-major ordering", the full matrix B (assume larger than L2 cache!) needs to be loaded from global memory; while in the "grouped ordering", the data traffic w.r.t. global memory is much less. [step3_swizzle.py](./step3_swizzle.py) incorporates a 10-line swizzling function `swizzle_nz`, while keeping the rest of the code same as step2. [step3_swizzle_numpy_sim.py](./step3_swizzle_numpy_sim.py) explains the swizzle scheme intuitively. 
The swizzle algorithm is one of the algorithms [from catlass](https://gitcode.com/cann/catlass/blob/v1.4.0/include/catlass/gemm/block/block_swizzle.hpp), which also [has a nice explanation](https://gitcode.com/cann/catlass/blob/v1.4.0/docs/contents/advanced/swizzle_explanation.md) @@ -222,7 +222,7 @@ msprof op \ ``` For a small 4096x4096 matrix, L2 cache hit is high (97.88%) even without a swizzled loop order: - + cachehit_N4096 For a larger 16384x16384 matrix that exceeds L2, L2 cache hit is low (30.9%) without swizzling: diff --git a/examples/aot/matmul_optimization_guide/run_matmul.py b/examples/aot/matmul_optimization_guide/run_matmul.py index b5a7497c..29d155b4 100644 --- a/examples/aot/matmul_optimization_guide/run_matmul.py +++ b/examples/aot/matmul_optimization_guide/run_matmul.py @@ -45,7 +45,7 @@ def load_lib(lib_path): ctypes.c_void_p, ctypes.c_int, ctypes.c_int, - ctypes.c_int + ctypes.c_int, ] lib.call_kernel.restype = None @@ -81,7 +81,7 @@ def matmul_abt( torch_to_ctypes(c), m, n, - k + k, ) return c @@ -89,11 +89,7 @@ def matmul_abt( def run_case(matmul_abt, a, b, c_ref, *, block_dim): - c = matmul_abt( - a, - b, - block_dim=block_dim - ) + c = matmul_abt(a, b, block_dim=block_dim) torch.npu.synchronize() return CaseResult( m=int(a.shape[0]), @@ -156,13 +152,7 @@ def test_matmul(): shape_worst = None for block_dim in BLOCK_DIM_LIST: - result = run_case( - matmul_abt, - a, - b, - c_ref, - block_dim=block_dim - ) + result = run_case(matmul_abt, a, b, c_ref, block_dim=block_dim) checked_cases += 1 if ( diff --git a/examples/aot/matmul_optimization_guide/step1_baseline.py b/examples/aot/matmul_optimization_guide/step1_baseline.py index 34c5541c..e192dd77 100644 --- a/examples/aot/matmul_optimization_guide/step1_baseline.py +++ b/examples/aot/matmul_optimization_guide/step1_baseline.py @@ -3,7 +3,15 @@ from ptodsl import pto, tile, to_ir_module from ptodsl import scalar as s -from common_utils import K_DTILE, K_QTILE, K_TILE, M_TILE, N_FULL, 
build_meta_data, const +from common_utils import ( + K_DTILE, + K_QTILE, + K_TILE, + M_TILE, + N_FULL, + build_meta_data, + const, +) def build(): @@ -36,9 +44,19 @@ def matmul_kernel_step1_baseline( core_loop = n_loop * m_loop k_dtile_num = k_total // c512 - tv_a = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) - tv_b = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") - tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + tv_a = pto.as_tensor( + tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1] + ) + tv_b = pto.as_tensor( + tv_2d, + ptr=b_ptr, + shape=[k_total, n_total], + strides=[c1, k_total], + layout="DN", + ) + tv_c = pto.as_tensor( + tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1] + ) a_l1 = pto.alloc_tile(tile_buf_a_l1) b_l1 = pto.alloc_tile(tile_buf_b_l1) @@ -85,7 +103,9 @@ def matmul_kernel_step1_baseline( tile.extract(b_l1, b_row, c0, b_l0) if phase == 0: - with pto.if_context(is_first_k_tile, has_else=True) as branch: + with pto.if_context( + is_first_k_tile, has_else=True + ) as branch: tile.matmul(a_l0, b_l0, c_l0) with branch.else_context(): tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) diff --git a/examples/aot/matmul_optimization_guide/step1_baseline_numpy_sim.py b/examples/aot/matmul_optimization_guide/step1_baseline_numpy_sim.py index 83be2721..ec34901b 100644 --- a/examples/aot/matmul_optimization_guide/step1_baseline_numpy_sim.py +++ b/examples/aot/matmul_optimization_guide/step1_baseline_numpy_sim.py @@ -9,7 +9,9 @@ def _print_tile_memory(name, arr): kib = arr.nbytes / 1024 - print(f"[tile-mem] {name}: shape={arr.shape}, dtype={arr.dtype}, bytes={arr.nbytes} ({kib:.1f} KiB)") + print( + f"[tile-mem] {name}: shape={arr.shape}, dtype={arr.dtype}, bytes={arr.nbytes} ({kib:.1f} KiB)" + ) def step1_numpy_sim(a, b): @@ -76,7 +78,10 @@ def step1_numpy_sim(a, b): b_half = phase // 4 h_off = b_half * K_TILE # 
b_l1 layout is [K_TILE, N_FULL], matching tile_buf_b_l1. - b_l1[:, :] = b[n_offset : n_offset + N_FULL, k_offset + h_off : k_offset + h_off + K_TILE].T + b_l1[:, :] = b[ + n_offset : n_offset + N_FULL, + k_offset + h_off : k_offset + h_off + K_TILE, + ].T # Corresponds to extract A/B quarter tiles a_col = phase * K_QTILE diff --git a/examples/aot/matmul_optimization_guide/step2_doublebuffer.py b/examples/aot/matmul_optimization_guide/step2_doublebuffer.py index 87bd4c03..cb2344e6 100644 --- a/examples/aot/matmul_optimization_guide/step2_doublebuffer.py +++ b/examples/aot/matmul_optimization_guide/step2_doublebuffer.py @@ -45,9 +45,19 @@ def matmul_kernel_ABt_autosync( core_loop = n_loop * m_loop k_dtile_num = k_total // c512 - tv_a = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) - tv_b = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") - tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + tv_a = pto.as_tensor( + tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1] + ) + tv_b = pto.as_tensor( + tv_2d, + ptr=b_ptr, + shape=[k_total, n_total], + strides=[c1, k_total], + layout="DN", + ) + tv_c = pto.as_tensor( + tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1] + ) a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] b_l1 = [pto.alloc_tile(tile_buf_b_l1), pto.alloc_tile(tile_buf_b_l1)] @@ -101,8 +111,12 @@ def run_loop_k(a_curr, a_next): if phase == 0: pto.cond( is_first_k_tile, - lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), - lambda: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + lambda: tile.matmul( + a_l0[ping], b_l0[ping], c_l0 + ), + lambda: tile.matmul_acc( + c_l0, a_l0[ping], b_l0[ping], c_l0 + ), ) else: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) diff --git a/examples/aot/matmul_optimization_guide/step3_swizzle.py b/examples/aot/matmul_optimization_guide/step3_swizzle.py index 
47d7d598..93c0cf35 100644 --- a/examples/aot/matmul_optimization_guide/step3_swizzle.py +++ b/examples/aot/matmul_optimization_guide/step3_swizzle.py @@ -15,6 +15,7 @@ swizzle_nz, ) + def build(): meta_data = build_meta_data() @@ -48,9 +49,19 @@ def matmul_kernel_ABt_autosync( c_swizzle = const(SWIZZLE_COUNT) c_swizzle_m1 = c_swizzle - c1 - tv_a = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) - tv_b = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") - tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + tv_a = pto.as_tensor( + tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1] + ) + tv_b = pto.as_tensor( + tv_2d, + ptr=b_ptr, + shape=[k_total, n_total], + strides=[c1, k_total], + layout="DN", + ) + tv_c = pto.as_tensor( + tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1] + ) a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] b_l1 = [pto.alloc_tile(tile_buf_b_l1), pto.alloc_tile(tile_buf_b_l1)] @@ -59,7 +70,9 @@ def matmul_kernel_ABt_autosync( c_l0 = pto.alloc_tile(tile_buf_c) for li in pto.range(bid, core_loop, num_blocks): - m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2) + m_idx, n_idx = swizzle_nz( + li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2 + ) m_offset = m_idx * c128 n_offset = n_idx * c256 c_kt = const(K_TILE) @@ -102,8 +115,12 @@ def run_loop_k(a_curr, a_next): if phase == 0: pto.cond( is_first_k_tile, - lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), - lambda: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + lambda: tile.matmul( + a_l0[ping], b_l0[ping], c_l0 + ), + lambda: tile.matmul_acc( + c_l0, a_l0[ping], b_l0[ping], c_l0 + ), ) else: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) diff --git a/examples/aot/matmul_optimization_guide/step3_swizzle_numpy_sim.py b/examples/aot/matmul_optimization_guide/step3_swizzle_numpy_sim.py index 
82b19ff3..42b78887 100644 --- a/examples/aot/matmul_optimization_guide/step3_swizzle_numpy_sim.py +++ b/examples/aot/matmul_optimization_guide/step3_swizzle_numpy_sim.py @@ -32,7 +32,9 @@ def show_mapping(m_loop, n_loop, c_swizzle, preview=24): rows.append((li, m_linear, n_linear, m_swz, n_swz)) arr = np.array(rows, dtype=np.int32) - print(f"\n=== swizzle={c_swizzle}, m_loop={m_loop}, n_loop={n_loop}, core_loop={core_loop} ===") + print( + f"\n=== swizzle={c_swizzle}, m_loop={m_loop}, n_loop={n_loop}, core_loop={core_loop} ===" + ) print("li | linear(m,n) -> swizzle(m,n)") for li, ml, nl, ms, ns in arr: print(f"{li:2d} | ({ml:2d},{nl:2d}) -> ({ms:2d},{ns:2d})") diff --git a/examples/aot/matmul_optimization_guide/step4_manual_pipelining.py b/examples/aot/matmul_optimization_guide/step4_manual_pipelining.py index 22d5f6e3..a92d7ed5 100644 --- a/examples/aot/matmul_optimization_guide/step4_manual_pipelining.py +++ b/examples/aot/matmul_optimization_guide/step4_manual_pipelining.py @@ -15,6 +15,7 @@ swizzle_nz, ) + def build(): meta_data = build_meta_data() @@ -48,9 +49,19 @@ def matmul_kernel_ABt( c_swizzle = const(SWIZZLE_COUNT) c_swizzle_m1 = c_swizzle - c1 - tv_a = pto.as_tensor(tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1]) - tv_b = pto.as_tensor(tv_2d, ptr=b_ptr, shape=[k_total, n_total], strides=[c1, k_total], layout="DN") - tv_c = pto.as_tensor(tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1]) + tv_a = pto.as_tensor( + tv_2d, ptr=a_ptr, shape=[m_total, k_total], strides=[k_total, c1] + ) + tv_b = pto.as_tensor( + tv_2d, + ptr=b_ptr, + shape=[k_total, n_total], + strides=[c1, k_total], + layout="DN", + ) + tv_c = pto.as_tensor( + tv_2d, ptr=c_ptr, shape=[m_total, n_total], strides=[n_total, c1] + ) a_l1 = [pto.alloc_tile(tile_buf_a_l1), pto.alloc_tile(tile_buf_a_l1)] b_l1 = [pto.alloc_tile(tile_buf_b_l1), pto.alloc_tile(tile_buf_b_l1)] @@ -62,7 +73,9 @@ def matmul_kernel_ABt( pto.record_event("MOV_M2L", "LOAD", event_id=[0, 
1, 2, 3]) for li in pto.range(bid, core_loop, num_blocks): - m_idx, n_idx = swizzle_nz(li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2) + m_idx, n_idx = swizzle_nz( + li, m_loop, n_loop, c_swizzle, c_swizzle_m1, c1, c2 + ) m_offset = m_idx * c128 n_offset = n_idx * c256 c_kt = const(K_TILE) @@ -115,7 +128,9 @@ def run_loop_k(curr_id, next_id, a_curr, a_next): tile.extract(a_curr, c0, a_col, a_l0[ping]) if phase == 7: - pto.record_event("MOV_M2L", "LOAD", event_id=curr_id) + pto.record_event( + "MOV_M2L", "LOAD", event_id=curr_id + ) if quarter == 0: pto.wait_event("LOAD", "MOV_M2L", event_id=b_evt) @@ -130,8 +145,12 @@ def run_loop_k(curr_id, next_id, a_curr, a_next): if phase == 0: pto.cond( is_first_k_tile, - lambda: tile.matmul(a_l0[ping], b_l0[ping], c_l0), - lambda: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0), + lambda: tile.matmul( + a_l0[ping], b_l0[ping], c_l0 + ), + lambda: tile.matmul_acc( + c_l0, a_l0[ping], b_l0[ping], c_l0 + ), ) else: tile.matmul_acc(c_l0, a_l0[ping], b_l0[ping], c_l0) diff --git a/examples/aot/print_tile/print_builder.py b/examples/aot/print_tile/print_builder.py index 80a17f7d..9321fb33 100644 --- a/examples/aot/print_tile/print_builder.py +++ b/examples/aot/print_tile/print_builder.py @@ -10,11 +10,18 @@ def meta_data(): index_dtype = pto.int32 ptr_type = pto.PtrType(dtype) tensor_type = pto.TensorType(rank=2, dtype=dtype) - subtensor_type = pto.SubTensorType(shape=[32, 32], dtype=dtype) # TODO: omit shape https://github.com/zhangstevenunity/PTOAS/issues/31 + subtensor_type = pto.SubTensorType( + shape=[32, 32], dtype=dtype + ) # TODO: omit shape https://github.com/zhangstevenunity/PTOAS/issues/31 tile_cfg = pto.TileBufConfig() # defaults to pto.TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null") tile_type = pto.TileBufType( - shape=[32, 32], valid_shape=[-1, -1], dtype=dtype, memory_space="VEC", config=tile_cfg) + shape=[32, 32], + valid_shape=[-1, -1], + dtype=dtype, + 
memory_space="VEC", + config=tile_cfg, + ) return { "ptr_type": ptr_type, "index_dtype": index_dtype, @@ -30,8 +37,8 @@ def vec_add_kernel_2d_dynamic( arg1: "ptr_type", arg2: "ptr_type", arg_vrow_i32: "index_dtype", - arg_vcol_i32: "index_dtype" - ) -> None: + arg_vcol_i32: "index_dtype", +) -> None: c0 = const(0) c1 = const(1) c32 = const(32) @@ -52,9 +59,15 @@ def vec_add_kernel_2d_dynamic( vid_idx = s.index_cast(vid) offset_row = vid_idx * c32 # every core loads 32 rows of data - sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32]) - sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32]) - sv2 = pto.slice_view(subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32]) + sv0 = pto.slice_view( + subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32] + ) + sv1 = pto.slice_view( + subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32] + ) + sv2 = pto.slice_view( + subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32] + ) with pto.vector_section(): tb0 = pto.alloc_tile(tile_type, valid_row=v_row_idx, valid_col=v_col_idx) diff --git a/examples/aot/print_tile/run_print.py b/examples/aot/print_tile/run_print.py index a0621095..1316bc19 100644 --- a/examples/aot/print_tile/run_print.py +++ b/examples/aot/print_tile/run_print.py @@ -9,12 +9,7 @@ def torch_to_ctypes(tensor): def lib_to_func(lib): - def add_func( - x, - y, - z, - stream_ptr=None - ): + def add_func(x, y, z, stream_ptr=None): vrow, vcol = 32, 32 # local tile shape hard-coded as the kernel @@ -26,8 +21,10 @@ def add_func( torch_to_ctypes(x), torch_to_ctypes(y), torch_to_ctypes(z), - vrow, vcol + vrow, + vcol, ) + return add_func @@ -42,12 +39,13 @@ def test_add(): shape = [1280, 32] # tensor shape hard-coded as the kernel torch.manual_seed(0) dtype = torch.float32 - x = torch.arange(shape[0]*shape[1], device=device, dtype=dtype).reshape(shape) - y = 
torch.arange(shape[0]*shape[1], device=device, dtype=dtype).reshape(shape) + x = torch.arange(shape[0] * shape[1], device=device, dtype=dtype).reshape(shape) + y = torch.arange(shape[0] * shape[1], device=device, dtype=dtype).reshape(shape) z = torch.empty(shape, device=device, dtype=dtype) add_func(x, y, z) torch.npu.synchronize() + if __name__ == "__main__": test_add() diff --git a/examples/aot/simple_static/add_static_multicore/.gitignore b/examples/aot/simple_static/add_static_multicore/.gitignore index 79b9aff4..1a4c7666 100644 --- a/examples/aot/simple_static/add_static_multicore/.gitignore +++ b/examples/aot/simple_static/add_static_multicore/.gitignore @@ -1,2 +1,2 @@ add.cpp -add.pto \ No newline at end of file +add.pto diff --git a/examples/aot/simple_static/add_static_multicore/add_builder.py b/examples/aot/simple_static/add_static_multicore/add_builder.py index 93865a47..1c790077 100644 --- a/examples/aot/simple_static/add_static_multicore/add_builder.py +++ b/examples/aot/simple_static/add_static_multicore/add_builder.py @@ -10,11 +10,18 @@ def meta_data(): index_dtype = pto.int32 ptr_type = pto.PtrType(dtype) tensor_type = pto.TensorType(rank=2, dtype=dtype) - subtensor_type = pto.SubTensorType(shape=[32, 32], dtype=dtype) # TODO: omit shape https://github.com/zhangstevenunity/PTOAS/issues/31 + subtensor_type = pto.SubTensorType( + shape=[32, 32], dtype=dtype + ) # TODO: omit shape https://github.com/zhangstevenunity/PTOAS/issues/31 tile_cfg = pto.TileBufConfig() # defaults to pto.TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null") tile_type = pto.TileBufType( - shape=[32, 32], valid_shape=[-1, -1], dtype=dtype, memory_space="VEC", config=tile_cfg) + shape=[32, 32], + valid_shape=[-1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) return { "ptr_type": ptr_type, "index_dtype": index_dtype, @@ -30,8 +37,8 @@ def vec_add_kernel_2d_dynamic( arg1: "ptr_type", arg2: "ptr_type", arg_vrow_i32: "index_dtype", 
- arg_vcol_i32: "index_dtype" - ) -> None: + arg_vcol_i32: "index_dtype", +) -> None: c0 = const(0) c1 = const(1) c32 = const(32) @@ -52,9 +59,15 @@ def vec_add_kernel_2d_dynamic( vid_idx = s.index_cast(vid) offset_row = vid_idx * c32 # every core loads 32 rows of data - sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32]) - sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32]) - sv2 = pto.slice_view(subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32]) + sv0 = pto.slice_view( + subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32] + ) + sv1 = pto.slice_view( + subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32] + ) + sv2 = pto.slice_view( + subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32] + ) with pto.vector_section(): tb0 = pto.alloc_tile(tile_type, valid_row=v_row_idx, valid_col=v_col_idx) diff --git a/examples/aot/simple_static/add_static_multicore/run_add.py b/examples/aot/simple_static/add_static_multicore/run_add.py index 44c42615..53344ff8 100644 --- a/examples/aot/simple_static/add_static_multicore/run_add.py +++ b/examples/aot/simple_static/add_static_multicore/run_add.py @@ -9,12 +9,7 @@ def torch_to_ctypes(tensor): def lib_to_func(lib): - def add_func( - x, - y, - z, - stream_ptr=None - ): + def add_func(x, y, z, stream_ptr=None): vrow, vcol = 32, 32 # local tile shape hard-coded as the kernel @@ -26,8 +21,10 @@ def add_func( torch_to_ctypes(x), torch_to_ctypes(y), torch_to_ctypes(z), - vrow, vcol + vrow, + vcol, ) + return add_func @@ -53,5 +50,6 @@ def test_add(): torch.testing.assert_close(z, z_ref) print("result equal!") + if __name__ == "__main__": test_add() diff --git a/examples/aot/simple_static/matmul_static_singlecore/matmul_builder.py b/examples/aot/simple_static/matmul_static_singlecore/matmul_builder.py index 3bb3464e..6b11b015 100644 --- 
a/examples/aot/simple_static/matmul_static_singlecore/matmul_builder.py +++ b/examples/aot/simple_static/matmul_static_singlecore/matmul_builder.py @@ -28,11 +28,21 @@ def meta_data(): tile_view_out = pto.SubTensorType(shape=[M, N], dtype=dtype) tile_view_bias = pto.SubTensorType(shape=[1, N], dtype=dtype) - tile_buf_aMat = pto.TileBufType(shape=[M, BASEK], dtype=dtype, memory_space="MAT") - tile_buf_bMat = pto.TileBufType(shape=[BASEK, N], dtype=dtype, memory_space="MAT") - tile_buf_biasData = pto.TileBufType(shape=[1, N], dtype=dtype, memory_space="MAT") - tile_buf_aTile = pto.TileBufType(shape=[M, BASEK], dtype=dtype, memory_space="LEFT") - tile_buf_bTile = pto.TileBufType(shape=[BASEK, N], dtype=dtype, memory_space="RIGHT") + tile_buf_aMat = pto.TileBufType( + shape=[M, BASEK], dtype=dtype, memory_space="MAT" + ) + tile_buf_bMat = pto.TileBufType( + shape=[BASEK, N], dtype=dtype, memory_space="MAT" + ) + tile_buf_biasData = pto.TileBufType( + shape=[1, N], dtype=dtype, memory_space="MAT" + ) + tile_buf_aTile = pto.TileBufType( + shape=[M, BASEK], dtype=dtype, memory_space="LEFT" + ) + tile_buf_bTile = pto.TileBufType( + shape=[BASEK, N], dtype=dtype, memory_space="RIGHT" + ) tile_buf_cTile = pto.TileBufType(shape=[M, N], dtype=dtype, memory_space="ACC") tile_buf_biasTile = pto.TileBufType( shape=[1, N], dtype=dtype, memory_space="BIAS" @@ -76,10 +86,18 @@ def RunTMATMULSplitK( cTileM = const(M) cTileN = const(N) - tvA = pto.as_tensor(tensor_type, ptr=a_ptr, shape=[cM, cK], strides=[cK, c1]) - tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) - tvOut = pto.as_tensor(tensor_type, ptr=out_ptr, shape=[cM, cN], strides=[cN, c1]) - tvBias = pto.as_tensor(tensor_type, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1]) + tvA = pto.as_tensor( + tensor_type, ptr=a_ptr, shape=[cM, cK], strides=[cK, c1] + ) + tvB = pto.as_tensor( + tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1] + ) + tvOut = pto.as_tensor( + tensor_type, ptr=out_ptr, 
shape=[cM, cN], strides=[cN, c1] + ) + tvBias = pto.as_tensor( + tensor_type, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1] + ) aMatTile = pto.alloc_tile(tile_buf_aMat) bMatTile = pto.alloc_tile(tile_buf_bMat) @@ -148,4 +166,4 @@ def _first_iter(): if __name__ == "__main__": - print(build()) \ No newline at end of file + print(build()) diff --git a/examples/aot/simple_static/matmul_static_singlecore/run_matmul.py b/examples/aot/simple_static/matmul_static_singlecore/run_matmul.py index 46822a5c..1eddc067 100644 --- a/examples/aot/simple_static/matmul_static_singlecore/run_matmul.py +++ b/examples/aot/simple_static/matmul_static_singlecore/run_matmul.py @@ -13,11 +13,7 @@ def load_lib(lib_path): default_block_dim = 1 # NOTE: kernel is single-core for now - def matmul_func( - c, a, b, - block_dim=default_block_dim, - stream_ptr=None - ): + def matmul_func(c, a, b, block_dim=default_block_dim, stream_ptr=None): if stream_ptr is None: stream_ptr = torch.npu.current_stream()._as_parameter_ lib.call_kernel( @@ -39,8 +35,8 @@ def test_matmul(): m, k, n = 32, 256, 32 torch.manual_seed(0) - a = torch.rand((m,k), device=device, dtype=dtype) - b = torch.rand((k,n), device=device, dtype=dtype) + a = torch.rand((m, k), device=device, dtype=dtype) + b = torch.rand((k, n), device=device, dtype=dtype) c = torch.zeros((m, n), device=device, dtype=dtype) matmul_func = load_lib("./matmul_kernel.so") @@ -49,7 +45,7 @@ def test_matmul(): c_ref = torch.matmul(a, b) diff = (c - c_ref).abs().max() - print('max diff: ', diff) + print("max diff: ", diff) if __name__ == "__main__": diff --git a/examples/jit/add_dynamic_multicore/run_add.py b/examples/jit/add_dynamic_multicore/run_add.py index b2a063c9..d9ece92b 100644 --- a/examples/jit/add_dynamic_multicore/run_add.py +++ b/examples/jit/add_dynamic_multicore/run_add.py @@ -85,13 +85,22 @@ def vec_add_1d_dynamic( offset_global = tile_offset_global * c_tile sv0 = pto.slice_view( - subtensor_type, source=tv0, offsets=[offset_global], 
sizes=[c_tile] + subtensor_type, + source=tv0, + offsets=[offset_global], + sizes=[c_tile], ) sv1 = pto.slice_view( - subtensor_type, source=tv1, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv1, + offsets=[offset_global], + sizes=[c_tile], ) sv2 = pto.slice_view( - subtensor_type, source=tv2, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv2, + offsets=[offset_global], + sizes=[c_tile], ) pto.load(sv0, tb0) @@ -109,7 +118,14 @@ def test_add(): tile_size = 1024 # Keep shapes aligned to tile size, but vary tile counts so they are not # required to be multiples of `num_cores`. - tile_counts = [1, 7, num_cores - 1, num_cores + 3, 2 * num_cores + 7, 5 * num_cores - 5] + tile_counts = [ + 1, + 7, + num_cores - 1, + num_cores + 3, + 2 * num_cores + 7, + 5 * num_cores - 5, + ] shape_list = [tile_size * tiles for tiles in tile_counts] torch.manual_seed(0) diff --git a/examples/jit/add_static_multicore/run_add_1d.py b/examples/jit/add_static_multicore/run_add_1d.py index e8388794..b47a0c15 100644 --- a/examples/jit/add_static_multicore/run_add_1d.py +++ b/examples/jit/add_static_multicore/run_add_1d.py @@ -13,9 +13,14 @@ def meta_data(): ptr_type = pto.PtrType(dtype) tensor_type = pto.TensorType(rank=2, dtype=dtype) - subtensor_type = pto.SubTensorType(shape=[1, 1024], dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[1, 1024], dtype=dtype) tile_type = pto.TileBufType( - shape=[1, 1024], valid_shape=[-1, -1], dtype=dtype, memory_space="VEC", config=pto.TileBufConfig()) + shape=[1, 1024], + valid_shape=[-1, -1], + dtype=dtype, + memory_space="VEC", + config=pto.TileBufConfig(), + ) return { "ptr_type": ptr_type, @@ -26,11 +31,7 @@ def meta_data(): @jit(meta_data=meta_data, block_dim=20) -def vec_add_kernel( - arg0: "ptr_type", - arg1: "ptr_type", - arg2: "ptr_type" - ) -> None: +def vec_add_kernel(arg0: "ptr_type", arg1: "ptr_type", arg2: "ptr_type") -> None: c0 = const(0) c1 = const(1) c1024 = const(1024) @@ -48,9 +49,15 @@ 
def vec_add_kernel( vid_idx = s.index_cast(vid) offset = vid_idx * c1024 # every core loads 1024 elements of data - sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[c0, offset], sizes=[c1, c1024]) - sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[c0, offset], sizes=[c1, c1024]) - sv2 = pto.slice_view(subtensor_type, source=tv2, offsets=[c0, offset], sizes=[c1, c1024]) + sv0 = pto.slice_view( + subtensor_type, source=tv0, offsets=[c0, offset], sizes=[c1, c1024] + ) + sv1 = pto.slice_view( + subtensor_type, source=tv1, offsets=[c0, offset], sizes=[c1, c1024] + ) + sv2 = pto.slice_view( + subtensor_type, source=tv2, offsets=[c0, offset], sizes=[c1, c1024] + ) with pto.vector_section(): tb0 = pto.alloc_tile(tile_type, valid_row=c1, valid_col=c1024) @@ -81,5 +88,6 @@ def test_add(): torch.testing.assert_close(z, z_ref) print("result equal!") + if __name__ == "__main__": test_add() diff --git a/examples/jit/add_static_multicore/run_add_2d.py b/examples/jit/add_static_multicore/run_add_2d.py index 127bcd9f..663c7a91 100644 --- a/examples/jit/add_static_multicore/run_add_2d.py +++ b/examples/jit/add_static_multicore/run_add_2d.py @@ -13,11 +13,18 @@ def meta_data(): index_dtype = pto.int32 ptr_type = pto.PtrType(dtype) tensor_type = pto.TensorType(rank=2, dtype=dtype) - subtensor_type = pto.SubTensorType(shape=[32, 32], dtype=dtype) # TODO: omit shape https://github.com/zhangstevenunity/PTOAS/issues/31 + subtensor_type = pto.SubTensorType( + shape=[32, 32], dtype=dtype + ) # TODO: omit shape https://github.com/zhangstevenunity/PTOAS/issues/31 tile_cfg = pto.TileBufConfig() # defaults to pto.TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null") tile_type = pto.TileBufType( - shape=[32, 32], valid_shape=[-1, -1], dtype=dtype, memory_space="VEC", config=tile_cfg) + shape=[32, 32], + valid_shape=[-1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) return { "ptr_type": ptr_type, "index_dtype": index_dtype, @@ 
-33,8 +40,8 @@ def vec_add_kernel( arg1: "ptr_type", arg2: "ptr_type", vrow: "index_dtype", - vcol: "index_dtype" - ) -> None: + vcol: "index_dtype", +) -> None: c0 = const(0) c1 = const(1) c32 = const(32) @@ -55,9 +62,15 @@ def vec_add_kernel( vid_idx = s.index_cast(vid) offset_row = vid_idx * c32 # every core loads 32 rows of data - sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32]) - sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32]) - sv2 = pto.slice_view(subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32]) + sv0 = pto.slice_view( + subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32] + ) + sv1 = pto.slice_view( + subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32] + ) + sv2 = pto.slice_view( + subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32] + ) with pto.vector_section(): tb0 = pto.alloc_tile(tile_type, valid_row=v_row_idx, valid_col=v_col_idx) @@ -88,5 +101,6 @@ def test_add(): torch.testing.assert_close(z, z_ref) print("result equal!") + if __name__ == "__main__": test_add() diff --git a/examples/jit/matmul_dynamic_multicore/run_batch_matmul.py b/examples/jit/matmul_dynamic_multicore/run_batch_matmul.py index 815fc78a..0f668aa9 100644 --- a/examples/jit/matmul_dynamic_multicore/run_batch_matmul.py +++ b/examples/jit/matmul_dynamic_multicore/run_batch_matmul.py @@ -34,14 +34,26 @@ def meta_data(): tile_view_out = pto.SubTensorType(shape=[M, N], dtype=dtype) tile_view_bias = pto.SubTensorType(shape=[1, N], dtype=dtype) - tile_buf_aMat = pto.TileBufType(shape=[M, BASEK], dtype=dtype, memory_space="MAT") - tile_buf_bMat = pto.TileBufType(shape=[BASEK, N], dtype=dtype, memory_space="MAT") - tile_buf_biasData = pto.TileBufType(shape=[1, N], dtype=dtype, memory_space="MAT") - - tile_buf_aTile = pto.TileBufType(shape=[M, BASEK], dtype=dtype, memory_space="LEFT") - tile_buf_bTile = 
pto.TileBufType(shape=[BASEK, N], dtype=dtype, memory_space="RIGHT") + tile_buf_aMat = pto.TileBufType( + shape=[M, BASEK], dtype=dtype, memory_space="MAT" + ) + tile_buf_bMat = pto.TileBufType( + shape=[BASEK, N], dtype=dtype, memory_space="MAT" + ) + tile_buf_biasData = pto.TileBufType( + shape=[1, N], dtype=dtype, memory_space="MAT" + ) + + tile_buf_aTile = pto.TileBufType( + shape=[M, BASEK], dtype=dtype, memory_space="LEFT" + ) + tile_buf_bTile = pto.TileBufType( + shape=[BASEK, N], dtype=dtype, memory_space="RIGHT" + ) tile_buf_cTile = pto.TileBufType(shape=[M, N], dtype=dtype, memory_space="ACC") - tile_buf_biasTile = pto.TileBufType(shape=[1, N], dtype=dtype, memory_space="BIAS") + tile_buf_biasTile = pto.TileBufType( + shape=[1, N], dtype=dtype, memory_space="BIAS" + ) return { "ptr_type": ptr_dtype, @@ -91,10 +103,18 @@ def RunTMATMULSplitK( b_end_unclamped = b_start + batches_per_core b_end = s.min_u(b_end_unclamped, batch) - tvA = pto.as_tensor(tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1]) - tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) - tvOut = pto.as_tensor(tensor_type, ptr=out_ptr, shape=[cBM, cN], strides=[cN, c1]) - tvBias = pto.as_tensor(tensor_type, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1]) + tvA = pto.as_tensor( + tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1] + ) + tvB = pto.as_tensor( + tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1] + ) + tvOut = pto.as_tensor( + tensor_type, ptr=out_ptr, shape=[cBM, cN], strides=[cN, c1] + ) + tvBias = pto.as_tensor( + tensor_type, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1] + ) aMatTile = pto.alloc_tile(tile_buf_aMat) bMatTile = pto.alloc_tile(tile_buf_bMat) diff --git a/examples/validate_all_examples.py b/examples/validate_all_examples.py index 04521c1f..bd80e4e1 100644 --- a/examples/validate_all_examples.py +++ b/examples/validate_all_examples.py @@ -71,7 +71,9 @@ def extract_commands(readme_path: Path) -> list[str]: return [] 
-def run_example(example_name: str, readme_path: Path, commands: list[str]) -> ExampleResult: +def run_example( + example_name: str, readme_path: Path, commands: list[str] +) -> ExampleResult: example_start = time.time() if not commands: return ExampleResult( diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index 02acb5ed..c8f649ea 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -87,13 +87,13 @@ def store(source, dest): def print(format, scalar): """ - Example: - `print("hello %d\n", const(5))` - is equivalent to - `cce::printf("hello%d\n", 5);` + Example: + `print("hello %d\n", const(5))` + is equivalent to + `cce::printf("hello%d\n", 5);` - NOTE: may not print if the print buffer is full from previous - prints (typical when printing big tiles). + NOTE: may not print if the print buffer is full from previous + prints (typical when printing big tiles). """ if isinstance(scalar, Value): scalar = _unwrap(scalar) @@ -114,4 +114,4 @@ def print(format, scalar): "load", "store", "print", -] \ No newline at end of file +] diff --git a/ptodsl/api/scalar.py b/ptodsl/api/scalar.py index 8c10a4d6..90938daa 100644 --- a/ptodsl/api/scalar.py +++ b/ptodsl/api/scalar.py @@ -143,7 +143,9 @@ def ge(a, b): def select(cond, true_val, false_val): - return Value(arith.SelectOp(_unwrap(cond), _unwrap(true_val), _unwrap(false_val)).result) + return Value( + arith.SelectOp(_unwrap(cond), _unwrap(true_val), _unwrap(false_val)).result + ) __all__ = [ diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index 3a186dc3..0526cd08 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -85,7 +85,9 @@ def matmul_acc(acc, lhs, rhs, out): def extract(source, index_row, index_col, out): - _pto.TExtractOp(src=source, indexRow=_unwrap(index_row), indexCol=_unwrap(index_col), dst=out) + _pto.TExtractOp( + src=source, indexRow=_unwrap(index_row), indexCol=_unwrap(index_col), dst=out + ) def row_sum(src, tmp, dst): @@ -121,19 +123,19 @@ def 
row_expand_mul(src0, src1, dst): def col_sum(src, tmp, dst, is_binary=True): - _pto.TColSumOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) + _pto.TColSumOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) def col_min(src, dst): - _pto.TColMinOp(src=src, dst=dst) + _pto.TColMinOp(src=src, dst=dst) def col_max(src, dst): - _pto.TColMaxOp(src=src, dst=dst) + _pto.TColMaxOp(src=src, dst=dst) def col_prod(src, tmp, dst, is_binary=True): - _pto.TColProdOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) + _pto.TColProdOp(src=src, dst=dst, tmp=tmp, isBinary=BoolAttr.get(is_binary)) def col_expand(src, dst): @@ -182,4 +184,4 @@ def print(source): "col_prod", "col_expand", "subset", -] \ No newline at end of file +] diff --git a/ptodsl/api/type_def.py b/ptodsl/api/type_def.py index 24fbfcf3..4f66eebb 100644 --- a/ptodsl/api/type_def.py +++ b/ptodsl/api/type_def.py @@ -24,7 +24,9 @@ def SubTensorType(*, shape, dtype): class TileBufConfig: - def __init__(self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null"): + def __init__( + self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null" + ): # TODO: expose and validate a broader set of tile buffer knobs if PTO adds # more layout/padding/fractal settings that should be configurable here. 
self._bl = _pto.BLayoutAttr.get(getattr(_pto.BLayout, blayout)) @@ -34,7 +36,9 @@ def __init__(self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pa @property def attr(self): - return _pto.TileBufConfigAttr.get(self._bl, self._sl, self._s_fractal_size, self._pd) + return _pto.TileBufConfigAttr.get( + self._bl, self._sl, self._s_fractal_size, self._pd + ) def _default_tile_config(memory_space, shape): @@ -78,7 +82,9 @@ def _default_tile_config(memory_space, shape): ) if space == "VEC": return TileBufConfig() - raise ValueError(f"Unsupported memory_space '{memory_space}' for default tile config.") + raise ValueError( + f"Unsupported memory_space '{memory_space}' for default tile config." + ) def TileBufType(*, shape, dtype, memory_space, valid_shape=None, config=None): diff --git a/ptodsl/compiler/ir.py b/ptodsl/compiler/ir.py index 677aa0dc..b32730ef 100644 --- a/ptodsl/compiler/ir.py +++ b/ptodsl/compiler/ir.py @@ -9,7 +9,9 @@ def _resolve_meta(meta_fn): values = meta_fn() if not isinstance(values, dict): - raise ValueError("`meta_data()` must return a dict of named symbols to MLIR/PTO types.") + raise ValueError( + "`meta_data()` must return a dict of named symbols to MLIR/PTO types." + ) return dict(values) diff --git a/ptodsl/compiler/jit.py b/ptodsl/compiler/jit.py index 772a8c42..820fc00b 100644 --- a/ptodsl/compiler/jit.py +++ b/ptodsl/compiler/jit.py @@ -141,7 +141,9 @@ def _generate_caller_cpp(self, kernel_cpp_name): def _compile_shared_library(self, caller_cpp_path, lib_path): toolkit_home = os.environ.get("ASCEND_TOOLKIT_HOME") if not toolkit_home: - raise RuntimeError("ASCEND_TOOLKIT_HOME is required to compile generated caller.cpp.") + raise RuntimeError( + "ASCEND_TOOLKIT_HOME is required to compile generated caller.cpp." 
+ ) cmd = [ "bisheng", f"-I{toolkit_home}/include", @@ -197,7 +199,9 @@ def _build(self): ptoas_cmd += [str(pto_path), "-o", str(cpp_path)] subprocess.run(ptoas_cmd, check=True, cwd=str(self._output_dir)) - caller_path.write_text(self._generate_caller_cpp(cpp_path.name), encoding="utf-8") + caller_path.write_text( + self._generate_caller_cpp(cpp_path.name), encoding="utf-8" + ) self._compile_shared_library(caller_path, lib_path) self._lib = ctypes.CDLL(str(lib_path)) @@ -219,7 +223,9 @@ def _convert_ptr(self, value): def _prepare_call_args(self, args): params = list(self._sig.parameters.values()) if len(args) > len(params): - raise TypeError(f"Expected at most {len(params)} arguments, got {len(args)}.") + raise TypeError( + f"Expected at most {len(params)} arguments, got {len(args)}." + ) filled_args = list(args) for idx in range(len(args), len(params)): diff --git a/ptodsl/utils/bench.py b/ptodsl/utils/bench.py index 44496414..9bc0db7e 100644 --- a/ptodsl/utils/bench.py +++ b/ptodsl/utils/bench.py @@ -48,7 +48,9 @@ def do_bench( torch_npu.npu.synchronize() factor = {"s": 1e-3, "ms": 1e0, "us": 1e3, "ns": 1e6}[unit] - times = [factor * start.elapsed_time(end) for start, end in zip(start_events, end_events)] + times = [ + factor * start.elapsed_time(end) for start, end in zip(start_events, end_events) + ] if aggregation == "mean": return sum(times) / len(times) return times diff --git a/tests/frontend/test_add_dynamic_ir.py b/tests/frontend/test_add_dynamic_ir.py index 9d661f28..ebabd0ac 100644 --- a/tests/frontend/test_add_dynamic_ir.py +++ b/tests/frontend/test_add_dynamic_ir.py @@ -83,13 +83,22 @@ def vec_add_1d_dynamic( offset_global = tile_offset_global * c_tile sv0 = pto.slice_view( - subtensor_type, source=tv0, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv0, + offsets=[offset_global], + sizes=[c_tile], ) sv1 = pto.slice_view( - subtensor_type, source=tv1, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv1, + 
offsets=[offset_global], + sizes=[c_tile], ) sv2 = pto.slice_view( - subtensor_type, source=tv2, offsets=[offset_global], sizes=[c_tile] + subtensor_type, + source=tv2, + offsets=[offset_global], + sizes=[c_tile], ) pto.load(sv0, tb0) @@ -117,7 +126,9 @@ def build_verbose(): sl = _pto.SLayoutAttr.get(_pto.SLayout.NoneBox) pd = _pto.PadValueAttr.get(_pto.PadValue.Null) cfg = _pto.TileBufConfigAttr.get(bl, sl, 512, pd) - tile_buf = _pto.TileBufType.get([1, tile_length], f32, vec, [1, tile_length], cfg) + tile_buf = _pto.TileBufType.get( + [1, tile_length], f32, vec, [1, tile_length], cfg + ) fn_ty = func.FunctionType.get([ptr_f32, ptr_f32, ptr_f32, i32], []) with InsertionPoint(module.body): @@ -149,9 +160,15 @@ def build_verbose(): vec_section = _pto.SectionVectorOp() vec_block = vec_section.body.blocks.append() with InsertionPoint(vec_block): - tv0 = _pto.MakeTensorViewOp(tensor_view, arg0, [total_elements], [c1]).result - tv1 = _pto.MakeTensorViewOp(tensor_view, arg1, [total_elements], [c1]).result - tv2 = _pto.MakeTensorViewOp(tensor_view, arg2, [total_elements], [c1]).result + tv0 = _pto.MakeTensorViewOp( + tensor_view, arg0, [total_elements], [c1] + ).result + tv1 = _pto.MakeTensorViewOp( + tensor_view, arg1, [total_elements], [c1] + ).result + tv2 = _pto.MakeTensorViewOp( + tensor_view, arg2, [total_elements], [c1] + ).result tb0 = _pto.AllocTileOp(tile_buf).result tb1 = _pto.AllocTileOp(tile_buf).result @@ -183,8 +200,12 @@ def build_verbose(): work_if = scf.IfOp(has_elements) with InsertionPoint(work_if.then_block): for i in scf.for_(c0, tiles_to_process, c1): - tile_offset_global = arith.AddIOp(i, tile_offset_this_core).result - offset_global = arith.MulIOp(tile_offset_global, c_tile).result + tile_offset_global = arith.AddIOp( + i, tile_offset_this_core + ).result + offset_global = arith.MulIOp( + tile_offset_global, c_tile + ).result sv0 = _pto.PartitionViewOp( tile_view, tv0, offsets=[offset_global], sizes=[c_tile] diff --git 
a/tests/frontend/test_add_ir.py b/tests/frontend/test_add_ir.py index d0ce8a05..4a321e39 100644 --- a/tests/frontend/test_add_ir.py +++ b/tests/frontend/test_add_ir.py @@ -15,7 +15,11 @@ def meta_data(): subtensor_type = pto.SubTensorType(shape=[32, 32], dtype=dtype) tile_cfg = pto.TileBufConfig() tile_type = pto.TileBufType( - shape=[32, 32], valid_shape=[-1, -1], dtype=dtype, memory_space="VEC", config=tile_cfg + shape=[32, 32], + valid_shape=[-1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, ) return { "ptr_type": ptr_type, @@ -53,9 +57,15 @@ def vec_add_2d_static( vid_idx = s.index_cast(vid) offset_row = vid_idx * c32 - sv0 = pto.slice_view(subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32]) - sv1 = pto.slice_view(subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32]) - sv2 = pto.slice_view(subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32]) + sv0 = pto.slice_view( + subtensor_type, source=tv0, offsets=[offset_row, c0], sizes=[c32, c32] + ) + sv1 = pto.slice_view( + subtensor_type, source=tv1, offsets=[offset_row, c0], sizes=[c32, c32] + ) + sv2 = pto.slice_view( + subtensor_type, source=tv2, offsets=[offset_row, c0], sizes=[c32, c32] + ) with pto.vector_section(): tb0 = pto.alloc_tile(tile_type, valid_row=v_row_idx, valid_col=v_col_idx) @@ -132,9 +142,15 @@ def build(): vec_section = _pto.SectionVectorOp() vec_block = vec_section.body.blocks.append() with InsertionPoint(vec_block): - tb0 = _pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result - tb1 = _pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result - tb2 = _pto.AllocTileOp(tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx).result + tb0 = _pto.AllocTileOp( + tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx + ).result + tb1 = _pto.AllocTileOp( + tile_buf_dynamic, valid_row=v_row_idx, valid_col=v_col_idx + ).result + tb2 = _pto.AllocTileOp( + tile_buf_dynamic, 
valid_row=v_row_idx, valid_col=v_col_idx + ).result _pto.TLoadOp(None, sv0, tb0) _pto.TLoadOp(None, sv1, tb1) diff --git a/tests/frontend/test_caller_gen.py b/tests/frontend/test_caller_gen.py index c03938fa..47e01a02 100644 --- a/tests/frontend/test_caller_gen.py +++ b/tests/frontend/test_caller_gen.py @@ -59,7 +59,10 @@ def mixed_kernel(data: "ptr_i8", count: "i64_type", idx: "index_dtype") -> None: 'extern "C" void call_kernel(uint32_t blockDim, void *stream, uint8_t *data, ' "int64_t count, int64_t idx)" ) in caller_cpp - assert "mixed_kernel<<>>((int8_t *)data, count, idx);" in caller_cpp + assert ( + "mixed_kernel<<>>((int8_t *)data, count, idx);" + in caller_cpp + ) def test_generate_caller_cpp_for_dynamic_1d_add_signature(): diff --git a/tests/frontend/test_matmul_dynamic_ir.py b/tests/frontend/test_matmul_dynamic_ir.py index fabe89bd..4dacbfed 100644 --- a/tests/frontend/test_matmul_dynamic_ir.py +++ b/tests/frontend/test_matmul_dynamic_ir.py @@ -1,7 +1,15 @@ from mlir.dialects import arith, func, pto as _pto, scf from mlir.dialects.arith import CmpIPredicate from mlir.dialects.pto import EVENT_ID0, TLOAD, TMATMUL, TMOV_M2L, TSTORE_ACC -from mlir.ir import Context, F32Type, IndexType, InsertionPoint, IntegerType, Location, Module +from mlir.ir import ( + Context, + F32Type, + IndexType, + InsertionPoint, + IntegerType, + Location, + Module, +) from ptodsl import to_ir_module from ptodsl import pto, tile from ptodsl import scalar as s @@ -35,13 +43,25 @@ def meta_data(): tile_view_out = pto.SubTensorType(shape=[M, N], dtype=dtype) tile_view_bias = pto.SubTensorType(shape=[1, N], dtype=dtype) - tile_buf_aMat = pto.TileBufType(shape=[M, BASEK], dtype=dtype, memory_space="MAT") - tile_buf_bMat = pto.TileBufType(shape=[BASEK, N], dtype=dtype, memory_space="MAT") - tile_buf_biasData = pto.TileBufType(shape=[1, N], dtype=dtype, memory_space="MAT") - tile_buf_aTile = pto.TileBufType(shape=[M, BASEK], dtype=dtype, memory_space="LEFT") - tile_buf_bTile = 
pto.TileBufType(shape=[BASEK, N], dtype=dtype, memory_space="RIGHT") + tile_buf_aMat = pto.TileBufType( + shape=[M, BASEK], dtype=dtype, memory_space="MAT" + ) + tile_buf_bMat = pto.TileBufType( + shape=[BASEK, N], dtype=dtype, memory_space="MAT" + ) + tile_buf_biasData = pto.TileBufType( + shape=[1, N], dtype=dtype, memory_space="MAT" + ) + tile_buf_aTile = pto.TileBufType( + shape=[M, BASEK], dtype=dtype, memory_space="LEFT" + ) + tile_buf_bTile = pto.TileBufType( + shape=[BASEK, N], dtype=dtype, memory_space="RIGHT" + ) tile_buf_cTile = pto.TileBufType(shape=[M, N], dtype=dtype, memory_space="ACC") - tile_buf_biasTile = pto.TileBufType(shape=[1, N], dtype=dtype, memory_space="BIAS") + tile_buf_biasTile = pto.TileBufType( + shape=[1, N], dtype=dtype, memory_space="BIAS" + ) return { "ptr_type": ptr_dtype, @@ -93,10 +113,18 @@ def RunTMATMULSplitK( b_end_unclamped = b_start + batches_per_core b_end = s.min_u(b_end_unclamped, batch) - tvA = pto.as_tensor(tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1]) - tvB = pto.as_tensor(tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) - tvOut = pto.as_tensor(tensor_type, ptr=out_ptr, shape=[cBM, cN], strides=[cN, c1]) - tvBias = pto.as_tensor(tensor_type, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1]) + tvA = pto.as_tensor( + tensor_type, ptr=a_ptr, shape=[cBM, cK], strides=[cK, c1] + ) + tvB = pto.as_tensor( + tensor_type, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1] + ) + tvOut = pto.as_tensor( + tensor_type, ptr=out_ptr, shape=[cBM, cN], strides=[cN, c1] + ) + tvBias = pto.as_tensor( + tensor_type, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1] + ) aMatTile = pto.alloc_tile(tile_buf_aMat) bMatTile = pto.alloc_tile(tile_buf_bMat) @@ -245,13 +273,21 @@ def build_verbose( tile_buf_aMat = pto.TileBufType.get([M, BASEK], dtype, mat, [M, BASEK], cfg_mat) tile_buf_bMat = pto.TileBufType.get([BASEK, N], dtype, mat, [BASEK, N], cfg_mat) - tile_buf_biasData = pto.TileBufType.get([1, N], dtype, mat, [1, N], 
cfg_mat_bias) - tile_buf_aTile = pto.TileBufType.get([M, BASEK], dtype, left, [M, BASEK], cfg_left) - tile_buf_bTile = pto.TileBufType.get([BASEK, N], dtype, right, [BASEK, N], cfg_right) + tile_buf_biasData = pto.TileBufType.get( + [1, N], dtype, mat, [1, N], cfg_mat_bias + ) + tile_buf_aTile = pto.TileBufType.get( + [M, BASEK], dtype, left, [M, BASEK], cfg_left + ) + tile_buf_bTile = pto.TileBufType.get( + [BASEK, N], dtype, right, [BASEK, N], cfg_right + ) tile_buf_cTile = pto.TileBufType.get([M, N], dtype, acc, [M, N], cfg_acc) tile_buf_biasTile = pto.TileBufType.get([1, N], dtype, bias, [1, N], cfg_bias) - fn_ty = func.FunctionType.get([ptr_dtype, ptr_dtype, ptr_dtype, ptr_dtype, i1, i32], []) + fn_ty = func.FunctionType.get( + [ptr_dtype, ptr_dtype, ptr_dtype, ptr_dtype, i1, i32], [] + ) with InsertionPoint(module.body): fn = func.FuncOp("RunTMATMULSplitK", fn_ty) entry = fn.add_entry_block() @@ -276,17 +312,29 @@ def build_verbose( batch = arith.IndexCastOp(IndexType.get(), batch_i32).result cBM = arith.MulIOp(batch, cM).result - num_blocks = arith.IndexCastOp(IndexType.get(), pto.GetBlockNumOp().result).result + num_blocks = arith.IndexCastOp( + IndexType.get(), pto.GetBlockNumOp().result + ).result batches_per_core = arith.CeilDivSIOp(batch, num_blocks).result - bid = arith.IndexCastOp(IndexType.get(), pto.GetBlockIdxOp().result).result + bid = arith.IndexCastOp( + IndexType.get(), pto.GetBlockIdxOp().result + ).result b_start = arith.MulIOp(bid, batches_per_core).result b_end_unclamped = arith.AddIOp(b_start, batches_per_core).result b_end = arith.MinUIOp(b_end_unclamped, batch).result - tvA = pto.MakeTensorViewOp(tensor_type, a_ptr, [cBM, cK], [cK, c1]).result - tvB = pto.MakeTensorViewOp(tensor_type, b_ptr, [cK, cN], [cN, c1]).result - tvOut = pto.MakeTensorViewOp(tensor_type, out_ptr, [cBM, cN], [cN, c1]).result - tvBias = pto.MakeTensorViewOp(tensor_type, bias_ptr, [c1, cN], [cN, c1]).result + tvA = pto.MakeTensorViewOp( + tensor_type, a_ptr, [cBM, 
cK], [cK, c1] + ).result + tvB = pto.MakeTensorViewOp( + tensor_type, b_ptr, [cK, cN], [cN, c1] + ).result + tvOut = pto.MakeTensorViewOp( + tensor_type, out_ptr, [cBM, cN], [cN, c1] + ).result + tvBias = pto.MakeTensorViewOp( + tensor_type, bias_ptr, [c1, cN], [cN, c1] + ).result aMatTile = pto.AllocTileOp(tile_buf_aMat).result bMatTile = pto.AllocTileOp(tile_buf_bMat).result @@ -304,7 +352,10 @@ def build_verbose( for i in scf.for_(c0, cIter, c1): kOff = arith.MulIOp(i, cBASEK).result svA = pto.PartitionViewOp( - tile_view_a, tvA, offsets=[row_off, kOff], sizes=[cTileM, cBASEK] + tile_view_a, + tvA, + offsets=[row_off, kOff], + sizes=[cTileM, cBASEK], ).result svB = pto.PartitionViewOp( tile_view_b, tvB, offsets=[kOff, c0], sizes=[cBASEK, cTileN] @@ -357,7 +408,10 @@ def build_verbose( pto.record_event(TMATMUL, TSTORE_ACC, EVENT_ID0) pto.wait_event(TMATMUL, TSTORE_ACC, EVENT_ID0) svOut = pto.PartitionViewOp( - tile_view_out, tvOut, offsets=[row_off, c0], sizes=[cTileM, cTileN] + tile_view_out, + tvOut, + offsets=[row_off, c0], + sizes=[cTileM, cTileN], ).result pto.TStoreOp(None, cTile, svOut) pto.record_event(TSTORE_ACC, TMATMUL, EVENT_ID0) diff --git a/tests/npu/elementwise_binary_dynamic_multicore/README.md b/tests/npu/elementwise_binary_dynamic_multicore/README.md index 91a50697..bd9f6145 100644 --- a/tests/npu/elementwise_binary_dynamic_multicore/README.md +++ b/tests/npu/elementwise_binary_dynamic_multicore/README.md @@ -59,4 +59,4 @@ Output: `__lib.so` in the same directory. 
- `ptoas` and `bisheng` on `PATH` - `/sources/pto-isa` present -- `torch_npu` installed \ No newline at end of file +- `torch_npu` installed diff --git a/tests/npu/elementwise_binary_dynamic_multicore/binary_builder.py b/tests/npu/elementwise_binary_dynamic_multicore/binary_builder.py index fa57f50a..25c4b6a1 100644 --- a/tests/npu/elementwise_binary_dynamic_multicore/binary_builder.py +++ b/tests/npu/elementwise_binary_dynamic_multicore/binary_builder.py @@ -174,9 +174,7 @@ def _2d( rows_end = row_start + rows_per_core need_truncate = rows_end > rows remaining_rows = rows - row_start - rows_to_process = s.select( - need_truncate, remaining_rows, rows_per_core - ) + rows_to_process = s.select(need_truncate, remaining_rows, rows_per_core) for r in pto.range(c0, rows_to_process, c1): row_idx = r + row_start diff --git a/tests/npu/elementwise_binary_dynamic_multicore/test_binary_builder.py b/tests/npu/elementwise_binary_dynamic_multicore/test_binary_builder.py index 331f169d..bf91ad50 100644 --- a/tests/npu/elementwise_binary_dynamic_multicore/test_binary_builder.py +++ b/tests/npu/elementwise_binary_dynamic_multicore/test_binary_builder.py @@ -19,7 +19,7 @@ ("div", lambda x, y: x / y), ("max", lambda x, y: torch.max(x, y)), ("min", lambda x, y: torch.min(x, y)), - #("or", lambda x, y: x | y), #TODO add back bitwise or test after fixing int16 support in the builder + # ("or", lambda x, y: x | y), #TODO add back bitwise or test after fixing int16 support in the builder ] DTYPES = ["float32", "float16", "int16"] @@ -133,6 +133,7 @@ def test_build_binary_kernels(compiled_lib): @pytest.mark.require_npu def test_binary_1d_precision(compiled_lib): import torch_npu + torch.npu.set_device(_DEVICE) ref_fn = compiled_lib["ref_fn"] torch_dtype = TORCH_DTYPES[compiled_lib["dtype"]] @@ -164,6 +165,7 @@ def test_binary_1d_precision(compiled_lib): @pytest.mark.require_npu def test_binary_2d_precision(compiled_lib): import torch_npu + torch.npu.set_device(_DEVICE) ref_fn = 
compiled_lib["ref_fn"] torch_dtype = TORCH_DTYPES[compiled_lib["dtype"]] diff --git a/tests/npu/expand_dynamic_multicore/caller.py b/tests/npu/expand_dynamic_multicore/caller.py index 698d9b17..ca5f5090 100644 --- a/tests/npu/expand_dynamic_multicore/caller.py +++ b/tests/npu/expand_dynamic_multicore/caller.py @@ -4,7 +4,6 @@ python caller.py --mode colexpand|rowexpand|rowexpand_mul|rowexpand_sub|rowexpand_div """ - _FUSED_MODES = {"rowexpand_mul", "rowexpand_sub", "rowexpand_div"} diff --git a/tests/npu/expand_dynamic_multicore/expand_builder.py b/tests/npu/expand_dynamic_multicore/expand_builder.py index b36a6b66..aea958f7 100644 --- a/tests/npu/expand_dynamic_multicore/expand_builder.py +++ b/tests/npu/expand_dynamic_multicore/expand_builder.py @@ -301,10 +301,14 @@ def _kernel( ) # src0 = one row of Y, src1 = scalar x[row], dst = one row of Z - tb_src0 = pto.alloc_tile(tile_type, valid_row=c1, valid_col=cols_this) + tb_src0 = pto.alloc_tile( + tile_type, valid_row=c1, valid_col=cols_this + ) pto.load(sv_y, tb_src0) - tb_dst = pto.alloc_tile(tile_type, valid_row=c1, valid_col=cols_this) + tb_dst = pto.alloc_tile( + tile_type, valid_row=c1, valid_col=cols_this + ) row_op(tb_src0, tb_src1, tb_dst) pto.store(tb_dst, sv_z) @@ -341,8 +345,8 @@ def build_row_expand_div(dtype="fp32"): args = parser.parse_args() builders = { - "colexpand": build_col_expand, - "rowexpand": build_row_expand, + "colexpand": build_col_expand, + "rowexpand": build_row_expand, "rowexpand_mul": build_row_expand_mul, "rowexpand_sub": build_row_expand_sub, "rowexpand_div": build_row_expand_div, diff --git a/tests/npu/expand_dynamic_multicore/gen_ir.py b/tests/npu/expand_dynamic_multicore/gen_ir.py index d63ae353..649cef89 100644 --- a/tests/npu/expand_dynamic_multicore/gen_ir.py +++ b/tests/npu/expand_dynamic_multicore/gen_ir.py @@ -19,8 +19,8 @@ ) _BUILDERS = { - "colexpand": build_col_expand, - "rowexpand": build_row_expand, + "colexpand": build_col_expand, + "rowexpand": build_row_expand, 
"rowexpand_mul": build_row_expand_mul, "rowexpand_sub": build_row_expand_sub, "rowexpand_div": build_row_expand_div, diff --git a/tests/npu/expand_dynamic_multicore/test_expand.py b/tests/npu/expand_dynamic_multicore/test_expand.py index 5cd0d30a..ac3a4ec9 100644 --- a/tests/npu/expand_dynamic_multicore/test_expand.py +++ b/tests/npu/expand_dynamic_multicore/test_expand.py @@ -21,10 +21,7 @@ "rowexpand_div", ] -_LIB_PATHS = { - name: os.path.join(_DIR, f"{name}_lib.so") - for name in _KERNELS -} +_LIB_PATHS = {name: os.path.join(_DIR, f"{name}_lib.so") for name in _KERNELS} _SHAPES = [ (1, 1), diff --git a/tests/npu/gather_dynamic_multicore/test_gather_dynamic.py b/tests/npu/gather_dynamic_multicore/test_gather_dynamic.py index 507c7259..1be4f4f5 100644 --- a/tests/npu/gather_dynamic_multicore/test_gather_dynamic.py +++ b/tests/npu/gather_dynamic_multicore/test_gather_dynamic.py @@ -23,7 +23,7 @@ ("float16", "P0101"), ("float16", "P1111"), ("float16", "P0001"), - ("float16", "P1010") + ("float16", "P1010"), ] # Runtime shapes (B, N). N must be a multiple of 32. 
@@ -168,7 +168,9 @@ def test_build_gather(compiled_lib): @pytest.mark.require_npu -@pytest.mark.xfail(reason="Known unsolved issues of indeterministic output values", strict=False) +@pytest.mark.xfail( + reason="Known unsolved issues of indeterministic output values", strict=False +) @pytest.mark.parametrize("B, N", _SHAPE_PARAMS) def test_gather_dynamic(compiled_lib, B, N): import torch_npu @@ -195,9 +197,7 @@ def test_gather_dynamic(compiled_lib, B, N): ref = _gather_ref_blocked(src, indices, mask_pattern, num_blocks=NUM_BLOCKS) - torch.testing.assert_close( - out, ref, msg=f"shape=({B},{N}), mask={mask_pattern}" - ) + torch.testing.assert_close(out, ref, msg=f"shape=({B},{N}), mask={mask_pattern}") if __name__ == "__main__": diff --git a/tests/npu/reduce_dynamic_multicore/caller.py b/tests/npu/reduce_dynamic_multicore/caller.py index 2149a565..e22aa443 100644 --- a/tests/npu/reduce_dynamic_multicore/caller.py +++ b/tests/npu/reduce_dynamic_multicore/caller.py @@ -46,4 +46,4 @@ def generate_caller(mode, dtype): parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") args = parser.parse_args() - print(generate_caller(args.mode, args.dtype)) \ No newline at end of file + print(generate_caller(args.mode, args.dtype)) diff --git a/tests/npu/reduce_dynamic_multicore/compile.sh b/tests/npu/reduce_dynamic_multicore/compile.sh index f2915e68..eae6231b 100755 --- a/tests/npu/reduce_dynamic_multicore/compile.sh +++ b/tests/npu/reduce_dynamic_multicore/compile.sh @@ -42,4 +42,4 @@ for MODE in "${MODES[@]}"; do -o "$SCRIPT_DIR/${MODE}_lib.so" echo "Built ${MODE}_lib.so successfully." 
-done \ No newline at end of file +done diff --git a/tests/npu/reduce_dynamic_multicore/gen_ir.py b/tests/npu/reduce_dynamic_multicore/gen_ir.py index 8224f566..d5c4ffe0 100644 --- a/tests/npu/reduce_dynamic_multicore/gen_ir.py +++ b/tests/npu/reduce_dynamic_multicore/gen_ir.py @@ -42,4 +42,4 @@ parser.add_argument("--dtype", choices=["fp16", "fp32"], default="fp32") args = parser.parse_args() - print(_BUILDERS[args.mode](dtype=args.dtype)) \ No newline at end of file + print(_BUILDERS[args.mode](dtype=args.dtype)) diff --git a/tests/npu/reduce_dynamic_multicore/reduce_builder.py b/tests/npu/reduce_dynamic_multicore/reduce_builder.py index a7dadac9..17522ee8 100644 --- a/tests/npu/reduce_dynamic_multicore/reduce_builder.py +++ b/tests/npu/reduce_dynamic_multicore/reduce_builder.py @@ -142,9 +142,7 @@ def _kernel( tv_x = pto.as_tensor( tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] ) - tv_y = pto.as_tensor( - tensor_type, ptr=y_ptr, shape=[batch], strides=[c1] - ) + tv_y = pto.as_tensor(tensor_type, ptr=y_ptr, shape=[batch], strides=[c1]) with pto.if_context(num_rows > c0): tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) @@ -244,9 +242,7 @@ def _kernel( tile_type, valid_row=c1, valid_col=cols_this ) else: - tb_acc = pto.alloc_tile( - tile_out_type, valid_col=cols_this - ) + tb_acc = pto.alloc_tile(tile_out_type, valid_col=cols_this) sv_x0 = pto.slice_view( subtensor_in, @@ -275,9 +271,7 @@ def _kernel( tile_type, valid_row=c1, valid_col=cols_this ) else: - tb_part = pto.alloc_tile( - tile_out_type, valid_col=cols_this - ) + tb_part = pto.alloc_tile(tile_out_type, valid_col=cols_this) sv_x = pto.slice_view( subtensor_in, @@ -364,4 +358,4 @@ def build_colprod(dtype="fp32"): "colprod": build_colprod, } - print(builders[args.mode](dtype=args.dtype)) \ No newline at end of file + print(builders[args.mode](dtype=args.dtype)) diff --git a/tests/npu/reduce_dynamic_multicore/test_reduce.py b/tests/npu/reduce_dynamic_multicore/test_reduce.py index 
879c5b88..c420f06e 100644 --- a/tests/npu/reduce_dynamic_multicore/test_reduce.py +++ b/tests/npu/reduce_dynamic_multicore/test_reduce.py @@ -24,10 +24,7 @@ # "colprod", ] -_LIB_PATHS = { - name: os.path.join(_DIR, f"{name}_lib.so") - for name in _KERNELS -} +_LIB_PATHS = {name: os.path.join(_DIR, f"{name}_lib.so") for name in _KERNELS} _SHAPES = [ (1, 1), @@ -105,7 +102,9 @@ def _output_shape(name, batch, n_cols): def _make_input(name, batch, n_cols, device): if name.endswith("prod"): - return torch.empty(batch, n_cols, device=device, dtype=torch.float32).uniform_(0.5, 1.5) + return torch.empty(batch, n_cols, device=device, dtype=torch.float32).uniform_( + 0.5, 1.5 + ) return torch.randn(batch, n_cols, device=device, dtype=torch.float32) @@ -154,4 +153,4 @@ def test_kernel_precision(compiled_kernels, name, batch, n_cols): if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) \ No newline at end of file + pytest.main([__file__, "-v", "-s"]) From b0a552064f2f3d748b2a63777a2e8e759a7ff1f1 Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 07:37:12 +0100 Subject: [PATCH 30/53] all pre-commit check to CI --- .github/workflows/ci.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b1cbe8e3..ffd0cafa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,6 +8,20 @@ on: workflow_dispatch: jobs: + pre-commit: + name: pre-commit + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Run pre-commit checks + run: | + python -m pip install --upgrade pip + python -m pip install pre-commit + pre-commit run --all-files + test: name: test (${{ matrix.arch }}, ${{ matrix.install-mode }}) strategy: From 07da8424eca7f5b51b4c206e250035a030a9555c Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 07:41:20 +0100 Subject: [PATCH 31/53] minor syntax --- 
examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md b/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md index 2e7e48de..c8bdc750 100644 --- a/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md +++ b/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md @@ -184,7 +184,7 @@ l2_size=201326592 # 192 MiB 8192x8192 矩阵(float16 下 64 MiB)小于 L2;而16384x16384(float16 下 256 MiB)大于 L2,所以后者的性能显著更差。 -`910B4` 的 HBM 和 L2 都是 910B2 的一半(因此更小矩阵就会触发cache eviction): +`910B4` 的 HBM 和 L2 都是 `910B2` 的一半(因此更小矩阵就会触发cache eviction): ```bash grep -A 8 "SoCInfo" ${ASCEND_HOME_PATH}/arm64-linux/data/platform_config/Ascend910B4.ini From c5eba2ef090fac452e5e09e78b30ecf1a15102a3 Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 08:04:29 +0100 Subject: [PATCH 32/53] move pyproject.toml to root dir to enable easy pip install from git --- .github/workflows/ci.yml | 4 ++-- README.md | 18 ++++++++++++++---- ptodsl/pyproject.toml => pyproject.toml | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) rename ptodsl/pyproject.toml => pyproject.toml (96%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ffd0cafa..be69c3fa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -89,9 +89,9 @@ jobs: - name: Install ptodsl (${{ matrix.install-mode }}) run: | if [ "${{ matrix.install-mode }}" = "standard" ]; then - pip install ./ptodsl + pip install . else - pip install -e ./ptodsl + pip install -e . fi - name: Run frontend tests diff --git a/README.md b/README.md index c85b7e6b..78c6df67 100644 --- a/README.md +++ b/README.md @@ -14,14 +14,24 @@ PTO-DSL provides a programming abstraction similar to [cuTile](https://docs.nvid **Compare to other kernel programming frameworks** (e.g. 
[tilelang-ascend](https://github.com/tile-ai/tilelang-ascend), [triton-ascend](https://gitcode.com/Ascend/triton-ascend), and [catlass](https://gitcode.com/cann/catlass)): - PTO-DSL aims for **low-level, explicit, NPU-native primitives** that can match the performance of **programming in [hardware intrinsics](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0001.html)**, filling the gap of a [CuteDSL](https://docs.nvidia.com/cutlass/latest/media/docs/pythonDSL/overview.html)-like low-level Python programming for NPU. -## Environment +## Installation -See [docker](./docker) +Recommend using [Docker-ascend](https://gitcode.com/Ascend/mind-cluster). See [docker](./docker) directory to build all dependencies. -## Installation +Then, install this lightweight DSL package itself: + +```bash +# install +pip install "git+https://github.com/huawei-csl/pto-dsl.git" + +# or stable tag +pip install "git+https://github.com/huawei-csl/pto-dsl.git@" +``` + +For in-place development: ```bash -pip install -e ./ptodsl +pip install -e . ``` ## Usage diff --git a/ptodsl/pyproject.toml b/pyproject.toml similarity index 96% rename from ptodsl/pyproject.toml rename to pyproject.toml index 3d0c169e..b5ef41cd 100644 --- a/ptodsl/pyproject.toml +++ b/pyproject.toml @@ -23,4 +23,4 @@ packages = [ ] [tool.setuptools.package-dir] -ptodsl = "." +ptodsl = "ptodsl" From 16e3c7e7e39eff0f7f15d51079bf78a4a2a718fb Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 08:13:07 +0100 Subject: [PATCH 33/53] add docker links --- README.md | 2 +- docker/README.md | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 78c6df67..cab188da 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ PTO-DSL provides a programming abstraction similar to [cuTile](https://docs.nvid ## Installation -Recommend using [Docker-ascend](https://gitcode.com/Ascend/mind-cluster). 
See [docker](./docker) directory to build all dependencies. +See [docker/README.md](./docker/README.md) directory to build all dependencies for NPU. Then, install this lightweight DSL package itself: diff --git a/docker/README.md b/docker/README.md index e7db3e11..ff1f404c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,4 +1,6 @@ -Usage: +Recommend using [Ascend Docker Runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) for a reproducible env. Install it on top of normal Docker, using `Ascend-docker-runtime*.run` files in the [Release page](https://gitcode.com/Ascend/mind-cluster/releases). + +Then, build and run docker image: ```bash RELEASE_TAG=20260309 From b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316 Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 08:54:24 +0100 Subject: [PATCH 34/53] black --- .../matmul_optimization_guide/experimental/matmul_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py b/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py index b6aff785..a807414c 100644 --- a/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py +++ b/examples/aot/matmul_optimization_guide/experimental/matmul_builder.py @@ -313,7 +313,7 @@ def level1_loop_mn(m_offset, n_offset, li): tile_buf_b_l1_256, tile_buf_b_l0_256, tile_buf_c_256, - *shared_args + *shared_args, ) with branch.else_context(): level1_loop_mn_dynamic_tilesize( @@ -323,7 +323,7 @@ def level1_loop_mn(m_offset, n_offset, li): tile_buf_b_l1_128, tile_buf_b_l0_128, tile_buf_c_128, - *shared_args + *shared_args, ) for li in pto.range(bid, core_loop, num_blocks): From 0a1c0310b37de037a5aefe83c789953796df7d25 Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 09:09:07 +0100 Subject: [PATCH 35/53] update repo links in matmul guide --- .../mamtul_optim_guide_zh.md | 12 ++++++------ 
.../matmul_optimization_guide/matmul_optim_guide.md | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md b/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md index c8bdc750..23a502c1 100644 --- a/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md +++ b/examples/aot/matmul_optimization_guide/mamtul_optim_guide_zh.md @@ -26,7 +26,7 @@ For English version see [matmul_optim_guide.md](./matmul_optim_guide.md) 我们会逐步把一个基于**约100行Python DSL**的算子优化到持平主线库的性能。对照的性能基线是NPU上的`torch.matmul`,内部调用[aclnnMatmul](https://www.hiascend.com/document/detail/zh/canncommercial/850/API/aolapi/context/ops-nn/aclnnMatmul.md)(NPU的“cuBLAS平替”),实现方式为[上万行的AscendC代码](https://gitcode.com/cann/ops-nn/tree/v8.5.0/matmul/mat_mul_v3/op_kernel)。 -本教程的代码坚持:**极简、易于魔改、不带黑盒模板封装**,只提炼**少数最关键的**性能优化点。还有些更全面的、对corner case考虑更细致的Matmul实现例如[Catlass的矩阵乘模板总结](https://gitcode.com/cann/catlass/blob/master/docs/contents/advanced/matmul_template_summary.md)和[AscendC的Matmul性能优化策略总览](https://www.hiascend.com/document/detail/zh/canncommercial/850/opdevg/Ascendcopdevg/atlas_ascendc_best_practices_10_10006.html),把大量优化都藏在了模板和封装里,适合作为后续进阶材料。 +本教程的代码坚持:**极简、易于魔改、不带黑盒模板封装**,只提炼**少数最关键的**性能优化点。还有些更全面的、对corner case考虑更细致的Matmul实现例如[Catlass的矩阵乘模板总结](https://gitcode.com/cann/catlass/blob/v1.4.0/docs/contents/advanced/matmul_template_summary.md)和[AscendC的Matmul性能优化策略总览](https://www.hiascend.com/document/detail/zh/canncommercial/850/opdevg/Ascendcopdevg/atlas_ascendc_best_practices_10_10006.html),把大量优化都藏在了模板和封装里,适合作为后续进阶材料。 # 第 0 步:给 CUDA/Triton 用户的 NPU 编程速通 @@ -37,12 +37,12 @@ For English version see [matmul_optim_guide.md](./matmul_optim_guide.md) ## NPU kernel launch行为 NPU上[SPMD](https://en.wikipedia.org/wiki/Single_program,_multiple_data)风格的kernel看起来和CUDA/Triton语法**似乎很像**: -- 内置变量`block_idx`和`block_num`用于每个core的地址offset计算 -- 
[示例](https://github.com/huawei-csl/pto-dsl/blob/7f8176a648c7c4ca03b09bd75f8b615d4bac0eaf/examples/jit/add_dynamic_multicore/run_add.py#L46-L51) -- CUDA画风的`kernel_name<<>>(args)`kernel launch方式 -- [示例](https://github.com/huawei-csl/pto-dsl/blob/7f8176a648c7c4ca03b09bd75f8b615d4bac0eaf/examples/aot/add_dynamic_multicore/caller.cpp#L11) +- 内置变量`block_idx`和`block_num`用于每个core的地址offset计算 -- [示例](https://github.com/huawei-csl/pto-dsl/blob/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/examples/jit/add_dynamic_multicore/run_add.py) +- CUDA画风的`kernel_name<<>>(args)`kernel launch方式 -- [示例](https://github.com/huawei-csl/pto-dsl/blob/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/examples/aot/elementwise/add_dynamic_multicore/caller.cpp#L11) 其实二者有个关键区别:NPU算子的写法基本都属于CUDA术语里的["persistent kernels"](https://triton-lang.org/main/getting-started/tutorials/09-persistent-matmul.html),也就是`block_dim`等于硬件的核数,而不是随着输入数据size增长。 -例如这个[基于PTO的动态shape向量相加](https://github.com/huawei-csl/pto-dsl/blob/d923ac2ed3c1a2180475c1d279699ea952022e77/examples/jit/add_dynamic_multicore/run_add.py#L46-L100):每个core不仅自己算好global memory offset,计算的循环迭代次数也会[随着动态的输入数据size而增加](https://github.com/huawei-csl/pto-dsl/blob/d923ac2ed3c1a2180475c1d279699ea952022e77/examples/jit/add_dynamic_multicore/run_add.py#L83)。这和常规的(非“persistent”)CUDA/Triton kernel 不一样。比如 [Triton vector add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#compute-kernel) 设定 `grid = (ceil_div(n_elements, BLOCK_SIZE),)`,用launch时动态计算的`block_dim`匹配动态input size;而我们大多数的NPU kernel(不管基于PTO、AscendC、CCE 还是其他框架)通常都是 `grid = (num_cores,)`。 +例如这个[基于PTO的动态shape向量相加](https://github.com/huawei-csl/pto-dsl/blob/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/examples/jit/add_dynamic_multicore/run_add.py#L46-L100):每个core不仅自己算好global memory offset,计算的循环迭代次数也会[随着动态的输入数据size而增加](https://github.com/huawei-csl/pto-dsl/blob/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/examples/jit/add_dynamic_multicore/run_add.py#L83)。这和常规的(非“persistent”)CUDA/Triton kernel 不一样。比如 
[Triton vector add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#compute-kernel) 设定 `grid = (ceil_div(n_elements, BLOCK_SIZE),)`,用launch时动态计算的`block_dim`匹配动态input size;而我们大多数的NPU kernel(不管基于PTO、AscendC、CCE 还是其他框架)通常都是 `grid = (num_cores,)`。 (在NPU上,大于核数的`block_dim`在简单场景能跑通,但Cube-Vector核间同步容易出bug。而且`block_dim >= 65536`会溢出,远小于CUDA的`maxGridSize`。我们遇过这个bug,通过切回“persistent-kernel”写法[修好了](https://github.com/huawei-csl/pto-kernels/pull/39)) @@ -51,7 +51,7 @@ NPU上[SPMD](https://en.wikipedia.org/wiki/Single_program,_multiple_data)风格 NPU的片上缓存为[scratchpad memory](https://en.wikipedia.org/wiki/Scratchpad_memory),而非硬件管理的cache。所以要避免[data hazards](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Data_hazards)需要开发者或编译器正确地使用[set_flag & wait_flag 接口](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0106.html),本质上是基于 [binary semaphore](https://en.wikipedia.org/wiki/Semaphore_(programming)#Producer%E2%80%93consumer_problem) 的同步机制。CUDA里最接近的是[`cp.async`+`wait`那一套](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/async-copies.html)。可以参考这个[基于PTO-ISA手动同步的vector add示例](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/demos/torch_jit/add/add_custom.cpp#L78-L115)。对更复杂的融合算子如[FlashAttention](https://github.com/PTO-ISA/pto-isa/tree/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/kernels/manual/common/flash_atten),思考手动同步、software pipelining 和 prefetching, 对算子开发人员过于烧脑。 -为了解决这个痛点,[PTO-DSL](https://github.com/huawei-csl/pto-dsl) 提供了自动同步,内部由基于[PTO MLIR dialect](https://github.com/zhangstevenunity/PTOAS/blob/8eb9e23fa95e18c3db789e0a171a98df07a8a846/docs/PTO_IR_manual.md)的[InsertSync pass](https://github.com/zhangstevenunity/PTOAS/tree/8eb9e23fa95e18c3db789e0a171a98df07a8a846/lib/PTO/Transforms/InsertSync)实现。对用户而言,算子代码看起来还是“串行的”(在pipelining意义上),写起来更接近Triton/CuTile的手感。 +为了解决这个痛点,[PTO-DSL](https://github.com/huawei-csl/pto-dsl) 提供了自动同步,内部由基于[PTO MLIR 
dialect](https://github.com/zhangstevenunity/PTOAS/blob/v0.9/docs/PTO_IR_manual.md)的[InsertSync pass](https://github.com/zhangstevenunity/PTOAS/tree/v0.9/lib/PTO/Transforms/InsertSync)实现。对用户而言,算子代码看起来还是“串行的”(在pipelining意义上),写起来更接近Triton/CuTile的手感。 # 第 1 步:功能正确的基础版本 @@ -252,7 +252,7 @@ msprof op \ # 附录 A:PTO-DSL 语法说明 -当前的 [PTO-DSL package](https://github.com/huawei-csl/pto-dsl/tree/3f0860b1e750f2c4d26a93c6501a212b60196863/ptodsl) 只是在 PTO dialect 的 [MLIR Python bindings](https://mlir.llvm.org/docs/Bindings/Python/)上做了很薄的封装。整个DSL包只有 **约1000行Python**(可以用 `cd ptodsl && find . -name "*.py" | xargs wc -l` 自行确认) +当前的 [PTO-DSL package](https://github.com/huawei-csl/pto-dsl/tree/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/ptodsl) 只是在 PTO dialect 的 [MLIR Python bindings](https://mlir.llvm.org/docs/Bindings/Python/)上做了很薄的封装。整个DSL包只有 **约1000行Python**(可以用 `cd ptodsl && find . -name "*.py" | xargs wc -l` 自行确认) 为了在开发阶段维持一个简单好改的框架,我们目前**不**做Python AST parsing / AST rewriting。因此,所有 Python 原生语法(包括`if`/`for` 控制流、Python class、iterator 等)都按普通Python代码执行。这点和其他Python DSL的做法不太相同:有的是纯 AST 路线(如 Triton、CuTile),有的是 AST+tracing 混合路线(如 Tilelang、CuteDSL),它们 *可能会,也可能不会* 把原生 `if`/`range` rewrite成特殊 IR builder(可参考 [CuteDSL 的复杂规则](https://github.com/Dao-AILab/quack/blob/v0.3.2/docs/dsl_control_flow.rst))。当前 PTO-DSL frontend 是纯 Python tracing,更接近 JAX 的思路。 diff --git a/examples/aot/matmul_optimization_guide/matmul_optim_guide.md b/examples/aot/matmul_optimization_guide/matmul_optim_guide.md index b36adf36..f8311a67 100644 --- a/examples/aot/matmul_optimization_guide/matmul_optim_guide.md +++ b/examples/aot/matmul_optimization_guide/matmul_optim_guide.md @@ -25,7 +25,7 @@ This guide is the NPU version of "step-by-step matmul optimization", a popular a We show step-by-step how to match the performance of a carefully optimized official library, using **only ~100 lines of Python DSL**. 
The target to compare is `torch.matmul`, which invokes [aclnnMatmul](https://www.hiascend.com/document/detail/zh/canncommercial/850/API/aolapi/context/ops-nn/aclnnMatmul.md) (our "cuBLAS" for NPU), internally implemented by [many thousands of lines of AscendC](https://gitcode.com/cann/ops-nn/tree/v8.5.0/matmul/mat_mul_v3/op_kernel). -I intentionally keep the code samples **minimal, hackable, from-scratch, and without magical templates and wrappers**, to highlight the few key optimizations. There are more comprehensive "Matmul optimizations lists" [in catlass](https://gitcode.com/cann/catlass/blob/master/docs/contents/advanced/matmul_template_summary.md) or [in AscendC](https://www.hiascend.com/document/detail/zh/canncommercial/850/opdevg/Ascendcopdevg/atlas_ascendc_best_practices_10_10006.html), which hide optimization tricks behind templates and wrappers. They are more suited for later, more advanced study. +I intentionally keep the code samples **minimal, hackable, from-scratch, and without magical templates and wrappers**, to highlight the few key optimizations. There are more comprehensive "Matmul optimizations lists" [in catlass](https://gitcode.com/cann/catlass/blob/v1.4.0/docs/contents/advanced/matmul_template_summary.md) or [in AscendC](https://www.hiascend.com/document/detail/zh/canncommercial/850/opdevg/Ascendcopdevg/atlas_ascendc_best_practices_10_10006.html), which hide optimization tricks behind templates and wrappers. They are more suited for later, more advanced study. 
# Step 0: NPU programming crash course for CUDA/Triton programmers @@ -34,12 +34,12 @@ I intentionally keep the code samples **minimal, hackable, from-scratch, and wit ## Typical kernel launch syntax The [SPMD](https://en.wikipedia.org/wiki/Single_program,_multiple_data)-style kernels on NPU look **deceptively similar** to CUDA/Triton kernel syntax: -- The `block_idx` and `block_num` built-in variables assist offset calculations for each core -- [example here](https://github.com/huawei-csl/pto-dsl/blob/7f8176a648c7c4ca03b09bd75f8b615d4bac0eaf/examples/jit/add_dynamic_multicore/run_add.py#L46-L51) -- The CUDA-style `kernel_name<<>>(args)` kernel launch -- [example here](https://github.com/huawei-csl/pto-dsl/blob/7f8176a648c7c4ca03b09bd75f8b615d4bac0eaf/examples/aot/add_dynamic_multicore/caller.cpp#L11) +- The `block_idx` and `block_num` built-in variables assist offset calculations for each core -- [example here](https://github.com/huawei-csl/pto-dsl/blob/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/examples/jit/add_dynamic_multicore/run_add.py) +- The CUDA-style `kernel_name<<>>(args)` kernel launch -- [example here](https://github.com/huawei-csl/pto-dsl/blob/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/examples/aot/elementwise/add_dynamic_multicore/caller.cpp#L11) However, there is an important difference: all NPU kernels are ["persistent kernels"](https://triton-lang.org/main/getting-started/tutorials/09-persistent-matmul.html) in CUDA terminology, i.e. the `block_dim` is forced to be the number of cores instead of growing with the input data size. 
-Check this [PTO dynamic-shape vector-add example](https://github.com/huawei-csl/pto-dsl/blob/d923ac2ed3c1a2180475c1d279699ea952022e77/examples/jit/add_dynamic_multicore/run_add.py#L46-L100) -- each core calculates its own global memory offsets, and the required number of iterations [depends dynamically on the input data size](https://github.com/huawei-csl/pto-dsl/blob/d923ac2ed3c1a2180475c1d279699ea952022e77/examples/jit/add_dynamic_multicore/run_add.py#L83). This is **unlike** conventional ("non-persistent") CUDA/Triton kernels, where a data-dependent `block_dim` handles the dynamic input size. For example, unlike [Triton vector add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#compute-kernel) that sets `grid = (ceil_div(n_elements, BLOCK_SIZE),)`, most of our NPU kernels (no matter whether they are written in PTO, AscendC, CCE, or other frameworks) always have `grid = (num_cores,)`. +Check this [PTO dynamic-shape vector-add example](https://github.com/huawei-csl/pto-dsl/blob/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/examples/jit/add_dynamic_multicore/run_add.py#L46-L100) -- each core calculates its own global memory offsets, and the required number of iterations [depends dynamically on the input data size](https://github.com/huawei-csl/pto-dsl/blob/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/examples/jit/add_dynamic_multicore/run_add.py#L83). This is **unlike** conventional ("non-persistent") CUDA/Triton kernels, where a data-dependent `block_dim` handles the dynamic input size. For example, unlike [Triton vector add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#compute-kernel) that sets `grid = (ceil_div(n_elements, BLOCK_SIZE),)`, most of our NPU kernels (no matter whether they are written in PTO, AscendC, CCE, or other frameworks) always have `grid = (num_cores,)`. 
(A data-dependent large `block_dim` *might* work for simple cases on NPU, but it can often hit bugs during Cube-Vector synchronization, and can also overflow if `block_dim >= 65536` -- a bug [that we fixed](https://github.com/huawei-csl/pto-kernels/pull/39) by switching to persistent-kernel style.) @@ -47,7 +47,7 @@ Check this [PTO dynamic-shape vector-add example](https://github.com/huawei-csl/ Our NPU uses on-chip [scratchpad memory](https://en.wikipedia.org/wiki/Scratchpad_memory) instead of hardware-managed cache, so [data hazards](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Data_hazards) must be avoided by the programmer or software using [set_flag & wait_flag APIs](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0106.html), essentially a [binary-semaphore](https://en.wikipedia.org/wiki/Semaphore_(programming)#Producer%E2%80%93consumer_problem) synchronization mechanism. The closest analogy in CUDA is [all the `cp.async` stuff](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/async-copies.html) that needs manual waits. See this [manually synchronized vector-add example](https://github.com/PTO-ISA/pto-isa/blob/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/demos/torch_jit/add/add_custom.cpp#L78-L115). For complex fused kernels like [FlashAttention](https://github.com/PTO-ISA/pto-isa/tree/5de2d24d53e8cf39dec5fc11f997d1e74fa7190c/kernels/manual/common/flash_atten), it can be hard to reason about manual synchronization, software pipelining, and prefetching. -To solve this headache, [PTO-DSL](https://github.com/huawei-csl/pto-dsl) offers automatic synchronization, internally achieved by the [InsertSync](https://github.com/zhangstevenunity/PTOAS/tree/8eb9e23fa95e18c3db789e0a171a98df07a8a846/lib/PTO/Transforms/InsertSync) compile pass based on the [PTO MLIR dialect](https://github.com/zhangstevenunity/PTOAS/blob/8eb9e23fa95e18c3db789e0a171a98df07a8a846/docs/PTO_IR_manual.md). 
The kernel code still looks "sequential" (in the pipelining dimension), similar to writing Triton or CuTile code. +To solve this headache, [PTO-DSL](https://github.com/huawei-csl/pto-dsl) offers automatic synchronization, internally achieved by the [InsertSync](https://github.com/zhangstevenunity/PTOAS/tree/v0.9/lib/PTO/Transforms/InsertSync) compile pass based on the [PTO MLIR dialect](https://github.com/zhangstevenunity/PTOAS/blob/v0.9/docs/PTO_IR_manual.md). The kernel code still looks "sequential" (in the pipelining dimension), similar to writing Triton or CuTile code. # Step 1: Functionally-correct naive version @@ -244,7 +244,7 @@ Even with manual sync, the code only increases from ~100 lines to ~150 lines of # Appendix A: PTO-DSL syntax note -The current [PTO-DSL package](https://github.com/huawei-csl/pto-dsl/tree/3f0860b1e750f2c4d26a93c6501a212b60196863/ptodsl) is just a very thin wrapper over the [MLIR Python bindings](https://mlir.llvm.org/docs/Bindings/Python/) of PTO dialect. The entire package has **only ~1000 lines of Python** (you can check by `cd ptodsl && find . -name "*.py" | xargs wc -l`). +The current [PTO-DSL package](https://github.com/huawei-csl/pto-dsl/tree/b9b0c4abdcb84b84db53f27ffcb4ce8aa1b67316/ptodsl) is just a very thin wrapper over the [MLIR Python bindings](https://mlir.llvm.org/docs/Bindings/Python/) of PTO dialect. The entire package has **only ~1000 lines of Python** (you can check by `cd ptodsl && find . -name "*.py" | xargs wc -l`). To keep the framework simple during rapid development, we are NOT using Python AST parsing or AST rewriting. Thus, all Python-native constructs (`if`/`for` control flows, Python classes, iterators, etc.) execute like normal Python code. This is unlike other pure-AST (the case for Triton & CuTile) or hybrid AST+tracing (the case for Tilelang & CuteDSL) frontends that *might or might not* rewrite native `if`/`range` as special IR builders (e.g. 
see the [complex rules for CuteDSL](https://github.com/Dao-AILab/quack/blob/v0.3.2/docs/dsl_control_flow.rst)). The current PTO-DSL frontend is pure Python tracing, most like JAX's approach. From 577097c80b486ebbc0f287dcf39d038910f1746f Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 09:38:25 +0100 Subject: [PATCH 36/53] polish README for 0.1.0 release --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index cab188da..8754caa4 100644 --- a/README.md +++ b/README.md @@ -11,26 +11,30 @@ PTO-DSL provides a programming abstraction similar to [cuTile](https://docs.nvid - Easily interface with [torch-npu](https://gitcode.com/ascend/pytorch) - Lightweight, open-source compiler stack using [PTO Assembler](https://github.com/zhangstevenunity/PTOAS) -**Compare to other kernel programming frameworks** (e.g. [tilelang-ascend](https://github.com/tile-ai/tilelang-ascend), [triton-ascend](https://gitcode.com/Ascend/triton-ascend), and [catlass](https://gitcode.com/cann/catlass)): -- PTO-DSL aims for **low-level, explicit, NPU-native primitives** that can match the performance of **programming in [hardware intrinsics](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0001.html)**, filling the gap of a [CuteDSL](https://docs.nvidia.com/cutlass/latest/media/docs/pythonDSL/overview.html)-like low-level Python programming for NPU. +PTO-DSL aims for **low-level, explicit, NPU-native primitives** that can match the performance of **programming in [hardware intrinsics](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0001.html)**. 
Compared to other (also very good) kernel programming frameworks, it has a bit different scope by design: +- vs [tilelang-ascend](https://github.com/tile-ai/tilelang-ascend): tilelang can also [use PTO-ISA as codegen backend](https://github.com/tile-ai/tilelang-ascend/blob/76553755da078479a7f60cce9c5f0e9a24d0008b/src/target/codegen_ascend_pto.cc). PTO-DSL intentionally exposes lower-level control, for example L2 swizzling is one-liner `T.use_swizzle` in tilelang, but is a user-defined custom function in PTO-DSL -- see this [matmul optimization example](examples/aot/matmul_optimization_guide/matmul_optim_guide.md). Once PTO-DSL is more stabilized, it might serve as a component like the [CuteDSL backend for tilelang](https://github.com/tile-ai/tilelang/blob/v0.1.8/src/target/codegen_cutedsl.cc). +- vs [triton-ascend](https://gitcode.com/Ascend/triton-ascend) -- Both frameworks automate software pipelining based on some MLIR dialects for NPU. PTO-DSL exposes more NPU-native memory hierarchy such as `L0`/`L1`/`UB`. Also, `pto.load`/`pto.store` always maps to native efficient DMA instructions, while `tl.load`/`tl.store` tries to do GPU-style memory coalescing. +- vs [Catlass](https://gitcode.com/cann/catlass): Catlass provides expert-optimized template collections, while PTO-DSL is more like the [CuteDSL](https://docs.nvidia.com/cutlass/latest/media/docs/pythonDSL/overview.html) layer of Cutlass, offering explicit low-level primitives. ## Installation -See [docker/README.md](./docker/README.md) directory to build all dependencies for NPU. +See [docker/README.md](./docker/README.md) for full reproducible dependencies on NPU. 
Then, install this lightweight DSL package itself: ```bash -# install -pip install "git+https://github.com/huawei-csl/pto-dsl.git" +# install latest commit +pip install git+https://github.com/huawei-csl/pto-dsl.git # or stable tag -pip install "git+https://github.com/huawei-csl/pto-dsl.git@" +pip install git+https://github.com/huawei-csl/pto-dsl.git@0.1.0 ``` For in-place development: ```bash +git clone https://github.com/huawei-csl/pto-dsl.git +cd pto-dsl pip install -e . ``` From 0f5255b05f46ffc98a690006b3ff57c74a27da7c Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sat, 14 Mar 2026 09:54:45 +0100 Subject: [PATCH 37/53] add comparison to pypto --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8754caa4..20c92bff 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,9 @@ PTO-DSL provides a programming abstraction similar to [cuTile](https://docs.nvid PTO-DSL aims for **low-level, explicit, NPU-native primitives** that can match the performance of **programming in [hardware intrinsics](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0001.html)**. Compared to other (also very good) kernel programming frameworks, it has a bit different scope by design: - vs [tilelang-ascend](https://github.com/tile-ai/tilelang-ascend): tilelang can also [use PTO-ISA as codegen backend](https://github.com/tile-ai/tilelang-ascend/blob/76553755da078479a7f60cce9c5f0e9a24d0008b/src/target/codegen_ascend_pto.cc). PTO-DSL intentionally exposes lower-level control, for example L2 swizzling is one-liner `T.use_swizzle` in tilelang, but is a user-defined custom function in PTO-DSL -- see this [matmul optimization example](examples/aot/matmul_optimization_guide/matmul_optim_guide.md). Once PTO-DSL is more stabilized, it might serve as a component like the [CuteDSL backend for tilelang](https://github.com/tile-ai/tilelang/blob/v0.1.8/src/target/codegen_cutedsl.cc). 
-- vs [triton-ascend](https://gitcode.com/Ascend/triton-ascend) -- Both frameworks automate software pipelining based on some MLIR dialects for NPU. PTO-DSL exposes more NPU-native memory hierarchy such as `L0`/`L1`/`UB`. Also, `pto.load`/`pto.store` always maps to native efficient DMA instructions, while `tl.load`/`tl.store` tries to do GPU-style memory coalescing. +- vs [triton-ascend](https://gitcode.com/Ascend/triton-ascend): Both frameworks automate software pipelining based on some MLIR dialects for NPU. PTO-DSL exposes more NPU-native memory hierarchy such as `L0`/`L1`/`UB`. Also, `pto.load`/`pto.store` always maps to native efficient DMA instructions, while `tl.load`/`tl.store` tries to do GPU-style memory coalescing. - vs [Catlass](https://gitcode.com/cann/catlass): Catlass provides expert-optimized template collections, while PTO-DSL is more like the [CuteDSL](https://docs.nvidia.com/cutlass/latest/media/docs/pythonDSL/overview.html) layer of Cutlass, offering explicit low-level primitives. +- vs [PyPTO](https://gitcode.com/cann/pypto): PyPTO is a full [MPMD](https://en.wikipedia.org/wiki/Flynn%27s_taxonomy#Multiple_programs,_multiple_data_streams_(MPMD)) dynamic runtime stack, which also [uses PTO-ISA as lowest-level primitive](https://gitcode.com/cann/pypto/tree/r0.1.1/framework/src/interface/tileop). PyPTO's Tensor API abstraction is closer to PyTorch/JAX level, while a PTO-DSL kernel is still [SPMD](https://en.wikipedia.org/wiki/Single_program,_multiple_data) and is closer to CuTile/CuteDSL level. 
## Installation From bf107ecda9d1d444e3406176bb9ce34e2bd515ea Mon Sep 17 00:00:00 2001 From: learning-chip Date: Sun, 15 Mar 2026 15:51:08 +0100 Subject: [PATCH 38/53] move framework comparison to bottom --- README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 20c92bff..84ab41cc 100644 --- a/README.md +++ b/README.md @@ -11,12 +11,6 @@ PTO-DSL provides a programming abstraction similar to [cuTile](https://docs.nvid - Easily interface with [torch-npu](https://gitcode.com/ascend/pytorch) - Lightweight, open-source compiler stack using [PTO Assembler](https://github.com/zhangstevenunity/PTOAS) -PTO-DSL aims for **low-level, explicit, NPU-native primitives** that can match the performance of **programming in [hardware intrinsics](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0001.html)**. Compared to other (also very good) kernel programming frameworks, it has a bit different scope by design: -- vs [tilelang-ascend](https://github.com/tile-ai/tilelang-ascend): tilelang can also [use PTO-ISA as codegen backend](https://github.com/tile-ai/tilelang-ascend/blob/76553755da078479a7f60cce9c5f0e9a24d0008b/src/target/codegen_ascend_pto.cc). PTO-DSL intentionally exposes lower-level control, for example L2 swizzling is one-liner `T.use_swizzle` in tilelang, but is a user-defined custom function in PTO-DSL -- see this [matmul optimization example](examples/aot/matmul_optimization_guide/matmul_optim_guide.md). Once PTO-DSL is more stabilized, it might serve as a component like the [CuteDSL backend for tilelang](https://github.com/tile-ai/tilelang/blob/v0.1.8/src/target/codegen_cutedsl.cc). -- vs [triton-ascend](https://gitcode.com/Ascend/triton-ascend): Both frameworks automate software pipelining based on some MLIR dialects for NPU. PTO-DSL exposes more NPU-native memory hierarchy such as `L0`/`L1`/`UB`. 
Also, `pto.load`/`pto.store` always maps to native efficient DMA instructions, while `tl.load`/`tl.store` tries to do GPU-style memory coalescing. -- vs [Catlass](https://gitcode.com/cann/catlass): Catlass provides expert-optimized template collections, while PTO-DSL is more like the [CuteDSL](https://docs.nvidia.com/cutlass/latest/media/docs/pythonDSL/overview.html) layer of Cutlass, offering explicit low-level primitives. -- vs [PyPTO](https://gitcode.com/cann/pypto): PyPTO is a full [MPMD](https://en.wikipedia.org/wiki/Flynn%27s_taxonomy#Multiple_programs,_multiple_data_streams_(MPMD)) dynamic runtime stack, which also [uses PTO-ISA as lowest-level primitive](https://gitcode.com/cann/pypto/tree/r0.1.1/framework/src/interface/tileop). PyPTO's Tensor API abstraction is closer to PyTorch/JAX level, while a PTO-DSL kernel is still [SPMD](https://en.wikipedia.org/wiki/Single_program,_multiple_data) and is closer to CuTile/CuteDSL level. - ## Installation See [docker/README.md](./docker/README.md) for full reproducible dependencies on NPU. @@ -46,3 +40,11 @@ See [examples](./examples) and [tests](./tests) ## Contribute See [contribute_guide.md](./contribute_guide.md) + +## Compare to other frameworks + +PTO-DSL aims for **low-level, explicit, NPU-native primitives** that can match the performance of **programming in [hardware intrinsics](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850/API/cceintrinsicapi/cceapi_0001.html)**. Compared to other (also very good) kernel programming frameworks, it has a bit different scope by design: +- vs [tilelang-ascend](https://github.com/tile-ai/tilelang-ascend): tilelang can also [use PTO-ISA as codegen backend](https://github.com/tile-ai/tilelang-ascend/blob/76553755da078479a7f60cce9c5f0e9a24d0008b/src/target/codegen_ascend_pto.cc). 
PTO-DSL intentionally exposes lower-level control, for example L2 swizzling is one-liner `T.use_swizzle` in tilelang, but is a user-defined custom function in PTO-DSL -- see this [matmul optimization example](examples/aot/matmul_optimization_guide/matmul_optim_guide.md). Once PTO-DSL is more stabilized, it might serve as a component like the [CuteDSL backend for tilelang](https://github.com/tile-ai/tilelang/blob/v0.1.8/src/target/codegen_cutedsl.cc). +- vs [triton-ascend](https://gitcode.com/Ascend/triton-ascend): Both frameworks automate software pipelining based on some MLIR dialects for NPU. PTO-DSL exposes more NPU-native memory hierarchy such as `L0`/`L1`/`UB`. Also, `pto.load`/`pto.store` always maps to native efficient DMA instructions, while `tl.load`/`tl.store` tries to do GPU-style memory coalescing. +- vs [Catlass](https://gitcode.com/cann/catlass): Catlass provides expert-optimized template collections, while PTO-DSL is more like the [CuteDSL](https://docs.nvidia.com/cutlass/latest/media/docs/pythonDSL/overview.html) layer of Cutlass, offering explicit low-level primitives. +- vs [PyPTO](https://gitcode.com/cann/pypto): PyPTO is a full [MPMD](https://en.wikipedia.org/wiki/Flynn%27s_taxonomy#Multiple_programs,_multiple_data_streams_(MPMD)) dynamic runtime stack, which also [uses PTO-ISA as lowest-level primitive](https://gitcode.com/cann/pypto/tree/r0.1.1/framework/src/interface/tileop). PyPTO's Tensor API abstraction is closer to PyTorch/JAX level, while a PTO-DSL kernel is still [SPMD](https://en.wikipedia.org/wiki/Single_program,_multiple_data) and is closer to CuTile/CuteDSL level. 
From 8a64455fa9739a8faa7b1108adefddf6909a8e77 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Mon, 16 Mar 2026 17:11:37 +0100 Subject: [PATCH 39/53] minimum agent skills to translate PTO-ISA cpp to PTO-DSL python (#89) * minimum agent skills to translate PTO-ISA cpp to PTO-DSL python * move example generation to Non-Negotiable Rules * more explicit rule for ref example checking * pre-commit run --all-files --------- Co-authored-by: jiawei_zhuang --- .agent/skills/translate_cpp2py/SKILL.md | 176 + .../references/example_translation/.gitkeep | 0 .../references/ptoas_source/PTOOps.td | 3648 ++++++++ .../references/ptoas_source/PTOToEmitC.cpp | 7713 +++++++++++++++++ .../references/ptoas_source/README.md | 4 + .../references/ptoas_source/pto.py | 280 + .../references/ptoisa_source/README.md | 6 + .../references/ptoisa_source/pto-inst.hpp | 830 ++ .../scripts/collect_example_translate.py | 276 + .../scripts/example_list.json | 141 + 10 files changed, 13074 insertions(+) create mode 100644 .agent/skills/translate_cpp2py/SKILL.md create mode 100644 .agent/skills/translate_cpp2py/references/example_translation/.gitkeep create mode 100644 .agent/skills/translate_cpp2py/references/ptoas_source/PTOOps.td create mode 100644 .agent/skills/translate_cpp2py/references/ptoas_source/PTOToEmitC.cpp create mode 100644 .agent/skills/translate_cpp2py/references/ptoas_source/README.md create mode 100644 .agent/skills/translate_cpp2py/references/ptoas_source/pto.py create mode 100644 .agent/skills/translate_cpp2py/references/ptoisa_source/README.md create mode 100644 .agent/skills/translate_cpp2py/references/ptoisa_source/pto-inst.hpp create mode 100644 .agent/skills/translate_cpp2py/scripts/collect_example_translate.py create mode 100644 .agent/skills/translate_cpp2py/scripts/example_list.json diff --git a/.agent/skills/translate_cpp2py/SKILL.md b/.agent/skills/translate_cpp2py/SKILL.md new file mode 100644 index 00000000..533bfabf --- 
/dev/null +++ b/.agent/skills/translate_cpp2py/SKILL.md @@ -0,0 +1,176 @@ +--- +name: translate-cpp2py +description: Translate manual PTO-ISA C++ kernels into PTO-DSL Python builders and verification harnesses. Use when converting pto-isa kernel code to ptodsl, generating .pto/.cpp via ptoas, handling manual vs auto sync variants, separating vector vs cube APIs, or adding missing ptodsl API wrappers. +--- + +# Translate PTO-ISA C++ to PTO-DSL + +## Scope + +This skill converts a manually written PTO C++ kernel into: +- a **manual-sync** PTO-DSL Python builder (must mirror source C++ behavior), +- an **auto-sync** PTO-DSL variant (same math/control flow, sync removed), +- generated `.pto` and `.cpp`, +- launcher and runtime correctness test scripts. + +Primary references are under `references/example_translation`. Only consult long compiler/dialect sources when mapping is missing. + +## Required Outputs Per Translation Task + +Produce all of the following unless user asks otherwise: +- Python builder for **manual-sync** kernel. +- Python builder for **auto-sync** kernel. +- Compile scripts: + - manual: `python builder.py > kernel.pto && ptoas kernel.pto -o kernel.cpp` + - auto: `python builder.py > kernel.pto && ptoas --enable-insert-sync kernel.pto -o kernel.cpp` +- `caller.cpp` kernel launcher with correct ABI and launch geometry. +- `run_*.py` load-and-test script to validate numerical correctness. +- `README.md` with minimal usage commands (compile + run + optional bench), following concise style used in `examples/aot/*/README.md`. + +## Non-Negotiable Rules + +1. Input C++ is manual-sync by default. Port to manual-sync Python first. +2. Then create auto-sync variant by removing explicit sync APIs and compiling with `--enable-insert-sync`. +3. Preserve ABI exactly: function name, argument order/types, launch contract. +4. Match section type exactly: vector (`__DAV_VEC__`) vs cube (`__DAV_CUBE__`). +5. Prefer compact Python; preserve semantics, not C++ verbosity. 
+6. If a wrapper is missing in `ptodsl/api`, add it instead of forcing an awkward translation. +7. First check whether the directory `references/example_translation` is empty or contains too few examples. + If so, ask the user to run `scripts/collect_example_translate.py` to generate the full set of Python-C++ mapping examples. + + +## Translation Workflow + +1. **Classify kernel** + - Determine section: vector vs cube. + - Determine sync style: manual vs auto (source C++ is manual). + - Identify core partitioning pattern (block/subblock/batch split). + +2. **Rebuild signature + metadata first** + - Define `meta_data()` with scalar/index/pointer/tensor/subtensor/tile types. + - Use `@to_ir_module(meta_data=meta_data)`. + - Keep argument order identical to C++ kernel ABI. + +3. **Port runtime control flow** + - Use `pto.range`, `pto.if_context`, `pto.cond` for runtime logic. + - Keep all tail guards and truncation branches. + +4. **Port data movement + tile math** + - Build tensors via `pto.as_tensor`. + - Create subviews with `pto.slice_view`. + - Allocate tiles with `pto.alloc_tile`. + - Map load/store/compute ops 1:1 (see mapping rules below). + +5. **Handle synchronization** + - Manual variant: keep explicit event/barrier calls. + - Auto variant: remove manual sync calls, keep op order, compile with insert-sync pass. + +6. **Generate and verify round-trip** + - Emit `.pto`, compile to `.cpp`, and sanity-check structural equivalence. + - Build `.so` with `caller.cpp`. + - Run Python test script against reference (`torch` or equivalent). + +## Sync Modes (Must Explain in Every Task) + +- **Manual sync mode** + - Python uses explicit sync APIs in `ptodsl/api/synchronization.py`. + - Typical APIs: `record_event`, `wait_event`, `record_wait_pair`, `barrier`. + - Compile with plain `ptoas` (no `--enable-insert-sync`). + - Use for direct mirroring of manual C++ or for hand-tuned pipelines. + +- **Auto sync mode** + - Remove explicit sync APIs from Python DSL.
+ - Compile with `ptoas --enable-insert-sync`. + - Compiler inserts hazard-handling synchronization. + - Use for simpler maintainable variant with same algorithmic behavior. + +Rule of thumb: one kernel variant should use one sync strategy only. + +## Vector vs Cube Section/API Boundaries + +- **Vector kernels** + - Use `with pto.vector_section():` + - Lowers to `#if defined(__DAV_VEC__)`. + - Typical ops: elementwise/reduction/vector dataflow (`tile.add/sub/mul/div/...`). + +- **Cube kernels** + - Use `with pto.cube_section():` + - Lowers to `#if defined(__DAV_CUBE__)`. + - Typical ops: matrix engines (`tile.matmul`, `tile.matmul_acc`, `tile.matmul_bias`). + +- **API surface filtering** + - Vector-only example: `tile.add` in `ptodsl/api/tile.py`. + - Cube-only example: `tile.matmul` in `ptodsl/api/tile.py`. + - Keep agent search narrow: choose section first, then look only at relevant API family. + +## Compact Mapping Rules (Python -> C++) + +1. `@to_ir_module` function -> emitted `__global__ AICORE void ...`. +2. `PtrType(dtype)` -> C++ GM pointer arg type. +3. `TensorType/SubTensorType` + `as_tensor/slice_view` -> `GlobalTensor` objects/views. +4. `TileBufType(memory_space=...)` + `alloc_tile` -> tile declarations in corresponding memory space. +5. `pto.get_block_idx/get_block_num/get_subblock_idx/get_subblock_num` -> runtime core/subcore intrinsics. +6. `s.const/s.index_cast/s.ceil_div/s.select/min` -> scalar arithmetic + branch/select expressions. +7. `pto.range(...)` -> runtime loop in IR/C++. +8. Python `range(...)` -> build-time unroll/metaprogramming. +9. `pto.if_context(...)` / `pto.cond(...)` -> runtime conditional branches. +10. Python `if` -> build-time branch while constructing IR. +11. `pto.load` / `pto.store` -> load/store tile movement ops. +12. `tile.add/sub/mul/div/relu/exp/...` -> corresponding PTO compute intrinsics. +13. `tile.matmul*` family -> cube matmul intrinsics. +14. 
Multicore distribution usually maps via: + - vector core id = `block_idx * subblock_num + subblock_idx` (there are twice as many vector cores as cube cores, so `subblock_num` equals 2) + - tiles per core = ceil-div(total tiles, total cores) + - guarded tail processing for final core(s). +15. Dynamic-shape kernels require explicit bound guards before slicing/loading/storing. + +## Runtime Semantics Reminder (Critical) + +PTO-DSL is Python tracing, not AST rewriting: +- Python-native `if/for` executes at build time, similar to C++ compile-time metaprogramming or loop unrolling. +- Only `pto.range`, `pto.if_context`, and `pto.cond` represent runtime control flow in the generated kernel. + +Never translate runtime C++ control logic into Python-native `if/range` by mistake. + +## Missing API Wrapper Protocol + +If a required C++ op has no convenient Python wrapper: + +1. Add thin wrapper in the right module: + - tile/instruction ops -> `ptodsl/api/tile.py` + - general tensor/control helpers -> `ptodsl/api/pto_general.py` + - sync helpers -> `ptodsl/api/synchronization.py` +2. Re-export through `ptodsl/api/pto.py` when needed. +3. Keep wrapper minimal: pass through to MLIR Python binding op with light argument normalization. + +## Escalation Path (Only When Mapping Is Missing) + +Check in order: +1. MLIR Python op bindings: `references/ptoas_source/pto.py` +2. Dialect op definitions/contracts: `references/ptoas_source/PTOOps.td` +3. C++ codegen lowering: `references/ptoas_source/PTOToEmitC.cpp` +4. ISA semantics: `references/ptoisa_source/pto-inst.hpp` + +If an op exists in the dialect but is not lowered in `PTOToEmitC.cpp`, translation requires PTOAS compiler work (not only DSL wrapper work). +In this case, suggest filing an issue with the PTOAS project (https://github.com/zhangstevenunity/PTOAS). + +## Round-Trip Verification Checklist + +- [ ] Manual-sync Python version created first and compiles with plain `ptoas`. +- [ ] Auto-sync variant created and compiles with `--enable-insert-sync`.
+- [ ] Generated C++ keeps ABI/section/loop/tail semantics. +- [ ] Launcher `caller.cpp` matches kernel symbol and launch parameters. +- [ ] Test script loads `.so`, runs multiple shapes (including tail/non-divisible cases), compares against trusted reference. +- [ ] If multicore kernel: test cases include shapes not multiples of core count. +- [ ] `README.md` documents the exact local commands to compile and run verification. + +## Reference Priority + +Use these first: +- `references/example_translation/**` (primary mapping corpus) +- `references/example_translation/fast_hadamard/**` (manual vs auto sync pair) +- `references/example_translation/batch_matmul/**` (cube kernels) +- `examples/aot/elementwise/add_dynamic_multicore/*` (caller/test/build pattern) +- `examples/aot/matmul_optimization_guide/matmul_optim_guide.md` (sync and runtime-control semantics) + +Consult `references/ptoas_source/**` and ISA headers only for patterns not covered by examples. diff --git a/.agent/skills/translate_cpp2py/references/example_translation/.gitkeep b/.agent/skills/translate_cpp2py/references/example_translation/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/.agent/skills/translate_cpp2py/references/ptoas_source/PTOOps.td b/.agent/skills/translate_cpp2py/references/ptoas_source/PTOOps.td new file mode 100644 index 00000000..b46efe33 --- /dev/null +++ b/.agent/skills/translate_cpp2py/references/ptoas_source/PTOOps.td @@ -0,0 +1,3648 @@ +//===- PTOOps.td - Pattern descriptor operations -----------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the PTO dialect operations. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_PTO_IR_PTOOPS +#define MLIR_DIALECT_PTO_IR_PTOOPS + +include "PTO/IR/PTODialect.td" +include "PTO/IR/PTOAttrs.td" +include "PTO/IR/PTOTypeDefs.td" +include "PTO/IR/PTOInterfaces.td" + +include "mlir/IR/OpBase.td" +include "mlir/IR/OpAsmInterface.td" +include "mlir/IR/SymbolInterfaces.td" + +include "mlir/Interfaces/DestinationStyleOpInterface.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/ViewLikeInterface.td" + +//===----------------------------------------------------------------------===// +// Types +//===----------------------------------------------------------------------===// + +def TensorOrMemref : + AnyTypeOf<[AnyMemRef, AnyRankedTensor], "Tensor or Memref">; + +def PTODpsType : + AnyTypeOf<[AnyRankedTensor, AnyMemRef, PartitionTensorViewType, TileBufType]>; + +def PtrOrMemRef : + AnyTypeOf<[PtrType, AnyMemRef], "Ptr or MemRef">; + +def ScalarPtrOrMemRef : + TypeConstraint< + CPred<"::mlir::pto::isScalarPtrOrMemRef($_self)">, + "Ptr or MemRef in GM">; + +def PrintScalarType : + AnyTypeOf<[Index, AnySignlessInteger, AnyFloat], "numeric (index/integer/float)">; + +//===----------------------------------------------------------------------===// +// Op Class +//===----------------------------------------------------------------------===// +class PTO_TOp traits = []> + : Op; + +class PTO_DpsOp traits = []> + : Op { + let extraClassDeclaration = [{ + ::mlir::MutableOperandRange getDpsInitsMutable() { + return getDstMutable(); + } + }]; +} + +class PTO_Op traits = []> + : Op; + +//===----------------------------------------------------------------------===// +// Pointer/View Ops (for your front-end IR) +//===----------------------------------------------------------------------===// + +def AddPtrOp : PTO_Op<"addptr", [ + Pure, + AllTypesMatch<["ptr", "result"]>, + 
DeclareOpInterfaceMethods + ]> { + let summary = "Add an element offset to a !pto.ptr"; + let description = [{ + Computes a new pointer by adding an element offset to the base pointer. + The offset is in elements (not bytes). + }]; + + let arguments = (ins + PtrType:$ptr, + Index:$offset + ); + + let results = (outs PtrType:$result); + + let hasVerifier = 1; + + let assemblyFormat = [{ + $ptr `,` $offset attr-dict `:` type($ptr) `->` type($result) + }]; +} + +//===----------------------------------------------------------------------===// +// Scalar pointer load/store +//===----------------------------------------------------------------------===// + +def LoadScalarOp : PTO_Op<"load_scalar", [ + DeclareOpInterfaceMethods + ]> { + let summary = "Load a single scalar element from a pointer at offset."; + + let arguments = (ins + ScalarPtrOrMemRef:$ptr, + Index:$offset + ); + + let results = (outs AnyType:$value); + + let hasVerifier = 1; + + let assemblyFormat = [{ + $ptr `[` $offset `]` attr-dict `:` type($ptr) `->` type($value) + }]; +} + +def StoreScalarOp : PTO_Op<"store_scalar", [ + DeclareOpInterfaceMethods + ]> { + let summary = "Store a single scalar element to a pointer at offset."; + + let arguments = (ins + ScalarPtrOrMemRef:$ptr, + Index:$offset, + AnyType:$value + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + $value `,` $ptr `[` $offset `]` attr-dict `:` type($ptr) `,` type($value) + }]; +} + +def MakeTensorViewOp : PTO_Op<"make_tensor_view", [AttrSizedOperandSegments]> { + let summary = "Wrap a pointer as a tensor_view descriptor (no allocation, no copy)."; + + let arguments = (ins + AnyType:$ptr, + Variadic:$shape, + Variadic:$strides, + OptionalAttr:$layout + ); + + let results = (outs AnyType:$result); + + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; +} + +// ============================================================================= +// PartitionViewOp +// 
============================================================================= +def PartitionViewOp : PTO_Op<"partition_view", [AttrSizedOperandSegments]> { + let summary = "Partition a tensor view into a smaller logical view (logical slicing)."; + let description = [{ + Captures a specific calculation region from a large view. + It carries offsets (where to read) and sizes (how much to read). + + IR Example: + %1 = pto.partition_view %0, offsets=[...], sizes=[...] + : !pto.tensor_view -> !pto.partition_tensor_view + }]; + + let arguments = (ins + TensorViewType:$source, // Input: the underlying physical base view (result of MakeTensorViewOp) + Variadic:$offsets, // Dynamic offsets + Variadic:$sizes // Dynamic sizes + ); + + let results = (outs PartitionTensorViewType:$result); // Output: the logical slice + + let assemblyFormat = [{ + $source `,` `offsets` `=` `[` $offsets `]` `,` `sizes` `=` `[` $sizes `]` + attr-dict `:` qualified(type($source)) `->` qualified(type($result)) + }]; +} + +// Helper: tensor_view or memref (after lowering tensor_view to memref). +def TensorViewOrMemRef : + AnyTypeOf<[TensorViewType, AnyMemRef], "TensorView or MemRef">; + +// Get the size of a dimension of a tensor_view or its lowered memref view. +// Result type: Index (use arith.index_cast if i32 is needed). +def GetTensorViewDimOp : PTO_Op<"get_tensor_view_dim", [Pure]> { + let summary = "Get the size of a dimension of a tensor_view."; + let description = [{ + Returns the size of the given dimension of a logical tensor view. + This op accepts either !pto.tensor_view or the memref it is lowered to.
+ IR (tensor_view form): + %dim_size = pto.get_tensor_view_dim %tv, %dim_index + : !pto.tensor_view, index -> index + IR (memref form, after lowering): + %dim_size = pto.get_tensor_view_dim %mr, %dim_index + : memref<...>, index -> index + }]; + let arguments = (ins + TensorViewOrMemRef:$tensor_view, + Index:$dim_index + ); + let results = (outs Index:$result); + let assemblyFormat = [{ + $tensor_view `,` $dim_index `:` qualified(type($tensor_view)) `->` qualified(type($result)) + attr-dict + }]; +} + +def AllocTileOp : PTO_Op<"alloc_tile", [AttrSizedOperandSegments]> { + let summary = "Allocates a tile buffer (logical buffer)."; + + let arguments = (ins + Optional:$addr, + Optional:$valid_row, + Optional:$valid_col + ); + + let results = (outs TileBufType:$result); + + let assemblyFormat = [{ + (`addr` `=` $addr^)? + (`valid_row` `=` $valid_row^)? + (`valid_col` `=` $valid_col^)? + attr-dict `:` qualified(type($result)) + }]; + + let extraClassDeclaration = [{ + ::mlir::LogicalResult verify(); + }]; +} + + +// ============================================================================ +// BindTileOp: binds Config and Valid Dims onto a MemRef +// ============================================================================ +def BindTileOp : PTO_Op<"bind_tile", [ + Pure, + AttrSizedOperandSegments + // Allows input offset:0 -> output offset:? + ]> { + let summary = "Binds metadata and implicitly casts layout"; + let description = [{ + Wraps a memref with PTO metadata (valid dimensions and config). + }]; + + // [Changed] Previously Variadic:$valid_dims + // Now explicit Optional Row/Col, consistent with PointerCastOp + let arguments = (ins + AnyMemRef:$source, + Optional:$valid_row, + Optional:$valid_col, + TileBufConfigAttr:$config + ); + + let results = (outs AnyMemRef:$result); + + // [Changed] assemblyFormat: valid_dims removed, replaced with ($row, $col)? + let assemblyFormat = [{ + $source (`,` $valid_row^ `,` $valid_col)?
attr-dict `:` qualified(type($source)) `->` qualified(type($result)) + }]; +} + +def SubsetOp : PTO_Op<"subset", [ + Pure, + ViewLikeOpInterface, + DeclareOpInterfaceMethods // Enable C++ inference + ]> { + + let summary = "Create a strided view (subset) from a parent tile."; + let description = [{ + Creates a view into the source tile. + - Result Shape: Defined by static `sizes`. + - Result Strides: Inherited from `source`. + - Result Offset: Represented as multi-dimensional symbols (s0, s1...) in the layout map. + }]; + + let arguments = (ins + TileBufType:$source, + Variadic:$offsets, // Runtime dynamic offsets [i, j] + I64ArrayAttr:$sizes // Static shape [32, 32] + ); + + let results = (outs TileBufType:$result); + let hasVerifier = 1; + + // Syntax example: %sub = pto.subset %src[%i, %j] sizes [32, 32] : !type + // Note: there is no -> qualified(type($result)) + let assemblyFormat = [{ + $source `[` $offsets `]` `sizes` $sizes attr-dict `:` qualified(type($source)) + }]; + + // [Added] Explicitly implement the methods ViewLikeOpInterface is missing + let extraClassDeclaration = [{ + // The interface requires getViewSource; forward to the auto-generated getSource + ::mlir::Value getViewSource() { return getSource(); } + + // ViewLikeOpInterface may also need getOffsets (if Variadic does not match automatically) + // But usually Variadic:$offsets generates getOffsets(), so this should be fine. + // If a getOffsets error shows up later, it can also be added here. + }]; +} + +// ============================================================================ +// SSA TileBuf Config Ops (aliasing views) +// ============================================================================ + +def BitcastOp : PTO_Op<"bitcast", [ + Pure, + ViewLikeOpInterface + ]> { + let summary = "SSA dtype reinterpretation of a tile buffer view (aliases src storage)"; + let description = [{ + Returns a tile buffer view with a different element type (dtype) while + reusing the same underlying storage as the source. + + This op is a metadata/config rewrite only (no data movement).
+ }]; + + let arguments = (ins + TileBufType:$src + ); + + let results = (outs + TileBufType:$result + ); + + let hasVerifier = 1; + + let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))"; + + let extraClassDeclaration = [{ + ::mlir::Value getViewSource() { return getSrc(); } + }]; +} + +//===----------------------------------------------------------------------===// +// DMA Ops +//===----------------------------------------------------------------------===// +// ------------------------- +// DPS versions in tile world +// ------------------------- +def TLoadOp : PTO_TOp<"tload", [ + PTO_DpsInitOpInterface, + AttrSizedOperandSegments, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "PTO data load operation (Dimension Collapse: PartitionView -> TileBuf)."; + let description = [{ + Loads data from a high-dimensional logical partition view into a 2D physical tile buffer. + Constraint: The product of partition view sizes must match the product of tile buffer valid dims. + DPS form: + pto.tload ins(%partition_view) outs(%tile_buf) + }]; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst, + OptionalAttr:$pad_mode, + Optional:$pad_value, + Optional:$left_padding_num, + Optional:$right_padding_num, + DefaultValuedOptionalAttr:$init_out_buffer, + Optional:$init_condition + ); + + let results = (outs + Optional:$result + ); + + let hasVerifier = 1; + + let builders = [ + // 1. Basic: (src, dst) + OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst), [{ + build($_builder, $_state, res, src, dst, + /*pad_mode=*/nullptr, /*pad_value=*/nullptr, + /*left=*/nullptr, /*right=*/nullptr, + /*init_out=*/nullptr, /*init_cond=*/nullptr); + }]>, + + // 2. With left_padding_num + OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, + "Value":$left_padding_num), [{ + build($_builder, $_state, res, src, dst, + nullptr, nullptr, left_padding_num, nullptr, nullptr, nullptr); + }]>, + + // 3. 
With pad_mode, pad_value + OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, + "pto::PadModeAttr":$pad_mode, "Value":$pad_value), [{ + build($_builder, $_state, res, src, dst, + pad_mode, pad_value, nullptr, nullptr, nullptr, nullptr); + }]>, + + // 4. ... + left + OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, + "pto::PadModeAttr":$pad_mode, "Value":$pad_value, + "Value":$left_padding_num), [{ + build($_builder, $_state, res, src, dst, + pad_mode, pad_value, left_padding_num, nullptr, nullptr, nullptr); + }]>, + + // 5. ... + left + right + OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, + "pto::PadModeAttr":$pad_mode, "Value":$pad_value, + "Value":$left_padding_num, "Value":$right_padding_num), [{ + build($_builder, $_state, res, src, dst, + pad_mode, pad_value, left_padding_num, right_padding_num, nullptr, nullptr); + }]>, + + // 6. ... + left + right + bool + OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, + "pto::PadModeAttr":$pad_mode, "Value":$pad_value, + "Value":$left_padding_num, "Value":$right_padding_num, + "bool":$init_out_buffer), [{ + build($_builder, $_state, res, src, dst, + pad_mode, pad_value, left_padding_num, right_padding_num, + init_out_buffer, nullptr); + }]> + ]; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + (`pad_mode` `=` $pad_mode^)? + (`pad_value` `=` $pad_value^ `:` type($pad_value))? + (`left_padding_num` `=` $left_padding_num^ `:` type($left_padding_num))? + (`init_out_buffer` `=` $init_out_buffer^ )? + (`right_padding_num` `=` $right_padding_num^ `:` type($right_padding_num))? + (`init_condition` `=` $init_condition^ `:` type($init_condition))? + (`->` qualified(type($result))^)? 
+ }]; + + let extraClassDeclaration = [{ + static StringRef getOpName() { return "pto_load"; } + ShapedType getSrcOperandType() { + return cast(getSrc().getType()); + } + ShapedType getDstOperandType() { + return cast(getDst().getType()); + } + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_MTE2; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TStoreOp: PTO_TOp<"tstore", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "PTO data store operation (TileBuf -> PartitionView)."; + let description = [{ + Stores data from a 2D physical tile buffer back to a high-dimensional logical partition view. + + DPS form: + pto.tstore ins(%tile_buf) outs(%partition_view) + }]; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs + Optional:$result + ); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + (`->` qualified(type($result))^)? + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { + // pto-isa lowers TSTORE differently depending on the source tile domain: + // - ACC (L0C) -> GM uses the FIX pipe (copy_matrix_cc_to_gm). + // - VEC/UB and MAT -> GM use the MTE3 pipe. + // + // Sync insertion must use the correct pipe, otherwise the generated + // set_flag/wait_flag pairs won't order the actual hardware instructions. 
+ auto isAcc = [](Type ty) -> bool { + if (auto tb = ::mlir::dyn_cast<::mlir::pto::TileBufType>(ty)) { + if (auto as = ::mlir::dyn_cast_or_null<::mlir::pto::AddressSpaceAttr>( + tb.getMemorySpace())) + return as.getAddressSpace() == ::mlir::pto::AddressSpace::ACC; + return false; + } + if (auto mr = ::mlir::dyn_cast<::mlir::MemRefType>(ty)) { + if (auto ms = mr.getMemorySpace()) { + if (auto as = ::mlir::dyn_cast<::mlir::pto::AddressSpaceAttr>(ms)) + return as.getAddressSpace() == ::mlir::pto::AddressSpace::ACC; + } + return false; + } + return false; + }; + if (isAcc(getSrc().getType())) + return ::mlir::pto::PIPE::PIPE_FIX; + return ::mlir::pto::PIPE::PIPE_MTE3; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TTransOp : PTO_TOp<"ttrans", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "PTO matrix trans operation (destination-style, DPS)."; + let description = [{ + DPS form: + tile buf: pto.ttrans ins(%src %tmp) outs(%dst) + }]; + let arguments = (ins + PTODpsType:$src, + PTODpsType:$tmp, + PTODpsType:$dst + ); + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src `,` $tmp `:` qualified(type($src)) `,` qualified(type($tmp)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_V; // Vector Pipe + } + ::mlir::MutableOperandRange getDpsInitsMutable() { + return getDstMutable(); + } + }]; +} +//===----------------------------------------------------------------------===// +// TMATMUL_BIAS / TMATMUL_MX family +// - DPS: AnyMemRef +// - TOp: PTODpsType +// - All in ins/outs form +//===----------------------------------------------------------------------===// + +// ------------------------- +// DPS versions (AnyMemRef) +// ------------------------- + +// ------------------------- +// Tile-world TOp versions 
(PTODpsType) +// ------------------------- + +def TMatmulBiasOp : PTO_TOp<"tmatmul.bias", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Matmul with bias add (tile world, ins/outs)."; + + let arguments = (ins + PTODpsType:$a, + PTODpsType:$b, + PTODpsType:$bias, + PTODpsType:$dst + ); + + let results = (outs Optional:$result); + let hasVerifier = 1; + + //let builders = [ + // OpBuilder<(ins "TypeRange":$resultTypes, "Value":$a, "Value":$b, "Value":$bias, "Value":$dst), [{ + // build($_builder, $_state, resultTypes, a, b, bias, dst, ValueRange{}); + // }]> + //]; + + let assemblyFormat = [{ + `ins` `(` $a `,` $b `,` $bias `:` qualified(type($a)) `,` qualified(type($b)) `,` qualified(type($bias)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + ( `->` qualified(type($result))^ )? + }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "TMATMUL_BIAS"; } + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_M; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +// pto.tmatmul.mx ins(%a,%a_scale,%b,%b_scale) outs(%dst) -> tensor? +def TMatmulMxOp : PTO_TOp<"tmatmul.mx", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods + ]> { + let summary = "Matmul mx (tile world, ins/outs)."; + + let arguments = (ins + PTODpsType:$a, + PTODpsType:$a_scale, + PTODpsType:$b, + PTODpsType:$b_scale, + PTODpsType:$dst); + + let results = (outs Optional:$result); + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $a `,` $a_scale `,` $b `,` $b_scale + `:` type($a) `,` type($a_scale) `,` type($b) `,` type($b_scale) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + (`->` qualified(type($result))^)? 
+ }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "TMATMUL_MX"; } + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_M; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +// pto.tmatmul.mx.acc ins(%c_in,%a,%a_scale,%b,%b_scale) outs(%dst) -> tensor? +def TMatmulMxAccOp : PTO_TOp<"tmatmul.mx.acc", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Matmul mx accumulate (tile world, ins/outs)."; + + let arguments = (ins + PTODpsType:$c_in, + PTODpsType:$a, + PTODpsType:$a_scale, + PTODpsType:$b, + PTODpsType:$b_scale, + PTODpsType:$dst); + + let results = (outs Optional:$result); + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $c_in `,` $a `,` $a_scale `,` $b `,` $b_scale + `:` type($c_in) `,` type($a) `,` type($a_scale) `,` type($b) `,` type($b_scale) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + (`->` qualified(type($result))^)? + }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "TMATMUL_MX_ACC"; } + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_M; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +// pto.tmatmul.mx.bias ins(%a,%a_scale,%b,%b_scale,%bias) outs(%dst) -> tensor? 
+def TMatmulMxBiasOp : PTO_TOp<"tmatmul.mx.bias",[ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Matmul mx with bias (tile world, ins/outs)."; + + let arguments = (ins + PTODpsType:$a, + PTODpsType:$a_scale, + PTODpsType:$b, + PTODpsType:$b_scale, + PTODpsType:$bias, + PTODpsType:$dst); + + let results = (outs Optional:$result); + let hasVerifier = 1; + + + let assemblyFormat = [{ + `ins` `(` $a `,` $a_scale `,` $b `,` $b_scale `,` $bias + `:` type($a) `,` type($a_scale) `,` type($b) `,` type($b_scale) `,` qualified(type($bias)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + (`->` qualified(type($result))^)? + }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "TMATMUL_MX_BIAS"; } + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_M; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMatmulOp : PTO_TOp<"tmatmul", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "PTO matrix multiplication operation (optional bias), destination-style (DPS) in tile world."; + + let arguments = (ins + PTODpsType:$lhs, + PTODpsType:$rhs, + PTODpsType:$dst + ); + + let results = (outs + Optional:$result + ); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $lhs `,` $rhs `:` qualified(type($lhs)) `,` qualified(type($rhs)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + ( `->` qualified(type($result))^ )? 
+ }]; + + let extraClassDeclaration = [{ + static StringRef getOpName() { return "matmul_dps"; } + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_M; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMatmulAccOp : PTO_TOp<"tmatmul.acc", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "PTO matrix multiplication accumulate operation, destination-style (DPS, no bias)."; + + let arguments = (ins + PTODpsType:$acc_in, + PTODpsType:$lhs, + PTODpsType:$rhs, + PTODpsType:$dst + ); + + let results = (outs + Optional:$result + ); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $acc_in `,` $lhs `,` $rhs `:` qualified(type($acc_in) ) `,` qualified(type($lhs)) `,` qualified(type($rhs)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + ( `->` qualified(type($result))^ )? + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_M; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TGemvOp : PTO_TOp<"tgemv", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "PTO matrix-vector multiplication operation (optional bias), destination-style (DPS) in tile world."; + + let arguments = (ins + PTODpsType:$lhs, + PTODpsType:$rhs, + PTODpsType:$dst + ); + + let results = (outs + Optional:$result + ); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $lhs `,` $rhs `:` qualified(type($lhs)) `,` qualified(type($rhs)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + ( `->` qualified(type($result))^ )? 
+ }]; + + let extraClassDeclaration = [{ + static StringRef getOpName() { return "gemv"; } + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_M; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TGemvAccOp : PTO_TOp<"tgemv.acc", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "PTO matrix-vector multiplication accumulate operation, destination-style (DPS, no bias)."; + + let arguments = (ins + PTODpsType:$acc_in, + PTODpsType:$lhs, + PTODpsType:$rhs, + PTODpsType:$dst + ); + + let results = (outs + Optional:$result + ); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $acc_in `,` $lhs `,` $rhs `:` qualified(type($acc_in) ) `,` qualified(type($lhs)) `,` qualified(type($rhs)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + ( `->` qualified(type($result))^ )? + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_M; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TGemvBiasOp : PTO_TOp<"tgemv.bias", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "GEMV with bias add (tile world, ins/outs)."; + + let arguments = (ins + PTODpsType :$a, + PTODpsType :$b, + PTODpsType :$bias, + PTODpsType :$dst + ); + + let results = (outs Optional:$result); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $a `,` $b `,` $bias `:` qualified(type($a)) `,` qualified(type($b)) `,` qualified(type($bias)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + ( `->` qualified(type($result))^ )? 
+ }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "TGEMV_BIAS"; } + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_M; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMovOp : PTO_TOp<"tmov", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Move data between domains (DPS version)."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs + Optional:$result + ); + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + ( `->` qualified(type($result))^ )? + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { + // TMOV spans multiple hardware pipelines depending on the source/dest + // domains. Most tile-domain moves are executed by MTE1 (e.g. MAT->L0), + // while UB->UB copies are vector-pipe operations. + auto getASFromType = [](Type ty) + -> std::optional<::mlir::pto::AddressSpace> { + // Pre-lowering: tile_buf carries the address space in its memorySpace. + if (auto tb = llvm::dyn_cast<::mlir::pto::TileBufType>(ty)) { + if (auto as = llvm::dyn_cast_or_null<::mlir::pto::AddressSpaceAttr>( + tb.getMemorySpace())) + return as.getAddressSpace(); + return std::nullopt; + } + // Post PTOViewToMemref: tile_buf is erased to memref but memorySpace is + // preserved in memref's memorySpace attribute. 
+ if (auto mr = llvm::dyn_cast<::mlir::MemRefType>(ty)) { + if (auto ms = mr.getMemorySpace()) { + if (auto as = + llvm::dyn_cast<::mlir::pto::AddressSpaceAttr>(ms)) + return as.getAddressSpace(); + } + return std::nullopt; + } + return std::nullopt; + }; + + auto sOpt = getASFromType(getSrc().getType()); + auto dOpt = getASFromType(getDst().getType()); + if (!sOpt.has_value() || !dOpt.has_value()) + return ::mlir::pto::PIPE::PIPE_V; + + const auto s = sOpt.value(); + const auto d = dOpt.value(); + + // UB -> UB copy is vector pipe. + if (s == ::mlir::pto::AddressSpace::VEC && d == ::mlir::pto::AddressSpace::VEC) { + return ::mlir::pto::PIPE::PIPE_V; + } + + // MAT -> L0 (Left/Right/Bias/Scaling) and ACC -> MAT are MTE1 moves. + if ((s == ::mlir::pto::AddressSpace::MAT && + (d == ::mlir::pto::AddressSpace::LEFT || d == ::mlir::pto::AddressSpace::RIGHT || + d == ::mlir::pto::AddressSpace::BIAS || d == ::mlir::pto::AddressSpace::SCALING)) || + (s == ::mlir::pto::AddressSpace::ACC && d == ::mlir::pto::AddressSpace::MAT)) { + return ::mlir::pto::PIPE::PIPE_MTE1; + } + + // Fallback: treat as vector pipe (safe default for most intra-domain moves). + return ::mlir::pto::PIPE::PIPE_V; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + + +//===----------------------------------------------------------------------===// +// Pointer Cast Op (existing) +//===----------------------------------------------------------------------===// + +def PointerCastOp : PTO_Op<"pointer_cast", [AttrSizedOperandSegments, Pure]> { + let summary = "Casts an integer address to a MemRef with optional valid dims"; + + // Argument definitions (kept Optional) + let arguments = (ins + Variadic:$addrs, + Optional:$valid_row, + Optional:$valid_col, + OptionalAttr:$config + ); + + let results = (outs Res:$result); + + // Assembly format (square brackets removed) + let assemblyFormat = [{ + `(` $addrs `)` ($valid_row^ `,` $valid_col)? 
attr-dict `:` qualified(type($result)) + }]; + + // Builder (fix: addOperand -> addOperands) + let builders = [ + OpBuilder<(ins "Type":$result, "ValueRange":$addrs, "Value":$vRow, "Value":$vCol, "Attribute":$config), [{ + $_state.addTypes(result); + $_state.addOperands(addrs); + // [Key fix] addOperand -> addOperands + if (vRow) $_state.addOperands(vRow); + if (vCol) $_state.addOperands(vCol); + if (config) $_state.addAttribute("config", config); + + int32_t addrsSize = addrs.size(); + int32_t vRowSize = vRow ? 1 : 0; + int32_t vColSize = vCol ? 1 : 0; + $_state.addAttribute("operandSegmentSizes", + $_builder.getDenseI32ArrayAttr({addrsSize, vRowSize, vColSize})); + }]> + ]; +} + +// ============================================================================= +// System/Runtime Query Ops +// ============================================================================= + +def GetBlockIdxOp : PTO_Op<"get_block_idx", [Pure]> { + let summary = "Get the current block index (core ID)."; + let description = [{ + Returns the linear index of the current compute unit (CUBE Core) within the task. + The return value is in the range [0, BlockNum - 1]. + }]; + + let arguments = (ins); + let results = (outs I64:$result); + + let assemblyFormat = "attr-dict"; +} + +def GetSubBlockIdxOp : PTO_Op<"get_subblock_idx", [Pure]> { + let summary = "Get the current vector core ID."; + let description = [{ + Returns the ID of the current compute unit (Vector Core). + The return value is in the range [0, 1]. + }]; + + let arguments = (ins); + let results = (outs I64:$result); + + let assemblyFormat = "attr-dict"; +} + +def GetBlockNumOp : PTO_Op<"get_block_num", [Pure]> { + let summary = "Get the total number of blocks (cores)."; + let description = [{ + Returns the total number of compute units (Blocks) configured for the current task. 
+ }]; + + let arguments = (ins); + let results = (outs I64:$result); + + let assemblyFormat = "attr-dict"; +} + +def GetSubBlockNumOp : PTO_Op<"get_subblock_num", [Pure]> { + let summary = "Get the number of vector cores."; + let description = [{ + Returns the total number of vector compute units. + }]; + + let arguments = (ins); + let results = (outs I64:$result); + + let assemblyFormat = "attr-dict"; +} + +//===----------------------------------------------------------------------===// +// High-Level Synchronization Ops +//===----------------------------------------------------------------------===// + +def PTO_PipeEventTypeLikeAttr : AnyAttrOf<[PTO_PipeEventTypeAttr, PTO_SyncOpTypeAttr]>; + +def RecordEventOp : PTO_Op<"record_event"> { + let summary = "Record an event for synchronization (High Level)"; + let description = [{ + Records an event from a source operation type to a destination operation type. + Will be lowered to `pto.set_flag` based on op-to-pipe mapping. + }]; + let arguments = (ins + PTO_PipeEventTypeLikeAttr:$src_op, + PTO_PipeEventTypeLikeAttr:$dst_op, + PTO_EventAttr:$event_id + ); + let results = (outs); + let assemblyFormat = [{ + `[` $src_op `,` $dst_op `,` $event_id `]` attr-dict + }]; +} + +def WaitEventOp : PTO_Op<"wait_event"> { + let summary = "Wait for an event (High Level)"; + let description = [{ + Waits for an event from a source operation type to a destination operation type. + Will be lowered to `pto.wait_flag` based on op-to-pipe mapping. 
+ }]; + let arguments = (ins + PTO_PipeEventTypeLikeAttr:$src_op, + PTO_PipeEventTypeLikeAttr:$dst_op, + PTO_EventAttr:$event_id + ); + let results = (outs); + let assemblyFormat = [{ + `[` $src_op `,` $dst_op `,` $event_id `]` attr-dict + }]; +} + +// High-Level Barrier (single pipe) with op type mapping +def BarrierSyncOp : PTO_Op<"barrier_sync"> { + let summary = "High-level barrier mapped from SyncOpType to PIPE"; + let description = [{ + A convenience barrier that specifies a SyncOpType instead of PIPE. The lowering + pass maps the op type to the corresponding hardware pipe and emits `pto.barrier`. + }]; + let arguments = (ins + PTO_SyncOpTypeAttr:$op_type + ); + let results = (outs); + let assemblyFormat = [{ + `[` $op_type `]` attr-dict + }]; +} + +//===----------------------------------------------------------------------===// +// Section Ops (Macros Containers) +//===----------------------------------------------------------------------===// + +class PTO_SectionOp + : PTO_Op { + let summary = "Container for core-specific code guarded by macros"; + let description = [{ + During conversion to EmitC, this op is lowered to: + emitc.verbatim("#if defined(MACRO)") + ... inlined body ... 
+ emitc.verbatim("#endif") + }]; + + let regions = (region SizedRegion<1>:$body); + let assemblyFormat = "$body attr-dict"; +} + +def SectionCubeOp : PTO_SectionOp<"section.cube">; +def SectionVectorOp : PTO_SectionOp<"section.vector">; + +//===----------------------------------------------------------------------===// +// Synchronization Ops +//===----------------------------------------------------------------------===// + +def SetFlagOp : PTO_Op<"set_flag"> { + let summary = "Set synchronization flag between pipes"; + let arguments = (ins + PTO_PipeAttr:$src_pipe, + PTO_PipeAttr:$dst_pipe, + PTO_EventAttr:$event_id + ); + let results = (outs); + let assemblyFormat = [{ + `[` $src_pipe `,` $dst_pipe `,` $event_id `]` attr-dict + }]; +} + +def WaitFlagOp : PTO_Op<"wait_flag"> { + let summary = "Wait for synchronization flag"; + let arguments = (ins + PTO_PipeAttr:$src_pipe, + PTO_PipeAttr:$dst_pipe, + PTO_EventAttr:$event_id + ); + let results = (outs); + let assemblyFormat = [{ + `[` $src_pipe `,` $dst_pipe `,` $event_id `]` attr-dict + }]; +} + +//===----------------------------------------------------------------------===// +// Buffer-ID Synchronization (A5) +//===----------------------------------------------------------------------===// + +def GetBufOp : PTO_Op<"get_buf"> { + let summary = "Acquire a buffer-id token on a given pipe (A5)"; + let description = [{ + `pto.get_buf` participates in a buffer-id based ordering model. Operations + in the same pipe that are guarded by the same buffer-id are enforced to + execute in program order relative to other pipes using the same buffer-id. + + This op is intended to be lowered to the CCEC builtin intrinsic `get_buf`. 
+ }]; + + let arguments = (ins + PTO_PipeAttr:$pipe, + I32Attr:$buf_id, + DefaultValuedAttr:$mode + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `[` $pipe `,` $buf_id `]` attr-dict + }]; +} + +def RlsBufOp : PTO_Op<"rls_buf"> { + let summary = "Release a buffer-id token on a given pipe (A5)"; + let description = [{ + Releases the previously acquired buffer-id token for the given pipe. + + This op is intended to be lowered to the CCEC builtin intrinsic `rls_buf`. + }]; + + let arguments = (ins + PTO_PipeAttr:$pipe, + I32Attr:$buf_id, + DefaultValuedAttr:$mode + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `[` $pipe `,` $buf_id `]` attr-dict + }]; +} + +def SyncSetOp : PTO_Op<"sync.set"> { + let summary = "Set a synchronization signal (trigger) between cube and vector."; + let description = [{ + Sets a synchronization signal on the specified pipeline stage. + Corresponds to `ffts_cross_core_sync` (A3) or `set_intra_block` (A5). + }]; + + let arguments = (ins + PTO_PipeAttr:$pipe, + I32Attr:$event_id + ); + + let assemblyFormat = "$pipe `,` $event_id attr-dict"; +} + +def SyncWaitOp : PTO_Op<"sync.wait"> { + let summary = "Wait for a synchronization signal (barrier) between cube and vector."; + let description = [{ + Waits for a synchronization signal on the specified pipeline stage. + Corresponds to `wait_flag_dev` (A3) or `wait_intra_block` (A5). 
+ }]; + + let arguments = (ins + PTO_PipeAttr:$pipe, + I32Attr:$event_id + ); + + let assemblyFormat = "$pipe `,` $event_id attr-dict"; +} + +def BarrierOp : PTO_Op<"barrier"> { + let summary = "Intra-pipeline memory barrier"; + let arguments = (ins PTO_PipeAttr:$pipe); + let assemblyFormat = "$pipe attr-dict"; +} + + +//===----------------------------------------------------------------------===// +// FFTS Configuration Operation +//===----------------------------------------------------------------------===// + +def SetFFTsOp : PTO_Op<"set_ffts", [MemoryEffects<[MemRead, MemWrite]>]> { + let summary = "Set FFTS/flags pointer for runtime (side-effecting)."; + let arguments = (ins AnyMemRef:$ffts); + let results = (outs); + + let assemblyFormat = "$ffts attr-dict `:` type($ffts)"; + + let hasVerifier = 1; +} + + +def PrintOp : PTO_Op<"print", [MemoryEffects<[MemRead, MemWrite]>]> { + let summary = "Print debug: format string (attribute) and scalar value."; + let description = [{ + Debug print op. First argument is the format string (string attribute), second is a scalar value. + Format in IR: `pto.print ins("format", %scalar : type(%scalar))` + }]; + let arguments = (ins StrAttr:$format, PrintScalarType:$scalar); + let results = (outs); + let assemblyFormat = [{ + `ins` `(` $format `,` $scalar `:` type($scalar) `)` + attr-dict + }]; +} + +def TrapOp : PTO_Op<"trap"> { + let summary = "Trap: abort execution (no operands)."; + let description = [{ + Inserts a trap to stop execution. No arguments, no results. + Format in IR: pto.trap + }]; + let arguments = (ins); + let results = (outs); + let assemblyFormat = "attr-dict"; +} + +// ---- tile-world TOp version ---- +// pto.mgather ins(%mem, %idx) outs(%dst) [ ...] 
+def MGatherOp : PTO_TOp<"mgather", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Gather-load elements from memory into a tile using per-element indices (tile world, ins/outs)."; + + let arguments = (ins + PTODpsType:$mem, + PTODpsType:$idx, + PTODpsType:$dst); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $mem `,` $idx `:` type($mem) `,` type($idx) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "MGATHER"; } + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE2; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + + + +// ---- tile-world TOp version ---- +// pto.tsetval ins(%offset, %val) outs(%dst) : PTODpsType, index, T +def TSetValOp : PTO_TOp<"tsetval", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Write a scalar value into a single element of dst at offset (tile world, ins/outs)."; + + let arguments = (ins + PTODpsType:$dst, + Index:$offset, + AnyType:$val + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $offset `,` $val `:` type($offset) `,` type($val) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "SETVAL"; } + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +// ---- tile-world TOp version ---- +// pto.tgetval ins(%src, %offset) outs(%dst) : PTODpsType, index -> T +def TGetValOp : PTO_TOp<"tgetval", [ + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Read a single element from tile-like src at offset into a scalar (tile world, ins/outs)."; + + let arguments = (ins + 
PTODpsType:$src, + Index:$offset + ); + + let results = (outs AnyType:$dst); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $offset `:` qualified(type($src)) `,` type($offset) `)` + `outs` `:` qualified(type($dst) ) + attr-dict + }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "GETVAL"; } + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + }]; +} + + +// ---- tile-world TOp version (with 't') ---- +// pto.mscatter ins(%src, %idx) outs(%mem) [ ...] +def MScatterOp : PTO_TOp<"mscatter", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Scatter-store elements from a tile into memory using per-element indices (tile world, ins/outs)."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$idx, + PTODpsType:$mem // outs target + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $idx `:` qualified(type($src)) `,` type($idx) `)` + `outs` `(` $mem `:` type($mem) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + static StringRef getIntrinsicName() { return "MSCATTER"; } + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE3; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getMemMutable(); } + }]; +} + +//===----------------------------------------------------------------------===// +// PTO_TOPs(Tilebuffer, DPS) +//===----------------------------------------------------------------------===// + +def TAbsOp : PTO_TOp<"tabs", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise absolute value of a tile "; + let description = [{ + For each element (i, j): dst[i,j] = |src[i,j]|. 
+ }]; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_V; // Vector pipe for elementwise ops + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TAddOp : PTO_TOp<"tadd", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise add of two tiles "; + let description = [{ + For each element (i, j): dst[i,j] = src0[i,j] + src1[i,j]. + }]; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { + return ::mlir::pto::PIPE::PIPE_V; + } + + ::mlir::MutableOperandRange getDpsInitsMutable() { + return getDstMutable(); + } + }]; +} + +def TAddCOp : PTO_TOp<"taddc", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise ternary add of tiles "; + let description = [{ + For each element (i, j): dst[i,j] = src0[i,j] + src1[i,j] + src2[i,j]. 
+ }]; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$src2, + PTODpsType:$dst + ); + + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `,` $src2 `:` + qualified(type($src0)) `,` qualified(type($src1)) `,` qualified(type($src2)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TAddSOp : PTO_TOp<"tadds", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise add a scalar to a tile "; + let description = [{ + For each element (i, j): dst[i,j] = src[i,j] + scalar. + Tile operands are tilebuffers; scalar is a builtin scalar type (e.g. f32). + }]; + + let arguments = (ins + PTODpsType:$src, + AnyType:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TAddSCOp : PTO_TOp<"taddsc", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "dst = src0 + scalar + src1 "; + let arguments = (ins + PTODpsType:$src0, + AnyType:$scalar, + PTODpsType:$src1, + PTODpsType:$dst + ); + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $scalar `,` $src1 `:` qualified(type($src0)) `,` type($scalar) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE 
getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + + +def TAndOp : PTO_TOp<"tand", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TAND(dst, src0, src1) bitwise-and on tiles"; + let arguments = (ins PTODpsType:$src0, PTODpsType:$src1, PTODpsType:$dst); + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TAndSOp : PTO_TOp<"tands", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TANDS(dst, src, scalar) bitwise-and tile with scalar"; + let arguments = (ins PTODpsType:$src, AnyType:$scalar, PTODpsType:$dst); + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + + +def TCIOp : PTO_TOp<"tci", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Generate contiguous integer sequence into dst tile."; + + let arguments = (ins + AnyInteger:$S, + PTODpsType:$dst, + DefaultValuedAttr:$descending + ); + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $S + attr-dict + `:` type($S) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE 
getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TCmpOp : PTO_TOp<"tcmp", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Compare two tiles and write a packed predicate mask"; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst, + OptionalAttr:$cmpMode + ); + let results = (outs); + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 + attr-dict + `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + }]; + + let builders = [ + OpBuilder<(ins "TypeRange":$res, "Value":$src0, "Value":$src1, "Value":$dst), [{ + build($_builder, $_state, res, src0, src1, dst, /*cmpMode=*/nullptr); + }]>]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TCmpSOp : PTO_TOp<"tcmps", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Compare scalar value against a tile and write a packed predicate mask"; + + let arguments = (ins + PTODpsType:$src, + AnyTypeOf<[AnyFloat, AnySignlessInteger, Index]>:$scalar, + DefaultValuedAttr:$cmpMode, + PTODpsType:$dst + ); + + let results = (outs); + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar + attr-dict + `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + + }]; +} + +def TColExpandOp : PTO_TOp<"tcolexpand", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Broadcast src(0, j) to all rows in 
column j "; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; +} + +def TColMaxOp : PTO_TOp<"tcolmax", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Reduce each column by taking the maximum across rows "; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; +} + +def TColMinOp : PTO_TOp<"tcolmin", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Reduce each column by taking the minimum across rows "; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; +} + +def TColSumOp : PTO_TOp<"tcolsum", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Reduce each column by summing across rows (tilebuf-based, explicit tmp)"; + + let arguments = (ins + 
PTODpsType:$src, + Optional:$tmp, + PTODpsType:$dst, + DefaultValuedOptionalAttr:$isBinary + ); + + let results = (outs); + + let hasVerifier = 1; + + let hasCustomAssemblyFormat = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { + return ::mlir::MutableOperandRange(getOperation(), 1, getOperation()->getNumOperands()); + } + }]; +} + +def TCvtOp : PTO_TOp<"tcvt", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise type conversion with rounding mode (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst, + DefaultValuedAttr:$rmode + ); + + let results = (outs); + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + + let assemblyFormat = [{ + `ins` `(` $src + attr-dict + `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + }]; +} + +def TDivOp : PTO_TOp<"tdiv", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise division of two tiles (tilebuf, DPS)"; + let description = [{ + For each element (i, j): dst[i,j] = src0[i,j] / src1[i,j]. + Division-by-zero behavior is target-defined. 
+ }]; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + +} + +def TDivSOp : PTO_TOp<"tdivs", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise division with a scalar (tilebuf, DPS)"; + + let arguments = (ins + AnyType:$src, + AnyType:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let hasCustomAssemblyFormat = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + +} + +def TExpOp : PTO_TOp<"texp", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise exponential (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TExpandsOp : PTO_TOp<"texpands", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Broadcast scalar into dst (tilebuf, DPS)"; + + let arguments = (ins + AnyTypeOf<[F16, F32, I16, I32, I8, UI8, UI16, UI32]>:$scalar, + PTODpsType:$dst + ); + + let results = 
(outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $scalar `:` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TExtractOp : PTO_TOp<"textract", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Extract sub-tile window from src into dst (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + Index:$indexRow, + Index:$indexCol, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $indexRow `,` $indexCol `:` qualified(type($src)) `,` type($indexRow) `,` type($indexCol) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + // TEXTRACT moves data between memory domains (L1/cbuf -> L0A/L0B/L0C), + // which is executed by the MTE1 pipeline. 
+ ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TFillPadOp : PTO_TOp<"tfillpad", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Copy src into dst and fill padded elements using dst PadVal (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TGatherOp : PTO_TOp<"tgather", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Gather/select elements using an index tile or a mask pattern (tilebuf, DPS)"; + + // --- operands (DPS): src0 + optional indices + outs(dst) --- + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst, + Optional:$indices, + OptionalAttr:$maskPattern + ); + + // --- DPS op: no SSA results --- + let results = (outs); + + let hasVerifier = 1; + let hasCustomAssemblyFormat = 1; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TGatherBOp : PTO_TOp<"tgatherb", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Gather elements using byte offsets (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$offsets, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $offsets `:` qualified(type($src)) `,` type($offsets) `)` + `outs` `(` $dst `:` 
qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TLogOp : PTO_TOp<"tlog", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise natural logarithm (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TLReluOp : PTO_TOp<"tlrelu", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Leaky ReLU with a scalar slope (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + F32:$slope, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $slope `:` qualified(type($src)) `,` type($slope) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMaxOp : PTO_TOp<"tmax", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise maximum of two tiles (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` 
qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMaxSOp : PTO_TOp<"tmaxs", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise max of a tile and a scalar (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + F32:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMinOp : PTO_TOp<"tmin", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise minimum of two tiles (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMinSOp : PTO_TOp<"tmins", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Elementwise minimum of a tile and a scalar (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + F32:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` 
qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMovFPOp : PTO_TOp<"tmov.fp", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TMOV_FP: move/convert using fp (scaling) tile (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$fp, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $fp `:` qualified(type($src)) `,` qualified(type($fp)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + // TMOV_FP is an ACC->MAT move (Cc->Cb) with vector quant parameters in + // SCALING (fbuf). Treat it as a data-movement op for sync insertion. + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +def TMrgSortOp: PTO_TOp<"tmrgsort", [ + AttrSizedOperandSegments, + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TMRGSORT: Merge sort (format1: ins(src,blockLen) out(dst); format2: ins(src0..src3) outs(dst,tmp,executed))."; + + let arguments = (ins + Variadic:$srcs, + Optional:$blockLen, + Variadic:$dsts, + Optional:$excuted, + DefaultValuedAttr:$exhausted + ); + + let results = (outs); + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool isFormat1() { return getSrcs().size() == 1u && getBlockLen() && getDsts().size() == 1u; } + bool isFormat2() { return getSrcs().size() == 4u && getDsts().size() == 2u && getExcuted(); } + Value getSrc() { return getSrcs().front(); } + Value getDst() { return getDsts().front(); } + Value getTmp() { return getDsts()[1]; } + 
::mlir::MutableOperandRange getDpsInitsMutable() { return getDstsMutable(); } + void print(::mlir::OpAsmPrinter &p); + static ::mlir::ParseResult parse(::mlir::OpAsmParser &parser, ::mlir::OperationState &result); + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + }]; +} + +def TMulOp: PTO_TOp<"tmul", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TMUL: Elementwise multiply of two tiles."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +//===----------------------------------------------------------------------===// +// PTOOps.td (add TMULS DPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TMulSOp: PTO_TOp<"tmuls", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TMULS: Elementwise multiply a tile by a scalar."; + + let arguments = (ins + PTODpsType:$src0, + F32:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $scalar `:` qualified(type($src0)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TNEG TBDPS/tile buffer op) 
+//===----------------------------------------------------------------------===// + +def TNegOp: PTO_TOp<"tneg", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TNEG: Elementwise negation of a tile."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TNOT TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TNotOp: PTO_TOp<"tnot", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TNOT: Elementwise bitwise NOT of a tile."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TOR TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TOrOp: PTO_TOp<"tor", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TOR: Elementwise bitwise OR of two tiles."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + 
); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TORS TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TOrSOp: PTO_TOp<"tors", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TORS: Elementwise bitwise OR of a tile and a scalar."; + + let arguments = (ins + PTODpsType:$src, + AnySignlessInteger:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TPARTADD TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TPartAddOp: PTO_TOp<"tpartadd", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TPARTADD: Partial elementwise add with implementation-defined handling of mismatched valid regions."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` 
qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TPARTMAX TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TPartMaxOp: PTO_TOp<"tpartmax", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Partial elementwise max with implementation-defined handling of mismatched valid regions."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TPARTMIN TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TPartMinOp: PTO_TOp<"tpartmin", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Partial elementwise min with implementation-defined handling of mismatched valid regions."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` 
qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TPRELU TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TPReluOp: PTO_TOp<"tprelu", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TPRELU: Elementwise PReLU (parametric ReLU) with a per-element slope tile."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TRECIP TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRecipOp: PTO_TOp<"trecip", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TRECIP: Elementwise reciprocal of a tile."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } 
+ }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TRELU TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TReluOp: PTO_TOp<"trelu", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TRELU: Elementwise ReLU of a tile."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TREM TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRemOp: PTO_TOp<"trem", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TREM: Elementwise remainder of two tiles."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TREMS TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRemSOp: PTO_TOp<"trems", [ + 
PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TREMS: Elementwise remainder with a scalar"; + + let arguments = (ins + PTODpsType:$src, + F32:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (TRESHAPE: SSA view op; aliases src storage) +//===----------------------------------------------------------------------===// + +def TReshapeOp: PTO_TOp<"treshape", [ + OpPipeInterface, + Pure, + ViewLikeOpInterface +]> { + let summary = "TRESHAPE: Reinterpret a tile buffer view (SSA; aliases src storage)"; + + let arguments = (ins + PTODpsType:$src + ); + + let results = (outs + PTODpsType:$result + ); + + let hasVerifier = 1; + + let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))"; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::Value getViewSource() { return getSrc(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TROWEXPAND TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRowExpandOp: PTO_TOp<"trowexpand", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TROWEXPAND: Broadcast the first element of each source row across the destination row."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let 
assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TROWEXPANDDIV TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRowExpandDivOp: PTO_TOp<"trowexpanddiv", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TROWEXPANDDIV: Row-wise broadcast divide: divide each row of src0 by a per-row scalar vector src1."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TROWEXPANDMUL TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRowExpandMulOp: PTO_TOp<"trowexpandmul", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TROWEXPANDMUL: Row-wise broadcast multiply: multiply each row of src0 by a per-row scalar vector src1."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,`
qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TROWEXPANDSUB TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRowExpandSubOp: PTO_TOp<"trowexpandsub", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TROWEXPANDSUB: Row-wise broadcast subtract: subtract a per-row scalar vector src1 from each row of src0."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TROWMAX TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRowMaxOp: PTO_TOp<"trowmax", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TROWMAX: Reduce each row by taking the maximum across columns."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$tmp, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $tmp `:` qualified(type($src)) `,` qualified(type($tmp)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + 
let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TROWMIN TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRowMinOp: PTO_TOp<"trowmin", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TROWMIN: Reduce each row by taking the minimum across columns."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$tmp, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $tmp `:` qualified(type($src)) `,` qualified(type($tmp)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TROWSUM TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRowSumOp: PTO_TOp<"trowsum", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TROWSUM: Reduce each row by summing across columns."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$tmp, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $tmp `:` qualified(type($src)) `,` qualified(type($tmp)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return 
getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TRSQRT TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TRsqrtOp: PTO_TOp<"trsqrt", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TRSQRT: Elementwise reciprocal square root."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSCATTER TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TScatterOp: PTO_TOp<"tscatter", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSCATTER: Scatter elements of a source tile into a destination tile using per-element indices."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$indexes, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $indexes `:` qualified(type($src)) `,` qualified(type($indexes) ) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { + // NOTE: On dav-c220 (Ascend910 A2/A3), pto-isa implements TSCATTER as a + // scalar loop over UB pointers, which executes on the scalar pipeline + // (PIPE_S). 
Waiting on PIPE_V does not block scalar UB accesses and can + // lead to using uninitialized indices/data (crash / aivec exception). + // + // On A5 instruction set devices, TSCATTER is implemented with vector + // scatter instructions and should be treated as PIPE_V. + auto moduleOp = getOperation()->getParentOfType<::mlir::ModuleOp>(); + if (moduleOp) { + if (auto spec = moduleOp->getAttrOfType<::mlir::StringAttr>("pto.device-spec")) { + auto s = spec.getValue(); + if (s.starts_with("Ascend950") || s.starts_with("Ascend910_95")) { + return ::mlir::pto::PIPE::PIPE_V; + } + } + } + return ::mlir::pto::PIPE::PIPE_S; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSEL TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSelOp: PTO_TOp<"tsel", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSEL: Select between two tiles using a mask tile (per-element selection)."; + + let arguments = (ins + PTODpsType:$mask, + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $mask `,` $src0 `,` $src1 `:` qualified(type($mask)) `,` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSELS TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSelSOp: PTO_TOp<"tsels", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + 
DeclareOpInterfaceMethods +]> { + let summary = "TSELS: Select one of two source tiles using a scalar selectMode (global select)."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + AnyInteger:$selectMode, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `,` $selectMode `:` qualified(type($src0)) `,` qualified(type($src1)) `,` type($selectMode) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSHL TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TShlOp: PTO_TOp<"tshl", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSHL: Elementwise shift-left of two tiles."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSHR TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TShrOp: PTO_TOp<"tshr", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSHR: Elementwise shift-right of two tiles."; + + let 
arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSHLS/TSHRS TBDPS: shift-left/right by scalar) +//===----------------------------------------------------------------------===// + +def TShlSOp : PTO_TOp<"tshls", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSHLS: Elementwise shift-left of a tile by a scalar (shift count)."; + let arguments = (ins + PTODpsType:$src, + AnyTypeOf<[AnyFloat, AnySignlessInteger, Index]>:$scalar, + PTODpsType:$dst + ); + let results = (outs); + let hasVerifier = 1; + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; +} + +def TShrSOp : PTO_TOp<"tshrs", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSHRS: Elementwise shift-right of a tile by a scalar (shift count)."; + let arguments = (ins + PTODpsType:$src, + AnyTypeOf<[AnyFloat, AnySignlessInteger, Index]>:$scalar, + PTODpsType:$dst + ); + let results = (outs); + let hasVerifier = 1; + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return 
getDstMutable(); } + }]; + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSORT32 TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSort32Op: PTO_TOp<"tsort32", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSORT32: Sort a fixed-size 32-element block and produce an index mapping."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst, + PTODpsType:$idx + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `,` $idx `:` qualified(type($dst) ) `,` qualified(type($idx)) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSQRT TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSqrtOp: PTO_TOp<"tsqrt", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSQRT: Elementwise square root."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + 
+//===----------------------------------------------------------------------===// +// PTOOps.td (add TSTORE_FP TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TStoreFPOp: PTO_TOp<"tstore_fp", [ + PTO_DpsInitOpInterface, +]> { + let summary = "TSTORE_FP: Store an accumulator tile into global memory using a scaling (fp) tile for vector quantization parameters."; + + let arguments = (ins + PTODpsType:$src, + PTODpsType:$fp, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $fp `:` qualified(type($src)) `,` qualified(type($fp)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + let extraClassDeclaration = [{ + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSUB TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSubOp: PTO_TOp<"tsub", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSUB: Elementwise subtract of two tiles."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSUBC TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSubCOp: PTO_TOp<"tsubc", [ + 
PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSUBC: Elementwise ternary op: src0 - src1 + src2."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$src2, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `,` $src2 `:` + qualified(type($src0)) `,` qualified(type($src1)) `,` qualified(type($src2)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + + +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSUBS TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSubSOp: PTO_TOp<"tsubs", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSUBS: Elementwise subtract a scalar from a tile."; + + let arguments = (ins + PTODpsType:$src, + F32:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + + +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSUBSC TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSubSCOp: PTO_TOp<"tsubsc", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TSUBSC: Elementwise fused op: src0 - 
scalar + src1."; + + let arguments = (ins + PTODpsType:$src0, + F32:$scalar, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $scalar `,` $src1 `:` + qualified(type($src0)) `,` type($scalar) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + +} + +//===----------------------------------------------------------------------===// +// PTOOps.td (add TXORS TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TXorSOp: PTO_TOp<"txors", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TXORS: Elementwise bitwise XOR of a tile and a scalar."; + + let arguments = (ins + PTODpsType:$src, + AnyInteger:$scalar, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + +} +//===----------------------------------------------------------------------===// +// PTOOps.td (add TSYNC TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TSyncOp: PTO_TOp<"tsync", [ + PTO_DpsInitOpInterface, +]> { + let summary = "TSYNC: Synchronize PTO execution with event tokens or single-op barrier."; + + let arguments = (ins + PTODpsType:$events, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = 
[{ + `ins` `(` $events `:` type($events) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + let extraClassDeclaration = [{ + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + +//===----------------------------------------------------------------------===// +// PTOOps.td (add TXOR TBDPS/tile buffer op) +//===----------------------------------------------------------------------===// + +def TXorOp: PTO_TOp<"txor", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TXOR: Elementwise bitwise XOR of two tiles."; + + let arguments = (ins + PTODpsType:$src0, + PTODpsType:$src1, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; + +} + +def TPrintOp: PTO_TOp<"tprint", [ + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "TPRINT: Print the contents of a Tile or GlobalTensor for debugging purposes directly from device code.."; + + let arguments = (ins + PTODpsType:$src + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `:` qualified(type($src)) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } + }]; + } + +#endif // MLIR_DIALECT_PTO_IR_PTOOPS diff --git a/.agent/skills/translate_cpp2py/references/ptoas_source/PTOToEmitC.cpp b/.agent/skills/translate_cpp2py/references/ptoas_source/PTOToEmitC.cpp new file mode 100644 index 00000000..45b1798d --- /dev/null +++ b/.agent/skills/translate_cpp2py/references/ptoas_source/PTOToEmitC.cpp @@ -0,0 +1,7713 
@@ +//===- PTOToEmitC.cpp - PTO to EmitC conversion pass ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PTO/IR/PTO.h" +#include "PTO/Transforms/Passes.h" + +#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" +#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h" +#include "mlir/Analysis/DataFlowFramework.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/EmitC/IR/EmitC.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" + +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeRange.h" + +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Target/Cpp/CppEmitter.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/Transforms/FuncConversions.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Conversion/SCFToEmitC/SCFToEmitC.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" + +#include +#include +#include +#include +namespace mlir { +#define GEN_PASS_DEF_EMITPTOMANUAL +#include "PTO/Transforms/Passes.h.inc" +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +static const char *addrSpaceQualifier(pto::AddressSpace as) { + switch (as) { + case pto::AddressSpace::Zero: + return "__gm__"; + case 
pto::AddressSpace::VEC: + return "__ubuf__"; + case pto::AddressSpace::GM: + return "__gm__"; + case pto::AddressSpace::MAT: + return "__cbuf__"; + case pto::AddressSpace::LEFT: + return "__ca__"; + case pto::AddressSpace::RIGHT: + return "__cb__"; + case pto::AddressSpace::ACC: + return "__cc__"; + case pto::AddressSpace::BIAS: + // Bias tiles are special in pto-isa; keep a safe fallback qualifier. + return "__gm__"; + case pto::AddressSpace::SCALING: + // pto-isa TileType::Scaling maps to __fbuf__ (see pto/common/memory.hpp). + return "__fbuf__"; + } + return "__gm__"; +} + +static Value peelUnrealized(Value v) { + if (auto castOp = v.getDefiningOp()) + return castOp.getOperand(0); + return v; +} + +static std::optional getLayoutAttrFromOp(Operation *op) { + if (!op) + return std::nullopt; + if (auto attr = op->getAttrOfType("layout")) + return attr.getLayout(); + return std::nullopt; +} + +static std::optional resolveLayoutFromValueChain(Value v) { + v = peelUnrealized(v); + while (Operation *def = v.getDefiningOp()) { + if (auto layout = getLayoutAttrFromOp(def)) + return layout; + if (auto subview = dyn_cast(def)) { + v = peelUnrealized(subview.getSource()); + continue; + } + if (auto reinterpret = dyn_cast(def)) { + v = peelUnrealized(reinterpret.getSource()); + continue; + } + if (auto cast = dyn_cast(def)) { + v = peelUnrealized(cast.getSource()); + continue; + } + if (auto unrealized = dyn_cast(def)) { + if (unrealized->getNumOperands() == 0) + break; + v = peelUnrealized(unrealized.getOperand(0)); + continue; + } + break; + } + return std::nullopt; +} + +static std::optional +resolveLayoutForGlobalTensor(Operation *anchor, Value basePtr) { + if (auto layout = getLayoutAttrFromOp(anchor)) + return layout; + return resolveLayoutFromValueChain(basePtr); +} + +static std::string layoutToEmitCString(mlir::pto::Layout layout) { + switch (layout) { + case mlir::pto::Layout::ND: + return "pto::Layout::ND"; + case mlir::pto::Layout::DN: + return "pto::Layout::DN"; 
+ case mlir::pto::Layout::NZ: + return "pto::Layout::NZ"; + } + return "pto::Layout::ND"; +} + +//===----------------------------------------------------------------------===// +// Type Converter +//===----------------------------------------------------------------------===// + +class PTOToEmitCTypeConverter : public TypeConverter { +public: + PTOToEmitCTypeConverter(MLIRContext *Ctx) { + // --------------------------------------------------------- + // 1. 基本类型 (f32, i32, index) + // --------------------------------------------------------- + addConversion([Ctx](FloatType type) -> Type { + if (type.isF32()) return emitc::OpaqueType::get(Ctx, "float"); + if (type.isF16()) return emitc::OpaqueType::get(Ctx, "half"); + if (type.isBF16()) return emitc::OpaqueType::get(Ctx, "bfloat16_t"); + if (type.isF64()) return emitc::OpaqueType::get(Ctx, "double"); + llvm::errs() << "[Debug] Unsupported FloatType: " << type << "\n"; + return Type{}; + }); + + addConversion([Ctx](IntegerType type) -> Type { + // [关键修改] i1 保持为 i1,不要转为 emitc.opaque<"bool"> + // 这样 emitc.if (接受 i1) 就不会报错。 + // 在打印 C++ 代码时,i1 会自动打印为 bool。 + //if (type.getWidth() == 1) return IntegerType::get(Ctx, 1); + if (type.getWidth() == 1) return type; // <--- 保持 i1 不变 + + // Prefer fixed-width C types. Preserve signedness if the MLIR integer is + // explicitly signed/unsigned; treat signless as signed by default. + const bool isUnsigned = type.isUnsignedInteger(); + switch (type.getWidth()) { + case 8: + return emitc::OpaqueType::get(Ctx, isUnsigned ? "uint8_t" : "int8_t"); + case 16: + return emitc::OpaqueType::get(Ctx, + isUnsigned ? "uint16_t" : "int16_t"); + case 32: + return emitc::OpaqueType::get(Ctx, + isUnsigned ? "uint32_t" : "int32_t"); + case 64: + return emitc::OpaqueType::get(Ctx, + isUnsigned ? 
"uint64_t" : "int64_t"); + default: + llvm::errs() << "[Debug] Unsupported IntegerType width: " + << type.getWidth() << "\n"; + return emitc::OpaqueType::get(Ctx, "int32_t"); // Fallback + } + }); + + addConversion([Ctx](IndexType type) -> Type { + return emitc::OpaqueType::get(Ctx, "int32_t"); + }); + + // vector<4xi16> (e.g. TMRGSORT executedNumList) -> pto::MrgSortExecutedNumList + addConversion([Ctx](VectorType type) -> Type { + if (type.getRank() == 1 && type.getNumElements() == 4 && + type.getElementType().isInteger(16)) + return emitc::OpaqueType::get(Ctx, "pto::MrgSortExecutedNumList"); + return Type{}; + }); + + // --------------------------------------------------------- + // 2. PTO 特殊类型 (透传或转换) + // --------------------------------------------------------- + addConversion([Ctx](emitc::OpaqueType type) { return type; }); + addConversion([Ctx](emitc::PointerType type) { return type; }); + + // --------------------------------------------------------- + // 2.5 PtrType 转换 (指针类型) + // --------------------------------------------------------- + addConversion([this, Ctx](pto::PtrType type) -> std::optional { + Type elemType = type.getElementType(); + Type newElemType = convertType(elemType); + if (!newElemType) + return std::nullopt; + + std::string elemTypeStr; + if (auto opq = dyn_cast(newElemType)) { + elemTypeStr = opq.getValue().str(); + } else { + llvm::errs() << " [Error] PtrType elem type is not OpaqueType: " + << newElemType << "\n"; + return std::nullopt; + } + + std::string qualifier = "__gm__"; + + std::string finalTypeStr = qualifier + " " + elemTypeStr; + return emitc::PointerType::get( + emitc::OpaqueType::get(Ctx, finalTypeStr)); + }); + + // --------------------------------------------------------- + // 3. MemRef 转换 (Debug 重点) + // --------------------------------------------------------- + addConversion([this, Ctx](MemRefType type) -> std::optional { + llvm::errs() << "[Debug] Converting MemRef: " << type << "\n"; + + // A. 
转换元素类型 + Type elemType = type.getElementType(); + Type newElemType = convertType(elemType); + if (!newElemType) { + llvm::errs() << " [Error] Failed to convert element type: " << elemType << "\n"; + return std::nullopt; + } + + // 获取元素类型的字符串 + std::string elemTypeStr; + if (auto opq = dyn_cast(newElemType)) { + elemTypeStr = opq.getValue().str(); + } else { + llvm::errs() << " [Error] Converted element type is not OpaqueType: " << newElemType << "\n"; + return std::nullopt; + } + + // B. 处理 Memory Space + std::string qualifier = ""; + Attribute memorySpace = type.getMemorySpace(); + + if (!memorySpace) { + qualifier = "__gm__"; + } else if (auto ptoAttr = dyn_cast(memorySpace)) { + qualifier = addrSpaceQualifier(ptoAttr.getAddressSpace()); + } else { + llvm::errs() << " [Warning] Unknown MemorySpace Attribute type: " << memorySpace << "\n"; + qualifier = "__gm__"; // Fallback + } + + std::string finalTypeStr = qualifier + " " + elemTypeStr; + llvm::errs() << " [Success] -> " << finalTypeStr << "*\n"; + + return emitc::PointerType::get(emitc::OpaqueType::get(Ctx, finalTypeStr)); + }); + + // --------------------------------------------------------- + // 4. Function & Materialization + // --------------------------------------------------------- + addConversion([this](FunctionType type) -> Type { + SmallVector inputs; + if (failed(convertTypes(type.getInputs(), inputs))) return Type{}; + SmallVector results; + if (failed(convertTypes(type.getResults(), results))) return Type{}; + return FunctionType::get(type.getContext(), inputs, results); + }); + + auto materializeCast = [](OpBuilder &Builder, Type ResultType, + ValueRange Inputs, Location Loc) -> Value { + if (Inputs.size() != 1) return Value(); + return Builder.create(Loc, ResultType, Inputs[0]).getResult(0); + }; + + addSourceMaterialization(materializeCast); + addTargetMaterialization(materializeCast); + // Needed for region/block signature conversions (e.g. CFG block args). 
+ addArgumentMaterialization(materializeCast); + } +}; + +static constexpr unsigned kPTOIndexBitWidth = + 32; // keep consistent with IndexType conversion + +// Forward declarations (definitions below). +static emitc::OpaqueType getSignedIntOpaqueType(MLIRContext *ctx, + unsigned bitWidth); +static emitc::OpaqueType getUnsignedIntOpaqueType(MLIRContext *ctx, + unsigned bitWidth); +static emitc::OpaqueType getWiderSignedIntOpaqueType(MLIRContext *ctx, + unsigned bitWidth); +static emitc::OpaqueType getWiderUnsignedIntOpaqueType(MLIRContext *ctx, + unsigned bitWidth); +static Value makeEmitCOpaqueConstant(ConversionPatternRewriter &rewriter, + Location loc, Type type, + llvm::StringRef literal); +static Value makeEmitCIntConstant(ConversionPatternRewriter &rewriter, + Location loc, Type type, int64_t value); +static Value emitCCast(ConversionPatternRewriter &rewriter, Location loc, + Type dstType, Value src); +static Value castSignlessIntToUnsignedSameWidth(ConversionPatternRewriter &rewriter, + Location loc, Value v, + unsigned bitWidth); + +//===----------------------------------------------------------------------===// +// Arith -> EmitC (full dialect coverage for scalar ops) +//===----------------------------------------------------------------------===// + +template +struct ArithSimpleBinaryToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Type dstTy = this->getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getOperands()); + return success(); + } +}; + +// Integer bitwise ops (andi/ori/xori) on signless integers: perform in unsigned +// to avoid signedness pitfalls, then cast back. 
+template +struct ArithUnsignedBitwiseBinaryToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + const unsigned bitWidth = + intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = this->getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + if (bitWidth == 1) { + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getLhs(), + adaptor.getRhs()); + return success(); + } + + auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); + Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value resU = rewriter.create(loc, uTy, lhsU, rhsU); + Value result = emitCCast(rewriter, loc, dstTy, resU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithDivUIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::DivUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + const unsigned bitWidth = + intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); + Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value divU = rewriter.create(loc, uTy, lhsU, rhsU); + Value result = emitCCast(rewriter, loc, dstTy, divU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithRemUIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::RemUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + const unsigned bitWidth = + intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); + Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value remU = rewriter.create(loc, uTy, lhsU, rhsU); + Value result = emitCCast(rewriter, loc, dstTy, remU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithCeilDivUIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::CeilDivUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + const unsigned bitWidth = + intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); + Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value one = makeEmitCIntConstant(rewriter, loc, uTy, 1); + Value rhsMinusOne = rewriter.create(loc, uTy, rhsU, one); + Value num = rewriter.create(loc, uTy, lhsU, rhsMinusOne); + Value divU = rewriter.create(loc, uTy, num, rhsU); + Value result = emitCCast(rewriter, loc, dstTy, divU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithCeilDivSIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::CeilDivSIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + Value zero = makeEmitCIntConstant(rewriter, loc, dstTy, 0); + Value one = makeEmitCIntConstant(rewriter, loc, dstTy, 1); + + Value q0 = rewriter.create(loc, dstTy, adaptor.getLhs(), + adaptor.getRhs()); + Value r = rewriter.create(loc, dstTy, adaptor.getLhs(), + adaptor.getRhs()); + + Value rNeZero = rewriter.create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::ne, r, + zero); + Value lhsLt0 = + rewriter.create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::lt, adaptor.getLhs(), + zero); + Value rhsLt0 = + rewriter.create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::lt, adaptor.getRhs(), + zero); + Value signsSame = + rewriter.create(loc, 
rewriter.getI1Type(), + emitc::CmpPredicate::eq, lhsLt0, rhsLt0); + Value adjust = + rewriter.create(loc, rewriter.getI1Type(), + rNeZero, signsSame); + + Value qPlusOne = rewriter.create(loc, dstTy, q0, one); + Value result = rewriter.create(loc, dstTy, adjust, + qPlusOne, q0); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithFloorDivSIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::FloorDivSIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + Value zero = makeEmitCIntConstant(rewriter, loc, dstTy, 0); + Value one = makeEmitCIntConstant(rewriter, loc, dstTy, 1); + + Value q0 = rewriter.create(loc, dstTy, adaptor.getLhs(), + adaptor.getRhs()); + Value r = rewriter.create(loc, dstTy, adaptor.getLhs(), + adaptor.getRhs()); + + Value rNeZero = rewriter.create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::ne, r, + zero); + Value lhsLt0 = + rewriter.create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::lt, adaptor.getLhs(), + zero); + Value rhsLt0 = + rewriter.create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::lt, adaptor.getRhs(), + zero); + Value signsDifferent = + rewriter.create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::ne, lhsLt0, rhsLt0); + Value adjust = + rewriter.create(loc, rewriter.getI1Type(), + rNeZero, signsDifferent); + + Value qMinusOne = rewriter.create(loc, dstTy, q0, one); + Value result = rewriter.create(loc, dstTy, adjust, + qMinusOne, q0); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithShiftLeftToEmitC : public OpConversionPattern { + using 
OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::ShLIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + const unsigned bitWidth = + intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + if (bitWidth == 1) { + // Compute on u8 and truncate to i1. + auto u8Ty = getUnsignedIntOpaqueType(rewriter.getContext(), 8); + Value lhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getLhs()); + Value rhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getRhs()); + Value sh = rewriter.create(loc, u8Ty, lhsU8, + rhsU8); + Value masked = + rewriter.create(loc, u8Ty, sh, + makeEmitCIntConstant(rewriter, loc, + u8Ty, 1)); + rewriter.replaceOp(op, emitCCast(rewriter, loc, dstTy, masked)); + return success(); + } + + auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); + Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value shU = + rewriter.create(loc, uTy, lhsU, rhsU); + Value result = emitCCast(rewriter, loc, dstTy, shU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithShiftRightUIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::ShRUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + 
const unsigned bitWidth = + intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + if (bitWidth == 1) { + // (x >> y) on i1 is either x (y==0) or 0 (y!=0); approximate in u8. + auto u8Ty = getUnsignedIntOpaqueType(rewriter.getContext(), 8); + Value lhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getLhs()); + Value rhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getRhs()); + Value sh = rewriter.create(loc, u8Ty, lhsU8, + rhsU8); + Value masked = + rewriter.create(loc, u8Ty, sh, + makeEmitCIntConstant(rewriter, loc, + u8Ty, 1)); + rewriter.replaceOp(op, emitCCast(rewriter, loc, dstTy, masked)); + return success(); + } + + auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); + Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value shU = + rewriter.create(loc, uTy, lhsU, rhsU); + Value result = emitCCast(rewriter, loc, dstTy, shU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithShiftRightSIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::ShRSIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + const unsigned bitWidth = + intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + if (bitWidth == 1) { + // (x >> y) on i1 is either x (y==0) or 0 (y!=0); approximate in u8. 
+ auto u8Ty = getUnsignedIntOpaqueType(rewriter.getContext(), 8); + Value lhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getLhs()); + Value rhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getRhs()); + Value sh = rewriter.create(loc, u8Ty, lhsU8, + rhsU8); + Value masked = + rewriter.create(loc, u8Ty, sh, + makeEmitCIntConstant(rewriter, loc, + u8Ty, 1)); + rewriter.replaceOp(op, emitCCast(rewriter, loc, dstTy, masked)); + return success(); + } + + // Signed arithmetic shift; cast RHS to unsigned to interpret shift amount. + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value sh = + rewriter.create(loc, dstTy, adaptor.getLhs(), + rhsU); + rewriter.replaceOp(op, sh); + return success(); + } +}; + +struct ArithNegFToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::NegFOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getOperand()); + return success(); + } +}; + +struct ArithRemFToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::RemFOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + + // Use builtin `fmod` when possible. For f16, compute in float and cast back. 
+ Type callTy = dstTy; + Value lhs = adaptor.getLhs(); + Value rhs = adaptor.getRhs(); + + if (auto opFloatTy = dyn_cast(op.getType())) { + if (opFloatTy.isF16()) { + auto f32Ty = emitc::OpaqueType::get(rewriter.getContext(), "float"); + lhs = emitCCast(rewriter, loc, f32Ty, lhs); + rhs = emitCCast(rewriter, loc, f32Ty, rhs); + callTy = f32Ty; + } + } + + // Prefer `__builtin_fmod*` to avoid relying on extra headers. + llvm::StringRef callee = "__builtin_fmod"; + if (auto opFloatTy = dyn_cast(op.getType())) { + if (opFloatTy.isF32() || opFloatTy.isF16()) + callee = "__builtin_fmodf"; + else if (opFloatTy.isF64()) + callee = "__builtin_fmod"; + } + + auto call = rewriter.create( + loc, TypeRange{callTy}, callee, ValueRange{lhs, rhs}, + /*args=*/ArrayAttr{}, /*template_args=*/ArrayAttr{}); + Value result = call.getResult(0); + if (callTy != dstTy) + result = emitCCast(rewriter, loc, dstTy, result); + + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithSelectToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::SelectOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!op.getCondition().getType().isInteger(1)) + return rewriter.notifyMatchFailure( + op, "only scalar i1 conditions supported for arith.select"); + + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + + auto cond = + rewriter.create(op.getLoc(), dstTy, + adaptor.getCondition(), + adaptor.getTrueValue(), + adaptor.getFalseValue()); + rewriter.replaceOp(op, cond.getResult()); + return success(); + } +}; + +struct ArithExtUIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::ExtUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto dstIntTy = dyn_cast(op.getType()); + auto srcIntTy = 
dyn_cast(op.getIn().getType()); + if (!dstIntTy || !srcIntTy) + return rewriter.notifyMatchFailure(op, "expected scalar integer types"); + + Type dstTy = getTypeConverter()->convertType(dstIntTy); + if (!dstTy) + return failure(); + + // i1 -> iN: bool to integer already behaves as 0/1. + if (srcIntTy.getWidth() == 1) { + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); + return success(); + } + + auto uSrcTy = + getUnsignedIntOpaqueType(rewriter.getContext(), srcIntTy.getWidth()); + auto uDstTy = + getUnsignedIntOpaqueType(rewriter.getContext(), dstIntTy.getWidth()); + Value srcU = + castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getIn(), + srcIntTy.getWidth()); + Value extU = emitCCast(rewriter, loc, uDstTy, srcU); + Value result = emitCCast(rewriter, loc, dstTy, extU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithExtSIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::ExtSIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto dstIntTy = dyn_cast(op.getType()); + auto srcIntTy = dyn_cast(op.getIn().getType()); + if (!dstIntTy || !srcIntTy) + return rewriter.notifyMatchFailure(op, "expected scalar integer types"); + + Type dstTy = getTypeConverter()->convertType(dstIntTy); + if (!dstTy) + return failure(); + + // i1 sign-extension: 0 -> 0, 1 -> -1. 
+ if (srcIntTy.getWidth() == 1) { + Value zero = makeEmitCIntConstant(rewriter, loc, dstTy, 0); + Value asInt = emitCCast(rewriter, loc, dstTy, adaptor.getIn()); + Value neg = rewriter.create(loc, dstTy, zero, asInt).getResult(); + rewriter.replaceOp(op, neg); + return success(); + } + + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); + return success(); + } +}; + +template +struct ArithCastToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(CastOp op, typename CastOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Type dstTy = this->getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); + return success(); + } +}; + +struct ArithIndexCastUIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::IndexCastUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + + // MemRef casts are handled elsewhere; for safety, fall back to emitc.cast. 
+ if (isa(op.getIn().getType()) || isa(op.getType())) { + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); + return success(); + } + + auto getBW = [](Type t) -> std::optional { + if (auto i = dyn_cast(t)) + return i.getWidth(); + if (isa(t)) + return kPTOIndexBitWidth; + return std::nullopt; + }; + + auto srcBW = getBW(op.getIn().getType()); + auto dstBW = getBW(op.getType()); + if (!srcBW || !dstBW) + return rewriter.notifyMatchFailure(op, "unsupported index_castui types"); + + if (*dstBW <= *srcBW) { + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); + return success(); + } + + auto uSrcTy = getUnsignedIntOpaqueType(rewriter.getContext(), *srcBW); + auto uDstTy = getUnsignedIntOpaqueType(rewriter.getContext(), *dstBW); + Value srcU = emitCCast(rewriter, loc, uSrcTy, adaptor.getIn()); + Value extU = emitCCast(rewriter, loc, uDstTy, srcU); + Value result = emitCCast(rewriter, loc, dstTy, extU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithUIToFPToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::UIToFPOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto srcIntTy = dyn_cast(op.getIn().getType()); + if (!srcIntTy) + return rewriter.notifyMatchFailure(op, "expected scalar integer input"); + + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + + // Convert via an unsigned integer type of the same width. 
+ if (srcIntTy.getWidth() == 1) { + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); + return success(); + } + auto uSrcTy = + getUnsignedIntOpaqueType(rewriter.getContext(), srcIntTy.getWidth()); + Value srcU = + castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getIn(), + srcIntTy.getWidth()); + Value fp = rewriter.create(loc, dstTy, srcU).getResult(); + rewriter.replaceOp(op, fp); + return success(); + } +}; + +struct ArithFPToUIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::FPToUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto dstIntTy = dyn_cast(op.getType()); + if (!dstIntTy) + return rewriter.notifyMatchFailure(op, "expected scalar integer result"); + + Type dstTy = getTypeConverter()->convertType(dstIntTy); + if (!dstTy) + return failure(); + + auto uDstTy = + getUnsignedIntOpaqueType(rewriter.getContext(), dstIntTy.getWidth()); + Value asU = rewriter.create(loc, uDstTy, adaptor.getIn()).getResult(); + Value result = emitCCast(rewriter, loc, dstTy, asU); + rewriter.replaceOp(op, result); + return success(); + } +}; + +struct ArithBitcastToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::BitcastOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + + // For pointer-like types, a regular cast is fine. + if (isa(dstTy)) { + rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); + return success(); + } + + // Only support scalar int/float/index bitcasts here. 
+ auto srcTy = op.getIn().getType(); + auto dstOrigTy = op.getType(); + + auto getBitWidth = [](Type t) -> std::optional { + if (auto it = dyn_cast(t)) + return it.getWidth(); + if (auto ft = dyn_cast(t)) + return ft.getWidth(); + if (isa(t)) + return kPTOIndexBitWidth; + return std::nullopt; + }; + auto srcBW = getBitWidth(srcTy); + auto dstBW = getBitWidth(dstOrigTy); + if (!srcBW || !dstBW || *srcBW != *dstBW) + return rewriter.notifyMatchFailure(op, "bitcast requires equal bitwidth"); + + // Determine the template argument from the destination type string. + auto dstOpaque = dyn_cast(dstTy); + if (!dstOpaque) + return rewriter.notifyMatchFailure(op, "expected emitc opaque dest type"); + + auto templateArgs = + rewriter.getArrayAttr({emitc::OpaqueAttr::get(rewriter.getContext(), + dstOpaque.getValue())}); + auto call = rewriter.create( + loc, TypeRange{dstTy}, "ptoas_bitcast", /*operands=*/ValueRange{adaptor.getIn()}, + /*args=*/ArrayAttr{}, /*template_args=*/templateArgs); + rewriter.replaceOp(op, call.getResult(0)); + return success(); + } +}; + +// arith.cmpf lowering with ordered/unordered semantics. 
+struct ArithCmpFToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + static Value isNaN(ConversionPatternRewriter &rewriter, Location loc, + Value v) { + return rewriter + .create(loc, rewriter.getI1Type(), emitc::CmpPredicate::ne, + v, v) + .getResult(); + } + + static Value isNotNaN(ConversionPatternRewriter &rewriter, Location loc, + Value v) { + return rewriter + .create(loc, rewriter.getI1Type(), emitc::CmpPredicate::eq, + v, v) + .getResult(); + } + + LogicalResult matchAndRewrite(arith::CmpFOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!isa(op.getLhs().getType())) + return rewriter.notifyMatchFailure(op, "cmpf only supported on scalar floats"); + + auto loc = op.getLoc(); + auto i1Ty = rewriter.getI1Type(); + + bool unordered = false; + emitc::CmpPredicate pred = emitc::CmpPredicate::eq; + + switch (op.getPredicate()) { + case arith::CmpFPredicate::AlwaysFalse: { + auto cst = makeEmitCOpaqueConstant(rewriter, loc, i1Ty, "false"); + rewriter.replaceOp(op, cst); + return success(); + } + case arith::CmpFPredicate::AlwaysTrue: { + auto cst = makeEmitCOpaqueConstant(rewriter, loc, i1Ty, "true"); + rewriter.replaceOp(op, cst); + return success(); + } + case arith::CmpFPredicate::OEQ: + unordered = false; + pred = emitc::CmpPredicate::eq; + break; + case arith::CmpFPredicate::OGT: + unordered = false; + pred = emitc::CmpPredicate::gt; + break; + case arith::CmpFPredicate::OGE: + unordered = false; + pred = emitc::CmpPredicate::ge; + break; + case arith::CmpFPredicate::OLT: + unordered = false; + pred = emitc::CmpPredicate::lt; + break; + case arith::CmpFPredicate::OLE: + unordered = false; + pred = emitc::CmpPredicate::le; + break; + case arith::CmpFPredicate::ONE: + unordered = false; + pred = emitc::CmpPredicate::ne; + break; + case arith::CmpFPredicate::ORD: { + Value ordered = rewriter.create( + loc, i1Ty, isNotNaN(rewriter, loc, adaptor.getLhs()), + isNotNaN(rewriter, loc, 
adaptor.getRhs())); + rewriter.replaceOp(op, ordered); + return success(); + } + case arith::CmpFPredicate::UEQ: + unordered = true; + pred = emitc::CmpPredicate::eq; + break; + case arith::CmpFPredicate::UGT: + unordered = true; + pred = emitc::CmpPredicate::gt; + break; + case arith::CmpFPredicate::UGE: + unordered = true; + pred = emitc::CmpPredicate::ge; + break; + case arith::CmpFPredicate::ULT: + unordered = true; + pred = emitc::CmpPredicate::lt; + break; + case arith::CmpFPredicate::ULE: + unordered = true; + pred = emitc::CmpPredicate::le; + break; + case arith::CmpFPredicate::UNE: + unordered = true; + pred = emitc::CmpPredicate::ne; + break; + case arith::CmpFPredicate::UNO: { + Value unord = rewriter.create( + loc, i1Ty, isNaN(rewriter, loc, adaptor.getLhs()), + isNaN(rewriter, loc, adaptor.getRhs())); + rewriter.replaceOp(op, unord); + return success(); + } + } + + Value cmp = rewriter + .create(loc, i1Ty, pred, adaptor.getLhs(), + adaptor.getRhs()) + .getResult(); + + Value unord = rewriter.create( + loc, i1Ty, isNaN(rewriter, loc, adaptor.getLhs()), + isNaN(rewriter, loc, adaptor.getRhs())); + Value ord = rewriter.create( + loc, i1Ty, isNotNaN(rewriter, loc, adaptor.getLhs()), + isNotNaN(rewriter, loc, adaptor.getRhs())); + + if (unordered) { + Value res = + rewriter.create(loc, i1Ty, unord, cmp).getResult(); + rewriter.replaceOp(op, res); + return success(); + } + + Value res = + rewriter.create(loc, i1Ty, ord, cmp).getResult(); + rewriter.replaceOp(op, res); + return success(); + } +}; + +struct ArithAddUIExtendedToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(arith::AddUIExtendedOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getSum().getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, + "expected scalar 
integer or index operands"); + + const unsigned bitWidth = + intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + SmallVector newResultTypes; + if (failed(getTypeConverter()->convertTypes(op->getResultTypes(), + newResultTypes))) + return failure(); + if (newResultTypes.size() != 2) + return failure(); + + Type sumDstTy = newResultTypes[0]; + Type overflowDstTy = newResultTypes[1]; + + auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); + auto wideTy = getWiderUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); + + Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value lhsWide = emitCCast(rewriter, loc, wideTy, lhsU); + Value rhsWide = emitCCast(rewriter, loc, wideTy, rhsU); + Value sumWide = + rewriter.create(loc, wideTy, lhsWide, rhsWide).getResult(); + + Value sumN = emitCCast(rewriter, loc, uTy, sumWide); + Value sum = emitCCast(rewriter, loc, sumDstTy, sumN); + + Value shiftAmt = makeEmitCIntConstant(rewriter, loc, wideTy, bitWidth); + Value high = rewriter + .create(loc, wideTy, sumWide, + shiftAmt) + .getResult(); + Value zeroWide = makeEmitCIntConstant(rewriter, loc, wideTy, 0); + Value overflow = + rewriter + .create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::ne, high, zeroWide) + .getResult(); + overflow = emitCCast(rewriter, loc, overflowDstTy, overflow); + + rewriter.replaceOp(op, {sum, overflow}); + return success(); + } +}; + +template +struct ArithMulExtendedToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getResult(0).getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return 
rewriter.notifyMatchFailure(op, + "expected scalar integer or index operands"); + + const unsigned bitWidth = + intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + SmallVector newResultTypes; + if (failed(this->getTypeConverter()->convertTypes(op->getResultTypes(), + newResultTypes))) + return failure(); + if (newResultTypes.size() != 2) + return failure(); + + Type lowDstTy = newResultTypes[0]; + Type highDstTy = newResultTypes[1]; + + Type wideTy = isUnsigned ? (Type)getWiderUnsignedIntOpaqueType(rewriter.getContext(), + bitWidth) + : (Type)getWiderSignedIntOpaqueType(rewriter.getContext(), + bitWidth); + + Value lhsWide; + Value rhsWide; + if constexpr (isUnsigned) { + Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + lhsWide = emitCCast(rewriter, loc, wideTy, lhsU); + rhsWide = emitCCast(rewriter, loc, wideTy, rhsU); + } else { + lhsWide = emitCCast(rewriter, loc, wideTy, adaptor.getLhs()); + rhsWide = emitCCast(rewriter, loc, wideTy, adaptor.getRhs()); + } + + Value prodWide = + rewriter.create(loc, wideTy, lhsWide, rhsWide).getResult(); + Value low = emitCCast(rewriter, loc, lowDstTy, prodWide); + + Value shiftAmt = makeEmitCIntConstant(rewriter, loc, wideTy, bitWidth); + Value highWide = rewriter + .create(loc, wideTy, prodWide, + shiftAmt) + .getResult(); + Value high = emitCCast(rewriter, loc, highDstTy, highWide); + + rewriter.replaceOp(op, {low, high}); + return success(); + } +}; + +using ArithMulSIExtendedToEmitC = + ArithMulExtendedToEmitC; +using ArithMulUIExtendedToEmitC = + ArithMulExtendedToEmitC; + +struct ArithMinMaxIToEmitCBase { + static Value makeSelect(ConversionPatternRewriter &rewriter, Location loc, + Type dstTy, Value cond, Value trueV, Value falseV) { + return rewriter + .create(loc, dstTy, cond, trueV, falseV) + .getResult(); + } +}; + +struct ArithMaxSIToEmitC : public 
OpConversionPattern, + ArithMinMaxIToEmitCBase { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::MaxSIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + Value cond = rewriter + .create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::lt, + adaptor.getLhs(), adaptor.getRhs()) + .getResult(); + Value res = makeSelect(rewriter, loc, dstTy, cond, adaptor.getRhs(), + adaptor.getLhs()); + rewriter.replaceOp(op, res); + return success(); + } +}; + +struct ArithMinSIToEmitC : public OpConversionPattern, + ArithMinMaxIToEmitCBase { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::MinSIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + Value cond = rewriter + .create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::lt, + adaptor.getLhs(), adaptor.getRhs()) + .getResult(); + Value res = makeSelect(rewriter, loc, dstTy, cond, adaptor.getLhs(), + adaptor.getRhs()); + rewriter.replaceOp(op, res); + return success(); + } +}; + +struct ArithMaxUIToEmitC : public OpConversionPattern, + ArithMinMaxIToEmitCBase { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::MaxUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + const unsigned bitWidth = + intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + Value lhsU = + castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = + castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value cond = rewriter + .create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::lt, lhsU, rhsU) + .getResult(); + Value res = makeSelect(rewriter, loc, dstTy, cond, adaptor.getRhs(), + adaptor.getLhs()); + rewriter.replaceOp(op, res); + return success(); + } +}; + +struct ArithMinUIToEmitC : public OpConversionPattern, + ArithMinMaxIToEmitCBase { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::MinUIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Type opTy = op.getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); + + const unsigned bitWidth = + intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); + + Type dstTy = getTypeConverter()->convertType(opTy); + if (!dstTy) + return failure(); + + Value lhsU = + castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), + bitWidth); + Value rhsU = + castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), + bitWidth); + Value cond = rewriter + .create(loc, rewriter.getI1Type(), + emitc::CmpPredicate::lt, lhsU, rhsU) + .getResult(); + Value res = makeSelect(rewriter, loc, dstTy, cond, adaptor.getLhs(), + adaptor.getRhs()); + rewriter.replaceOp(op, res); + return success(); + } +}; + +// Floating-point max/min variants. 
+struct ArithFloatMinMaxToEmitCBase { // Helpers shared by the floating-point min/max lowerings that inherit from this struct.
+  static Value isNaN(ConversionPatternRewriter &rewriter, Location loc,
+                     Value v) {
+    return rewriter
+        .create(loc, rewriter.getI1Type(), emitc::CmpPredicate::ne,
+                v, v) // compares v with itself using `ne`; the result type is i1
+        .getResult();
+  }
+  // Builds a constant of type `ty` from the literal "0.0f" (the same literal whatever `ty` is).
+  static Value makeFZero(ConversionPatternRewriter &rewriter, Location loc,
+                         Type ty) {
+    return makeEmitCOpaqueConstant(rewriter, loc, ty, "0.0f");
+  }
+};
+// Lowers arith.maxnumf: if one operand is NaN the other operand is returned, otherwise the larger one.
+struct ArithMaxNumFToEmitC : public OpConversionPattern,
+                             ArithFloatMinMaxToEmitCBase {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult matchAndRewrite(arith::MaxNumFOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+    Type dstTy = getTypeConverter()->convertType(op.getType());
+    if (!dstTy)
+      return failure();
+    // NOTE(review): template arguments (e.g. the op type given to rewriter.create) are missing in this copy of the patch. Creates taking a CmpPredicate read as compares; creates taking (cond, a, b) presumably select `cond ? a : b` - confirm upstream.
+    Value lhsNaN = isNaN(rewriter, loc, adaptor.getLhs());
+    Value rhsNaN = isNaN(rewriter, loc, adaptor.getRhs());
+    // maxNoNaN = (lhs < rhs) ? rhs : lhs
+    Value cmpLt = rewriter
+                      .create(loc, rewriter.getI1Type(),
+                              emitc::CmpPredicate::lt,
+                              adaptor.getLhs(), adaptor.getRhs())
+                      .getResult();
+    Value maxNoNaN =
+        rewriter
+            .create(loc, dstTy, cmpLt, adaptor.getRhs(),
+                    adaptor.getLhs())
+            .getResult();
+    // rhs NaN -> lhs; lhs NaN -> rhs. The lhs test is outermost, so it decides when both are NaN.
+    Value rhsOrMax =
+        rewriter
+            .create(loc, dstTy, rhsNaN, adaptor.getLhs(),
+                    maxNoNaN)
+            .getResult();
+    Value res =
+        rewriter
+            .create(loc, dstTy, lhsNaN, adaptor.getRhs(),
+                    rhsOrMax)
+            .getResult();
+    rewriter.replaceOp(op, res);
+    return success();
+  }
+};
+// Lowers arith.minnumf: same structure as the maxnumf lowering, with the compare-based select picking the smaller operand.
+struct ArithMinNumFToEmitC : public OpConversionPattern,
+                             ArithFloatMinMaxToEmitCBase {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult matchAndRewrite(arith::MinNumFOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+    Type dstTy = getTypeConverter()->convertType(op.getType());
+    if (!dstTy)
+      return failure();
+
+    Value lhsNaN = isNaN(rewriter, loc, adaptor.getLhs());
+    Value rhsNaN = isNaN(rewriter, loc, adaptor.getRhs());
+    // minNoNaN = (lhs < rhs) ? lhs : rhs (reading the 3-operand create as a select - confirm upstream)
+    Value cmpLt = rewriter
+                      .create(loc, rewriter.getI1Type(),
+                              emitc::CmpPredicate::lt,
+                              adaptor.getLhs(), adaptor.getRhs())
+                      .getResult();
+    Value minNoNaN =
+        rewriter
+            .create(loc, dstTy, cmpLt, adaptor.getLhs(),
+                    adaptor.getRhs())
+            .getResult();
+    // rhs NaN -> lhs; lhs NaN -> rhs (outermost).
+    Value rhsOrMin =
+        rewriter
+            .create(loc, dstTy, rhsNaN, adaptor.getLhs(),
+                    minNoNaN)
+            .getResult();
+    Value res =
+        rewriter
+            .create(loc, dstTy, lhsNaN, adaptor.getRhs(),
+                    rhsOrMin)
+            .getResult();
+    rewriter.replaceOp(op, res);
+    return success();
+  }
+};
+// NaN-propagating float min/max lowering; `isMaximum` selects max vs. min, and equal zeros are tie-broken by sign.
+template
+struct ArithMinMaxFPropagateNaNToEmitC : public OpConversionPattern,
+                                         ArithFloatMinMaxToEmitCBase {
+  using OpConversionPattern::OpConversionPattern;
+  // Result is built inside-out: plain compare -> signed-zero tie-break -> NaN propagation.
+  LogicalResult
+  matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (!isa(op.getType()))
+      return rewriter.notifyMatchFailure(op, "expected scalar float type");
+
+    auto loc = op.getLoc();
+    Type dstTy = this->getTypeConverter()->convertType(op.getType());
+    if (!dstTy)
+      return failure();
+
+    Value lhsNaN = isNaN(rewriter, loc, adaptor.getLhs());
+    Value rhsNaN = isNaN(rewriter, loc, adaptor.getRhs());
+
+    // Basic compare-based min/max.
+    Value cmpLt = rewriter
+                      .create(loc, rewriter.getI1Type(),
+                              emitc::CmpPredicate::lt,
+                              adaptor.getLhs(), adaptor.getRhs())
+                      .getResult();
+    Value candidate = rewriter
+                          .create(
+                              loc, dstTy, cmpLt,
+                              isMaximum ? adaptor.getRhs() : adaptor.getLhs(),
+                              isMaximum ? adaptor.getLhs() : adaptor.getRhs())
+                          .getResult();
+    // bothZero below combines (lhs == rhs) with (lhs == 0.0f); presumably a logical AND - op type missing in this copy.
+    // Fix signed zero tie-breaking for equal zeros.
+    Value zero = makeFZero(rewriter, loc, dstTy);
+    Value eq = rewriter
+                   .create(loc, rewriter.getI1Type(),
+                           emitc::CmpPredicate::eq,
+                           adaptor.getLhs(), adaptor.getRhs())
+                   .getResult();
+    Value lhsZero = rewriter
+                        .create(loc, rewriter.getI1Type(),
+                                emitc::CmpPredicate::eq,
+                                adaptor.getLhs(), zero)
+                        .getResult();
+    Value bothZero = rewriter
+                         .create(loc, rewriter.getI1Type(),
+                                 eq, lhsZero)
+                         .getResult();
+    // Read lhs's sign bit: "ptoas_bitcast" lhs to a same-width unsigned integer, then combine with a mask built from 1 and (width - 1) (presumably 1 << (width - 1), then AND).
+    auto floatTy = cast(op.getType());
+    auto bitsTy = getUnsignedIntOpaqueType(rewriter.getContext(), floatTy.getWidth());
+    auto templateArgs =
+        rewriter.getArrayAttr({emitc::OpaqueAttr::get(rewriter.getContext(),
+                                                     cast(bitsTy).getValue())});
+    Value lhsBits =
+        rewriter
+            .create(loc, TypeRange{bitsTy}, "ptoas_bitcast",
+                    ValueRange{adaptor.getLhs()},
+                    /*args=*/ArrayAttr{},
+                    /*template_args=*/templateArgs)
+            .getResult(0);
+
+    Value oneBits = makeEmitCIntConstant(rewriter, loc, bitsTy, 1);
+    Value shAmt = makeEmitCIntConstant(rewriter, loc, bitsTy,
+                                       floatTy.getWidth() - 1);
+    Value signMask = rewriter
+                         .create(loc, bitsTy, oneBits,
+                                 shAmt)
+                         .getResult();
+    Value signBit = rewriter
+                        .create(loc, bitsTy, lhsBits, signMask)
+                        .getResult();
+    Value zeroBits = makeEmitCIntConstant(rewriter, loc, bitsTy, 0);
+    Value lhsIsNegZero =
+        rewriter
+            .create(loc, rewriter.getI1Type(),
+                    emitc::CmpPredicate::ne, signBit, zeroBits)
+            .getResult();
+    // `tie` is only used when bothZero holds: with lhs's sign bit set, max takes rhs and min takes lhs; otherwise the reverse.
+    Value tie =
+        rewriter
+            .create(
+                loc, dstTy, lhsIsNegZero,
+                isMaximum ? adaptor.getRhs() : adaptor.getLhs(),
+                isMaximum ? adaptor.getLhs() : adaptor.getRhs())
+            .getResult();
+    Value noNaN = rewriter
+                      .create(loc, dstTy, bothZero, tie,
+                              candidate)
+                      .getResult();
+
+    // Propagate NaN: if lhs is NaN return lhs, else if rhs is NaN return rhs.
+    Value rhsOrNoNaN = rewriter
+                           .create(loc, dstTy, rhsNaN,
+                                   adaptor.getRhs(), noNaN)
+                           .getResult();
+    Value res = rewriter
+                    .create(loc, dstTy, lhsNaN,
+                            adaptor.getLhs(), rhsOrNoNaN)
+                    .getResult();
+    rewriter.replaceOp(op, res);
+    return success();
+  }
+};
+// Concrete instantiations of the template above (their template arguments are missing in this copy of the patch).
+using ArithMaximumFToEmitC =
+    ArithMinMaxFPropagateNaNToEmitC;
+using ArithMinimumFToEmitC =
+    ArithMinMaxFPropagateNaNToEmitC;
+
+//===----------------------------------------------------------------------===//
+// Arith -> EmitC helpers
+//===----------------------------------------------------------------------===//
+// Maps a bit width to an opaque signed C++ integer type; width 1 shares int8_t, unknown widths log to llvm::errs() and fall back to int64_t.
+static emitc::OpaqueType getSignedIntOpaqueType(MLIRContext *ctx,
+                                                unsigned bitWidth) {
+  switch (bitWidth) {
+  case 1:
+    return emitc::OpaqueType::get(ctx, "int8_t");
+  case 8:
+    return emitc::OpaqueType::get(ctx, "int8_t");
+  case 16:
+    return emitc::OpaqueType::get(ctx, "int16_t");
+  case 32:
+    return emitc::OpaqueType::get(ctx, "int32_t");
+  case 64:
+    return emitc::OpaqueType::get(ctx, "int64_t");
+  case 128:
+    return emitc::OpaqueType::get(ctx, "__int128");
+  default:
+    llvm::errs() << "[Debug] Unsupported signed integer bitwidth: " << bitWidth
+                 << "\n";
+    return emitc::OpaqueType::get(ctx, "int64_t");
+  }
+}
+// Unsigned counterpart: width 1 shares uint8_t, unknown widths log and fall back to uint64_t.
+static emitc::OpaqueType getUnsignedIntOpaqueType(MLIRContext *ctx,
+                                                  unsigned bitWidth) {
+  switch (bitWidth) {
+  case 1:
+    return emitc::OpaqueType::get(ctx, "uint8_t");
+  case 8:
+    return emitc::OpaqueType::get(ctx, "uint8_t");
+  case 16:
+    return emitc::OpaqueType::get(ctx, "uint16_t");
+  case 32:
+    return emitc::OpaqueType::get(ctx, "uint32_t");
+  case 64:
+    return emitc::OpaqueType::get(ctx, "uint64_t");
+  case 128:
+    return emitc::OpaqueType::get(ctx, "unsigned __int128");
+  default:
+    llvm::errs() << "[Debug] Unsupported unsigned integer bitwidth: "
+                 << bitWidth << "\n";
+    return emitc::OpaqueType::get(ctx, "uint64_t");
+  }
+}
+// Returns the signed type one step wider than `bitWidth` (1/8 -> 16, 16 -> 32, 32 -> 64, 64 -> 128); every other width also yields the 128-bit type.
+static emitc::OpaqueType getWiderSignedIntOpaqueType(MLIRContext *ctx,
+                                                     unsigned bitWidth) {
+  switch (bitWidth) {
+  case 1:
+  case 8:
+    return getSignedIntOpaqueType(ctx, 16);
+  case 16:
+    return getSignedIntOpaqueType(ctx, 32);
+  case 32:
+    return getSignedIntOpaqueType(ctx, 64);
+  case 64:
+    return getSignedIntOpaqueType(ctx, 128);
+  default:
+    return getSignedIntOpaqueType(ctx, 128);
+  }
+}
+// Unsigned counterpart of the widening helper, with the same width mapping.
+static emitc::OpaqueType getWiderUnsignedIntOpaqueType(MLIRContext *ctx,
+                                                       unsigned bitWidth) {
+  switch (bitWidth) {
+  case 1:
+  case 8:
+    return getUnsignedIntOpaqueType(ctx, 16);
+  case 16:
+    return getUnsignedIntOpaqueType(ctx, 32);
+  case 32:
+    return getUnsignedIntOpaqueType(ctx, 64);
+  case 64:
+    return getUnsignedIntOpaqueType(ctx, 128);
+  default:
+    return getUnsignedIntOpaqueType(ctx, 128);
+  }
+}
+// Creates an op of type `type` carrying `literal` verbatim as an emitc::OpaqueAttr (presumably an emitc constant; op type missing in this copy).
+static Value makeEmitCOpaqueConstant(ConversionPatternRewriter &rewriter,
+                                     Location loc, Type type,
+                                     llvm::StringRef literal) {
+  auto attr = emitc::OpaqueAttr::get(rewriter.getContext(), literal);
+  return rewriter.create(loc, type, attr);
+}
+// Integer convenience wrapper: the literal is std::to_string(value).
+static Value makeEmitCIntConstant(ConversionPatternRewriter &rewriter,
+                                  Location loc, Type type, int64_t value) {
+  return makeEmitCOpaqueConstant(rewriter, loc, type, std::to_string(value));
+}
+// Returns `src` unchanged when it already has `dstType`; otherwise createOrFold's a conversion of `src` to `dstType` (presumably an emitc cast).
+static Value emitCCast(ConversionPatternRewriter &rewriter, Location loc,
+                       Type dstType, Value src) {
+  if (src.getType() == dstType)
+    return src;
+  return rewriter.createOrFold(loc, dstType, src);
+}
+
+// For signless iN integers lowered to signed C++ types, this creates a value
+// representing the same N-bit pattern in an unsigned C++ type of the same
+// width. This avoids incorrect sign-extension when later widening to a larger
+// unsigned type.
+static Value castSignlessIntToUnsignedSameWidth(ConversionPatternRewriter &rewriter,
+                                                Location loc, Value v,
+                                                unsigned bitWidth) {
+  auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth);
+  return emitCCast(rewriter, loc, uTy, v); // converts v to the unsigned opaque type chosen for bitWidth
+}
+// Lowers arith.muli on integer/index values: multiply in the same-width unsigned C++ type, then convert back to the converted result type.
+struct ArithMulIToEmitC : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(arith::MulIOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+    // NOTE(review): template arguments (dyn_cast/isa/static_cast targets, created op types) are missing in this copy of the patch; restore them from upstream before building.
+    Type opTy = op.getType();
+    auto intTy = dyn_cast(opTy);
+    const bool isIndex = isa(opTy);
+    if (!intTy && !isIndex)
+      return rewriter.notifyMatchFailure(op, "expected scalar integer or index type");
+    // Index values use the fixed width kPTOIndexBitWidth.
+    const unsigned bitWidth =
+        intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth);
+
+    Type dstTy = getTypeConverter()->convertType(opTy);
+    if (!dstTy)
+      return failure();
+    // NOTE(review): the i1 path below builds its op with the unconverted `opTy` rather than `dstTy` - confirm that is intended.
+    // i1 mul is equivalent to bitwise AND (mod 2 arithmetic).
+    if (bitWidth == 1) {
+      rewriter.replaceOpWithNewOp(op, opTy, adaptor.getLhs(),
+                                  adaptor.getRhs());
+      return success();
+    }
+    // Presumably done in unsigned so that wraparound is well-defined in C++ (signed overflow is UB) - TODO confirm.
+    auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth);
+    Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(),
+                                                    bitWidth);
+    Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(),
+                                                    bitWidth);
+    Value mulU = rewriter.create(loc, uTy, lhsU, rhsU);
+    Value result = emitCCast(rewriter, loc, dstTy, mulU);
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+// Lowers arith.addi with the same scheme as the muli lowering: i1 fast path, otherwise compute in unsigned and convert back.
+struct ArithAddIToEmitC : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(arith::AddIOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+
+    Type opTy = op.getType();
+    auto intTy = dyn_cast(opTy);
+    const bool isIndex = isa(opTy);
+    if (!intTy && !isIndex)
+      return rewriter.notifyMatchFailure(op, "expected scalar integer or index type");
+
+    const unsigned bitWidth =
+        intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth);
+
+    Type dstTy = getTypeConverter()->convertType(opTy);
+    if (!dstTy)
+      return failure();
+    // NOTE(review): as in the muli lowering, the i1 path uses `opTy`, not `dstTy`.
+    // i1 add is equivalent to XOR (mod 2 arithmetic).
+    if (bitWidth == 1) {
+      rewriter.replaceOpWithNewOp(op, opTy, adaptor.getLhs(),
+                                  adaptor.getRhs());
+      return success();
+    }
+
+    auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth);
+    Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(),
+                                                    bitWidth);
+    Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(),
+                                                    bitWidth);
+    Value addU = rewriter.create(loc, uTy, lhsU, rhsU);
+    Value result = emitCCast(rewriter, loc, dstTy, addU);
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+// Lowers arith.index_cast to a single op of the converted result type applied to the input (presumably an emitc cast).
+struct ArithCastOPToEmitC : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(arith::IndexCastOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type newTy = getTypeConverter()->convertType(op.getType());
+    if (!newTy)
+      return failure();
+    rewriter.replaceOpWithNewOp(op, newTy, adaptor.getIn());
+    return success();
+  }
+};
+// Lowers arith.subi with the same scheme as the muli/addi lowerings.
+struct ArithSubIToEmitC : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(arith::SubIOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+
+    Type opTy = op.getType();
+    auto intTy = dyn_cast(opTy);
+    const bool isIndex = isa(opTy);
+    if (!intTy && !isIndex)
+      return rewriter.notifyMatchFailure(op, "expected scalar integer or index type");
+
+    const unsigned bitWidth =
+        intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth);
+
+    Type dstTy = getTypeConverter()->convertType(opTy);
+    if (!dstTy)
+      return failure();
+    // NOTE(review): the i1 path uses `opTy`, not `dstTy`.
+    // i1 sub is equivalent to XOR (mod 2 arithmetic).
+    if (bitWidth == 1) {
+      rewriter.replaceOpWithNewOp(op, opTy, adaptor.getLhs(),
+                                  adaptor.getRhs());
+      return success();
+    }
+
+    auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth);
+    Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(),
+                                                    bitWidth);
+    Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(),
+                                                    bitWidth);
+    Value subU = rewriter.create(loc, uTy, lhsU, rhsU);
+    Value result = emitCCast(rewriter, loc, dstTy, subU);
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+// Lowers arith.divsi one-to-one onto an op of the converted result type (op type missing in this copy; presumably an emitc division).
+struct ArithDivSIToEmitC : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(arith::DivSIOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    Type newTy = getTypeConverter()->convertType(op.getType());
+    if (!newTy)
+      return failure();
+    rewriter.replaceOpWithNewOp(op, newTy, adaptor.getLhs(),
+                                adaptor.getRhs());
+    return success();
+  }
+};
+// Lowers arith.remsi one-to-one onto an op of the converted result type (presumably an emitc remainder).
+struct ArithRemSIToEmitC : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(arith::RemSIOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    Type newTy = getTypeConverter()->convertType(op.getType());
+    if (!newTy)
+      return failure();
+    rewriter.replaceOpWithNewOp(op, newTy, adaptor.getLhs(),
+                                adaptor.getRhs());
+    return success();
+  }
+};
+// Lowers arith.trunci; truncation to i1 is special-cased so that only the low bit of the input is kept.
+struct ArithTruncIToEmitC : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(arith::TruncIOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+
+    auto dstIntTy = dyn_cast(op.getType());
+    auto srcIntTy = dyn_cast(op.getIn().getType());
+    if (!dstIntTy || !srcIntTy)
+      return rewriter.notifyMatchFailure(op, "expected scalar integer types");
+
+    Type dstTy = getTypeConverter()->convertType(dstIntTy);
+    if (!dstTy)
+      return failure();
+
+    // to-i1 conversions: Arith wants truncation to the low bit, while C/C++
+    // casts to bool are equivalent to `v != 0`. Implement as `(bool)(v & 1)`.
+    if (dstIntTy.getWidth() == 1) {
+      if (srcIntTy.getWidth() == 1) {
+        rewriter.replaceOp(op, adaptor.getIn());
+        return success();
+      }
+      // Mask in the unsigned source-width type, then convert the masked value to the destination type.
+      auto uSrcTy =
+          getUnsignedIntOpaqueType(rewriter.getContext(), srcIntTy.getWidth());
+      Value inU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getIn(),
+                                                     srcIntTy.getWidth());
+      Value one = makeEmitCIntConstant(rewriter, loc, uSrcTy, 1);
+      Value masked =
+          rewriter.create(loc, uSrcTy, inU, one);
+      Value asBool = emitCCast(rewriter, loc, dstTy, masked);
+      rewriter.replaceOp(op, asBool);
+      return success();
+    }
+    // Every other width: a single op of the destination type applied to the input (presumably a plain cast).
+    rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn());
+    return success();
+  }
+};
+// Lowers arith.constant to an opaque-attribute constant: floats are spelled as C/C++ floating literals, integers via getSExtValue().
+  struct ArithConstantToEmitC : public OpConversionPattern {
+    using OpConversionPattern::OpConversionPattern;
+
+    LogicalResult matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor,
+                                  ConversionPatternRewriter &rewriter) const override {
+      Type newType = getTypeConverter()->convertType(op.getType());
+      if (!newType) return failure();
+
+      // `adaptor.getValue()` may be null if attribute conversion isn't defined.
+      // Use the original attribute as fallback and always cast null-safely.
+      Attribute valueAttr = adaptor.getValue();
+      if (!valueAttr) valueAttr = op.getValue();
+      // NOTE(review): nan/inf spellings skip the ".0" fix-up but still receive the `f` suffix for non-f64 types - confirm the resulting text is a valid C++ expression.
+      if (auto floatAttr = dyn_cast_or_null(valueAttr)) {
+        SmallString<32> valStr;
+        floatAttr.getValue().toString(valStr);
+        llvm::StringRef s(valStr);
+        // Ensure the literal parses as a floating-point constant in C/C++.
+        // `APFloat::toString` may emit "1" for integral values; make it "1.0".
+        const bool hasFloatMarker =
+            s.contains('.') || s.contains('e') || s.contains('E') ||
+            s.contains('p') || s.contains('P') || s.starts_with("0x") ||
+            s.starts_with("0X") || s.starts_with("nan") ||
+            s.starts_with("-nan") || s.starts_with("inf") ||
+            s.starts_with("-inf");
+        if (!hasFloatMarker)
+          valStr.append(".0");
+        // Suffix: keep `f` for f16/f32; omit for f64.
+        if (!floatAttr.getType().isF64())
+          valStr.append("f");
+        auto constAttr = emitc::OpaqueAttr::get(rewriter.getContext(), valStr);
+        rewriter.replaceOpWithNewOp(op, newType, constAttr);
+        return success();
+      }
+      // NOTE(review): getSExtValue() sign-extends, so presumably an i1 `true` prints as -1 and values wider than 64 bits are not representable - confirm.
+      if (auto intAttr = dyn_cast_or_null(valueAttr)) {
+        std::string valStr = std::to_string(intAttr.getValue().getSExtValue());
+        auto constAttr = emitc::OpaqueAttr::get(rewriter.getContext(), valStr);
+        rewriter.replaceOpWithNewOp(op, newType, constAttr);
+        return success();
+      }
+      // Any other attribute kind is not handled by this pattern.
+      return failure();
+    }
+  };
+//===----------------------------------------------------------------------===//
+// pto.mgather lowering -> MGATHER(dst, mem, idx)
+// %dst = pto.mgather %mem, %idx : memref<...>, memref<...> -> memref<...>
+//===----------------------------------------------------------------------===//
+// NOTE(review): despite the header above, the body emits a "TLOAD" call on (dst, mem) and never reads the index operand.
+struct PTOMGatherToMGATHER : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(pto::MGatherOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    Value mem = peelUnrealized(adaptor.getMem());
+    Value dst = peelUnrealized(adaptor.getDst());
+
+    // pto-isa currently has no NPU implementation for MGATHER/MSCATTER.
+    // Fallback to a smoke-friendly lowering to keep compile/run coverage.
+    rewriter.create(
+        op.getLoc(), TypeRange{}, "TLOAD",
+        ArrayAttr{}, ArrayAttr{},
+        ValueRange{dst, mem});
+    // An op without results is erased; otherwise its result is replaced by dst.
+    if (op->getNumResults() == 0) {
+      rewriter.eraseOp(op);
+    } else {
+      rewriter.replaceOp(op, dst);
+    }
+    return success();
+  }
+};
+// Lowers affine.apply whose map has no dims, exactly one symbol, and a `lhs * constant` result into `operand * constant`.
+struct AffineApplyMulConstToEmitC
+    : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(affine::AffineApplyOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    auto map = op.getAffineMap();
+    // Any other map shape is left for other patterns (match failure).
+    if (map.getNumDims() != 0 || map.getNumSymbols() != 1)
+      return failure();
+    // Only result 0 of the map is inspected.
+    auto expr = map.getResult(0);
+    auto bin = dyn_cast(expr);
+    if (!bin || bin.getKind() != AffineExprKind::Mul)
+      return failure();
+
+    auto lhs = bin.getLHS();
+    auto rhs = bin.getRHS();
+    // The cast targets are missing in this copy; by the variable names, lhs must be a symbol and rhs a constant.
+    auto symExpr = dyn_cast(lhs);
+    auto constExpr = dyn_cast(rhs);
+    if (!symExpr || !constExpr)
+      return failure();
+
+    Value inputVal = adaptor.getMapOperands()[0];
+    // The constant is emitted with the same type as the (already converted) operand.
+    std::string valStr = std::to_string(constExpr.getValue());
+    auto cstAttr = emitc::OpaqueAttr::get(rewriter.getContext(), valStr);
+    auto cstOp = rewriter.create(
+        op.getLoc(), inputVal.getType(), cstAttr);
+
+    rewriter.replaceOpWithNewOp(
+        op, inputVal.getType(), inputVal, cstOp);
+
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Kernel inference helpers
+//===----------------------------------------------------------------------===//
+// Coarse kernel classification returned by inferKernelKind.
+enum class KernelKind { VecAdd, Matmul, Unknown };
+// Walks `f` and classifies it by the ops it contains (the isa targets are missing in this copy); Matmul takes precedence over VecAdd.
+static KernelKind inferKernelKind(func::FuncOp f) {
+  bool hasAdd = false;
+  bool hasMM  = false;
+  f.walk([&](Operation *op) {
+    if (isa(op)) hasAdd = true;
+    if (isa(op)) hasMM = true;
+    if (isa(op)) hasMM = true;
+  });
+  if (hasMM) return KernelKind::Matmul;
+  if (hasAdd) return KernelKind::VecAdd;
+  return KernelKind::Unknown;
+}
+// Guesses tile sizes from the first one or two memref.subview ops found in `f`; M, N and K all default to 32.
+static void inferTileMNK(func::FuncOp f, int &M, int &N, int &K) {
+  M = 32; N = 32; K = 32;
+  SmallVector subs;
+  f.walk([&](memref::SubViewOp sv) { subs.push_back(sv); });
+  // Writes d0/d1 only for rank-2 results with a static shape; otherwise the caller's values are kept.
+  auto readShape2D = [&](memref::SubViewOp sv, int &d0, int &d1) {
+    auto resTy = mlir::cast(sv.getResult().getType());
+    if (resTy.getRank() == 2 && resTy.hasStaticShape()) {
+      d0 = (int)resTy.getDimSize(0);
+      d1 = (int)resTy.getDimSize(1);
+    }
+  };
+
+  if (subs.empty()) return;
+  // With a single subview, its shape is taken as M x N and K keeps the default.
+  int a0=32, a1=32;
+  readShape2D(subs[0], a0, a1);
+  M = a0; N = a1;
+  // With two or more: the first is read as M x K and the second supplies N from its dim 1 (b0 is unused; re-reading subs[0] is redundant).
+  if (subs.size() >= 2) {
+    int b0=32, b1=32;
+    readShape2D(subs[0], a0, a1);
+    readShape2D(subs[1], b0, b1);
+    M = a0; K = a1; N = b1;
+  }
+}
+// Rewrites func.func into a new function op with the converted signature and the "__global__ AICORE" specifier.
+struct FuncToEmitC : public OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(func::FuncOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    // Convert the function signature with the type converter.
+    Type convertedTy = getTypeConverter()->convertType(op.getFunctionType());
+    auto funcType = dyn_cast_or_null(convertedTy);
+    if (!funcType)
+      return rewriter.notifyMatchFailure(op, "failed to convert function type");
+    if (funcType.getNumResults() > 1)
+      return rewriter.notifyMatchFailure(
+          op, "EmitC cannot return multiple values");
+
+    // Create the EmitC function with the converted signature.
+    auto emitcFunc = rewriter.create(op.getLoc(), op.getName(),
+                                     funcType);
+    emitcFunc.setSpecifiersAttr(
+        rewriter.getStrArrayAttr({"__global__ AICORE"}));
+
+    // Inline the original body, then convert region/block argument types to
+    // match the converted signature (also covers CFG blocks introduced by
+    // pre-lowering, e.g. scf.while -> cf.br/cf.cond_br).
+    rewriter.inlineRegionBefore(op.getBody(), emitcFunc.getBody(),
+                                emitcFunc.end());
+    // Entry-block argument i is mapped to input i of the converted function type.
+    TypeConverter::SignatureConversion entryConv(op.getNumArguments());
+    for (unsigned i = 0; i < op.getNumArguments(); ++i)
+      entryConv.addInputs(i, funcType.getInput(i));
+
+    if (failed(rewriter.convertRegionTypes(&emitcFunc.getBody(),
+                                           *getTypeConverter(), &entryConv)))
+      return failure();
+
+    // [Compatibility patch] Preserve existing snippets that rely on `T`.
+    {
+      Block &entryBlock = emitcFunc.getBody().front();
+      rewriter.setInsertionPointToStart(&entryBlock);
+      rewriter.create(op.getLoc(), "using T = float;");
+    }
+    // The original op is erased last, after its body has been moved into emitcFunc.
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// SubView lowering to GlobalTensor (keep your existing code)
+//===----------------------------------------------------------------------===//
+// Role a subview plays in a matmul-like computation, as returned by inferSubviewRole.
+enum class Role { A, B, C, Unknown };
+// Classifies `sv` from its users; the first matching user decides. (The dyn_cast targets are missing in this copy of the patch.)
+static Role inferSubviewRole(memref::SubViewOp sv) {
+  for (Operation *u : sv.getResult().getUsers()) {
+    if (auto ld = dyn_cast(u)) {
+      Value ub = ld.getDst();
+      if (!ub) continue;
+      for (Operation *uu : ub.getUsers()) {
+        if (auto mm = dyn_cast(uu)) {
+          if (mm.getLhs() == ub) return Role::A;
+          if (mm.getRhs() == ub) return Role::B;
+        }
+        if (auto mmacc = dyn_cast(uu)) {
+          if (mmacc.getLhs() == ub) return Role::A;
+          if (mmacc.getRhs() == ub) return Role::B;
+        }
+      }
+    }
+    // Role C: the subview itself is the getDst() of a user.
+    if (auto st = dyn_cast(u)) {
+      if (st.getDst() == sv.getResult()) return Role::C;
+    }
+  }
+  return Role::Unknown;
+}
+
+// =============================================================================
+// 4.
MemRef SubView -> Explicit Shape/Stride Construction (Full Implementation) +// ============================================================================= +struct SubviewToEmitCPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + // 辅助函数:尝试从 OpFoldResult 中提取静态整数值 + std::optional extractStaticInt(OpFoldResult ofr) const { + if (auto attr = ofr.dyn_cast()) { + if (auto intAttr = dyn_cast(attr)) + return intAttr.getInt(); + } else { + Value v = ofr.get(); + if (auto cOp = v.getDefiningOp()) { + if (auto iAttr = dyn_cast(cOp.getValue())) + return iAttr.getInt(); + } else if (auto idxOp = v.getDefiningOp()) { + return idxOp.value(); + } + } + return std::nullopt; + } + + LogicalResult matchAndRewrite(memref::SubViewOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + // 获取源 MemRef 类型信息 + auto srcType = mlir::cast(op.getSource().getType()); + int64_t rank = srcType.getRank(); + + auto elemTypeToString = [&](Type elemTy) -> std::string { + if (elemTy.isF16()) + return "half"; + if (elemTy.isBF16()) + return "bfloat16_t"; + if (elemTy.isF32()) + return "float"; + if (elemTy.isF64()) + return "double"; + if (elemTy.isInteger(8)) { + if (elemTy.isSignlessInteger(8) || elemTy.isSignedInteger(8)) + return "int8_t"; + return "uint8_t"; + } + if (elemTy.isInteger(16)) { + if (elemTy.isSignlessInteger(16) || elemTy.isSignedInteger(16)) + return "int16_t"; + return "uint16_t"; + } + if (elemTy.isInteger(32)) { + if (elemTy.isSignlessInteger(32) || elemTy.isSignedInteger(32)) + return "int32_t"; + return "uint32_t"; + } + if (elemTy.isInteger(64)) { + return cast(elemTy).isUnsigned() ? 
"uint64_t" : "int64_t"; + } + return "float"; + }; + + // ------------------------------------------------------------------------- + // Part 1: 指针偏移计算 (Runtime Pointer Arithmetic) + // ------------------------------------------------------------------------- + + // 准备类型: unsigned + Type u32Ty = emitc::OpaqueType::get(ctx, "unsigned"); + + // Helper: 创建 unsigned 常量 + auto mkU32 = [&](int64_t v) -> Value { + return rewriter.create( + loc, u32Ty, emitc::OpaqueAttr::get(ctx, std::to_string(v))); + }; + + // Helper: 将 OpFoldResult 转为 EmitC Value (用于计算) + auto ofrToEmitCValue = [&](OpFoldResult ofr) -> Value { + if (auto v = ofr.dyn_cast()) { + Value rv = rewriter.getRemappedValue(v); + // 如果类型不匹配,插入 Cast + if (rv.getType() != u32Ty) + return rewriter.create(loc, u32Ty, rv).getResult(); + return rv; + } + if (auto attr = ofr.dyn_cast()) { + if (auto ia = dyn_cast(attr)) + return mkU32(ia.getValue().getSExtValue()); + } + return mkU32(0); + }; + + // 1. 获取 Source 的 Strides (支持动态 Stride 收集) + SmallVector sourceStrides; + + if (auto rc = op.getSource().getDefiningOp()) { + sourceStrides = rc.getMixedStrides(); + } else { + SmallVector strideInts; + int64_t offset = ShapedType::kDynamic; + bool useTypeStrides = succeeded(getStridesAndOffset(srcType, strideInts, offset)); + (void)offset; + if (useTypeStrides) { + for (int64_t s : strideInts) { + if (s == ShapedType::kDynamic) { + useTypeStrides = false; + break; + } + } + } + if (useTypeStrides) { + for (int64_t s : strideInts) { + sourceStrides.push_back(rewriter.getIndexAttr(s)); + } + } else { + // Fallback: Compact Layout + auto shape = srcType.getShape(); + int64_t current = 1; + sourceStrides.resize(rank); + for (int i = rank - 1; i >= 0; --i) { + sourceStrides[i] = rewriter.getIndexAttr(current); + if (shape[i] != ShapedType::kDynamic) current *= shape[i]; + } + } + } + + // 2. 
计算运行时 Offset + auto staticOffsets = op.getStaticOffsets(); + auto dynamicOffsets = adaptor.getOffsets(); + int dynOffIdx = 0; + Value totalOffset = mkU32(0); + + for (int i = 0; i < rank; ++i) { + // A. 获取 Offset + Value offVal; + if (staticOffsets[i] == ShapedType::kDynamic) { + Value rawDyn = dynamicOffsets[dynOffIdx++]; + offVal = rewriter.create(loc, u32Ty, rawDyn); + } else { + offVal = mkU32(staticOffsets[i]); + } + + // B. 获取 Stride (用于指针计算) + Value strideVal = mkU32(1); + if (i < (int)sourceStrides.size()) { + strideVal = ofrToEmitCValue(sourceStrides[i]); + } + + // C. 累加 + Value term = rewriter.create(loc, u32Ty, offVal, strideVal); + totalOffset = rewriter.create(loc, u32Ty, totalOffset, term); + } + + // 3. 生成新指针 + // + // NOTE: Some toolchains may materialize kernel pointer params as `void*` even + // when the underlying element type is i16. Pointer arithmetic on `void*` + // is ill-formed in C++, so we explicitly cast to a typed pointer for i16. + Value sourcePtr = adaptor.getSource(); + Value tileCandidate = sourcePtr; + if (auto castOp = sourcePtr.getDefiningOp()) { + tileCandidate = castOp.getOperand(); + } else if (auto uc = + sourcePtr.getDefiningOp()) { + tileCandidate = uc.getOperand(0); + } + if (auto ot = dyn_cast(tileCandidate.getType())) { + auto tyStr = ot.getValue(); + if (tyStr.find("Tile<") != std::string::npos || + tyStr.find("ConvTile<") != std::string::npos) { + std::string elemTok = elemTypeToString(srcType.getElementType()); + std::string qualifier = "__gm__"; + if (auto asAttr = + dyn_cast_or_null(srcType.getMemorySpace())) + qualifier = addrSpaceQualifier(asAttr.getAddressSpace()); + auto rawPtrTy = + emitc::OpaqueType::get(ctx, qualifier + " " + elemTok + "*"); + sourcePtr = + rewriter + .create(loc, rawPtrTy, + "PTOAS__TILE_DATA", ArrayAttr{}, + ArrayAttr{}, ValueRange{tileCandidate}) + .getResult(0); + } + } + Value newPtr; + { + auto resTy = mlir::cast(op.getResult().getType()); + Type elemTy = resTy.getElementType(); + if 
(elemTy.isInteger(16)) { + std::string castElemTypeStr = "int16_t"; + if (cast(elemTy).isUnsigned()) + castElemTypeStr = "uint16_t"; + + std::string qualifier = "__gm__"; + if (Attribute ms = srcType.getMemorySpace()) { + if (auto ptoAttr = dyn_cast(ms)) { + qualifier = addrSpaceQualifier(ptoAttr.getAddressSpace()); + } + } + + auto typedPtrTy = emitc::OpaqueType::get(ctx, qualifier + " " + castElemTypeStr + "*"); + Value typedSourcePtr = rewriter.create(loc, typedPtrTy, sourcePtr); + newPtr = rewriter.create(loc, typedPtrTy, typedSourcePtr, totalOffset); + } else { + newPtr = rewriter.create(loc, sourcePtr.getType(), sourcePtr, totalOffset); + } + } + + + // ------------------------------------------------------------------------- + // Part 2: For non-GM memrefs, keep pointer (no GlobalTensor). + // ------------------------------------------------------------------------- + bool isGlobal = true; + if (auto asAttr = dyn_cast_or_null(srcType.getMemorySpace())) { + auto as = asAttr.getAddressSpace(); + isGlobal = (as == pto::AddressSpace::GM || as == pto::AddressSpace::Zero); + } + if (!isGlobal) { + Type dstTy = getTypeConverter()->convertType(op.getType()); + if (!dstTy) + return failure(); + if (newPtr.getType() != dstTy) + newPtr = rewriter.create(loc, dstTy, newPtr); + rewriter.replaceOp(op, newPtr); + return success(); + } + + // ------------------------------------------------------------------------- + // Part 3: 生成 GlobalTensor 类型 (Shape/Stride Template Generation) + // ------------------------------------------------------------------------- + + // When emitting C++ with `declareVariablesAtTop`, value declarations are + // hoisted before body statements. Avoid introducing local `using` aliases + // for templated types (Shape/Stride/GlobalTensor) because those aliases + // would appear after the hoisted declarations and break compilation + // (`unknown type name`). + // + // Instead, use the fully spelled template types as EmitC opaque types. 
+ + auto resTy = mlir::cast(op.getResult().getType()); + + // 1. 解析具体元素类型 (完整逻辑,不省略) + std::string elemTypeStr = "float"; + Type elemTy = resTy.getElementType(); + + if (elemTy.isF16()) { + elemTypeStr = "half"; + } else if (elemTy.isBF16()) { + elemTypeStr = "bfloat16_t"; + } else if (elemTy.isF32()) { + elemTypeStr = "float"; + } else if (elemTy.isInteger(8)) { + // 区分有符号/无符号通常依赖上下文,但在 EmitC 中 int8_t 比较通用 + if (elemTy.isSignlessInteger(8) || elemTy.isSignedInteger(8)) + elemTypeStr = "int8_t"; + else + elemTypeStr = "uint8_t"; + } else if (elemTy.isInteger(16)) { + if (elemTy.isSignlessInteger(16) || elemTy.isSignedInteger(16)) + elemTypeStr = "int16_t"; + else + elemTypeStr = "uint16_t"; + } else if (elemTy.isInteger(32)) { + if (elemTy.isSignlessInteger(32) || elemTy.isSignedInteger(32)) + elemTypeStr = "int32_t"; + else + elemTypeStr = "uint32_t"; + } else if (elemTy.isInteger(64)) { + elemTypeStr = cast(elemTy).isUnsigned() ? "uint64_t" : "int64_t"; + } + + // 2. 生成 Shape 模板参数,之后会右对齐有效维度并补齐到 5 维(高维填 1) + SmallVector shapeParamsVec; + SmallVector sizeValues; // 每个维度对应的运行时 size(统一为 unsigned) + auto resShape = resTy.getShape(); + auto mixedSizes = op.getMixedSizes(); + sizeValues.reserve(rank); + for (int i = 0; i < resTy.getRank(); ++i) { + if (resShape[i] == ShapedType::kDynamic) { + shapeParamsVec.push_back("-1"); + } else { + shapeParamsVec.push_back(std::to_string(resShape[i])); + } + // size 值:优先从 op.getMixedSizes() 取(可动态/静态),否则退化为类型里的静态 shape。 + if (i < (int)mixedSizes.size()) + sizeValues.push_back(ofrToEmitCValue(mixedSizes[i])); + else + sizeValues.push_back( + mkU32(resShape[i] == ShapedType::kDynamic ? 1 : resShape[i])); + } + + // 3. 
生成 Stride 模板参数 + 运行时 stride 值(考虑 subview step) + SmallVector dummyStrideVec; + SmallVector strideValues; // 每个维度对应的运行时 stride(统一为 unsigned) + dummyStrideVec.reserve(rank); + strideValues.reserve(rank); + auto subViewSteps = op.getMixedStrides(); + for (int i = 0; i < rank; ++i) { + OpFoldResult srcStrideOfr = + (i < (int)sourceStrides.size()) ? sourceStrides[i] + : rewriter.getIndexAttr(1); + OpFoldResult stepOfr = (i < (int)subViewSteps.size()) + ? subViewSteps[i] + : rewriter.getIndexAttr(1); + + auto srcStatic = extractStaticInt(srcStrideOfr); + auto stepStatic = extractStaticInt(stepOfr); + if (srcStatic && stepStatic) { + int64_t finalStride = (*srcStatic) * (*stepStatic); + dummyStrideVec.push_back(std::to_string(finalStride)); + strideValues.push_back(mkU32(finalStride)); + continue; + } + + dummyStrideVec.push_back("-1"); + Value srcV = ofrToEmitCValue(srcStrideOfr); + Value stepV = ofrToEmitCValue(stepOfr); + // 尽量避免乘以 1 生成冗余指令 + if (stepStatic && *stepStatic == 1) + strideValues.push_back(srcV); + else if (srcStatic && *srcStatic == 1) + strideValues.push_back(stepV); + else + strideValues.push_back( + rewriter.create(loc, u32Ty, srcV, stepV)); + } + + // 3.1 右对齐到 5 维:shape 补 1;已有维度继承原 stride; + // 被补出来的高维按“紧密升维”规则连续推导:stride[i] = shape[i+1] * stride[i+1] + SmallVector finalShape(5, "1"); + SmallVector finalStride(5, "1"); + Value oneU32 = mkU32(1); + SmallVector finalShapeValues(5, oneU32); + SmallVector finalStrideValues(5, oneU32); + int shift = 5 - rank; + + // 先放入原始 shape/stride(保持用户提供的值) + for (int i = 0; i < rank && i < 5; ++i) { + finalShape[shift + i] = shapeParamsVec[i]; + finalStride[shift + i] = dummyStrideVec[i]; + finalShapeValues[shift + i] = sizeValues[i]; + finalStrideValues[shift + i] = strideValues[i]; + } + + auto mulOrDyn = [](const std::string &a, const std::string &b) -> std::string { + if (a == "-1" || b == "-1") + return "-1"; + int64_t va = 1, vb = 1; + (void)llvm::to_integer(a, va); + (void)llvm::to_integer(b, vb); + return 
std::to_string(va * vb); + }; + + // 从低维到高维倒推补齐 stride(仅对补出来的前置维度生效) + for (int i = 3; i >= 0; --i) { + // 如果该维已由原始 rank 覆盖,则保持原值 + if (i >= shift) + continue; + // 补维:shape 已经是 1,stride = shape[i+1] * stride[i+1](或动态) + finalStride[i] = mulOrDyn(finalShape[i + 1], finalStride[i + 1]); + if (finalStride[i] != "-1") { + int64_t si = 1; + (void)llvm::to_integer(finalStride[i], si); + finalStrideValues[i] = mkU32(si); + continue; + } + // 动态推导:stride[i] = shape[i+1] * stride[i+1] + if (finalShape[i + 1] == "1") { + finalStrideValues[i] = finalStrideValues[i + 1]; + } else { + finalStrideValues[i] = rewriter.create( + loc, u32Ty, finalShapeValues[i + 1], finalStrideValues[i + 1]); + } + } + + auto joinParams = [](llvm::ArrayRef vec) { + std::string out; + for (size_t i = 0; i < vec.size(); ++i) { + if (i > 0) out += ", "; + out += vec[i]; + } + return out; + }; + + std::string shapeParams = joinParams(finalShape); + std::string strideParams = joinParams(finalStride); + + // Spelled-out C++ types. + std::string shapeCppType = "pto::Shape<" + shapeParams + ">"; + std::string strideCppType = "pto::Stride<" + strideParams + ">"; + + // 3.0 Layout: prefer the attribute from InferPTOLayout; only fall back to + // local inference when the pass is disabled. 
+ std::string layoutEnum = "pto::Layout::ND"; + if (auto layout = resolveLayoutForGlobalTensor(op, op.getSource())) { + layoutEnum = layoutToEmitCString(*layout); + } else { + auto strToInt = [](const std::string &s, int64_t &out) -> bool { + return s != "-1" && llvm::to_integer(s, out); + }; + SmallVector shapeInt(5, -1), strideInt(5, -1); + bool allStatic = true; + for (int i = 0; i < 5; ++i) { + if (!strToInt(finalShape[i], shapeInt[i]) || + !strToInt(finalStride[i], strideInt[i])) + allStatic = false; + } + + int layoutTag = 0; // ND + auto elemBytes = 4; // default float + if (elemTypeStr.find("half") != std::string::npos || + elemTypeStr.find("f16") != std::string::npos || + elemTypeStr.find("bf16") != std::string::npos) + elemBytes = 2; + else if (elemTypeStr.find("double") != std::string::npos || + elemTypeStr.find("f64") != std::string::npos) + elemBytes = 8; + + if (allStatic) { + if (shapeInt[2] == 16 && shapeInt[2] * shapeInt[3] * elemBytes == 512 && + strideInt[4] == 1 && strideInt[3] == shapeInt[4]) { + layoutTag = 2; // NZ + } else { + bool isRow = strideInt[4] == 1; + for (int i = 3; i >= 0; --i) + isRow &= (strideInt[i] == strideInt[i + 1] * shapeInt[i + 1]); + bool isCol = strideInt[0] == 1; + for (int i = 0; i < 4; ++i) + isCol &= (strideInt[i + 1] == strideInt[i] * shapeInt[i]); + if (isCol) + layoutTag = 1; // DN + else + layoutTag = isRow ? 0 : 0; // fallback ND + } + } + + if (layoutTag == 1) + layoutEnum = "pto::Layout::DN"; + else if (layoutTag == 2) + layoutEnum = "pto::Layout::NZ"; + } + // GlobalTensor takes a Layout non-type template parameter; directly use the + // enum constant. + + + // ------------------------------------------------------------------------- + // Part 3: 显式对象实例化 (Explicit Object Instantiation) + // ------------------------------------------------------------------------- + + // A. Instantiate Shape object. 
+ auto shapeTypeOpaque = emitc::OpaqueType::get(ctx, shapeCppType); + SmallVector shapeArgs; + // 从 adaptor.getSizes() 获取 subview 的所有 dynamic sizes + for (Value dynSize : adaptor.getSizes()) { + shapeArgs.push_back(dynSize); + } + + auto shapeInstOp = rewriter.create( + loc, + shapeTypeOpaque, // 返回类型 + shapeCppType, // 调用的“函数名”即类名构造函数 + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange(shapeArgs) + ); + + // B. Instantiate Stride object. + auto strideTypeOpaque = emitc::OpaqueType::get(ctx, strideCppType); + // 仅传入动态 stride 维度对应的值,匹配 pto::Stride 的 N-parameter ctor(并满足其 static_assert)。 + SmallVector strideCtorArgs; + strideCtorArgs.reserve(5); + for (int i = 0; i < 5; ++i) { + if (finalStride[i] == "-1") + strideCtorArgs.push_back(finalStrideValues[i]); + } + auto strideInstOp = rewriter.create( + loc, strideTypeOpaque, strideCppType, + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange(strideCtorArgs)); + + // C. Instantiate GlobalTensor object (ptr + shape + stride). 
+ std::string gtCppType = "GlobalTensor<" + elemTypeStr + ", " + shapeCppType + + ", " + strideCppType + ", " + layoutEnum + ">"; + auto gtType = emitc::OpaqueType::get(ctx, gtCppType); + + // Prepare the constructor arguments: [ptr, shape_instance, stride_instance] + SmallVector gtConstructorArgs; + gtConstructorArgs.push_back(newPtr); + gtConstructorArgs.push_back(shapeInstOp.getResult(0)); // SSA value of shape_inst + gtConstructorArgs.push_back(strideInstOp.getResult(0)); // SSA value of stride_inst + + rewriter.replaceOpWithNewOp( + op, + gtType, + gtCppType, + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange(gtConstructorArgs) + ); + + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// Helper: build GlobalTensor from a static MemRef (for TLOAD/TSTORE) +//===----------------------------------------------------------------------===// + +// Returns the C++ element type name used when spelling GlobalTensor<...> for +// `elemTy`. Signless and signed integers map to intN_t, unsigned ones to +// uintN_t; any type not listed below falls back to "float". +static std::string getElemTypeStringForGT(Type elemTy) { + if (elemTy.isF16()) return "half"; + if (elemTy.isBF16()) return "bfloat16_t"; + if (elemTy.isF32()) return "float"; + if (elemTy.isF64()) return "double"; + if (elemTy.isInteger(8)) { + if (elemTy.isSignlessInteger(8) || elemTy.isSignedInteger(8)) + return "int8_t"; + return "uint8_t"; + } + if (elemTy.isInteger(16)) { + if (elemTy.isSignlessInteger(16) || elemTy.isSignedInteger(16)) + return "int16_t"; + return "uint16_t"; + } + if (elemTy.isInteger(32)) { + if (elemTy.isSignlessInteger(32) || elemTy.isSignedInteger(32)) + return "int32_t"; + return "uint32_t"; + } + if (elemTy.isInteger(64)) { + return cast(elemTy).isUnsigned() ? "uint64_t" : "int64_t"; + } + return "float"; +} + +// Builds a GlobalTensor value that wraps `basePtr` for the memref type `mrTy`. 
+// Returns a null Value when the shape, a stride or the offset is dynamic, or +// when the rank exceeds 5, so the caller can keep using the raw pointer. +// On success it emits verbatim `using` aliases for the Shape/Stride/GlobalTensor +// types plus a constexpr layout constant, all named after the address of +// `anchor`, and then constructs GlobalTensor(ptr, shape, stride). +static Value buildGlobalTensorFromMemref(ConversionPatternRewriter &rewriter, + Location loc, Value basePtr, + MemRefType mrTy, + Operation *anchor) { + auto *ctx = rewriter.getContext(); + + // Only handle fully static shapes/strides for now. 
+ auto shape = mrTy.getShape(); + for (int64_t dim : shape) { + if (dim == ShapedType::kDynamic) + return Value(); + } + // Shapes/strides are right-aligned into 5 slots below; a rank above 5 would + // index before the start of those vectors, so treat it as unsupported before + // anything is emitted. + if (shape.size() > 5) + return Value(); + + SmallVector strides; + int64_t offset = 0; + if (failed(getStridesAndOffset(mrTy, strides, offset))) { + // Fallback: compact row-major + strides.resize(shape.size()); + int64_t s = 1; + for (int i = (int)shape.size() - 1; i >= 0; --i) { + strides[i] = s; + s *= shape[i]; + } + offset = 0; + } + if (offset == ShapedType::kDynamic) + return Value(); + for (int64_t s : strides) { + if (s == ShapedType::kDynamic) + return Value(); + } + + // Apply static base offset if needed. + Value ptr = basePtr; + if (offset != 0) { + Type u32Ty = emitc::OpaqueType::get(ctx, "unsigned"); + auto offVal = rewriter.create( + loc, u32Ty, emitc::OpaqueAttr::get(ctx, std::to_string(offset))); + ptr = rewriter.create(loc, basePtr.getType(), basePtr, + offVal); + } + + // The anchor op's address is used as a suffix so the emitted alias names + // differ between anchor ops. + std::string suffix = "_" + std::to_string(reinterpret_cast(anchor)); + std::string shapeTypeName = "GTShape" + suffix; + std::string strideTypeName = "GTStride" + suffix; + std::string gtTypeName = "GT" + suffix; + + std::string elemTypeStr = getElemTypeStringForGT(mrTy.getElementType()); + + SmallVector shapeParamsVec; + SmallVector strideParamsVec; + for (int i = 0, e = (int)shape.size(); i < e; ++i) { + shapeParamsVec.push_back(std::to_string(shape[i])); + strideParamsVec.push_back(std::to_string(strides[i])); + } + + // Right-align to 5D (pad leading dims with 1). 
+ SmallVector finalShape(5, "1"); + SmallVector finalStride(5, "1"); + int rank = (int)shape.size(); + int shift = 5 - rank; + for (int i = 0; i < rank && i < 5; ++i) { + finalShape[shift + i] = shapeParamsVec[i]; + finalStride[shift + i] = strideParamsVec[i]; + } + // Multiply two decimal strings; "-1" (dynamic) on either side stays "-1". + auto mulOrDyn = [](const std::string &a, const std::string &b) -> std::string { + if (a == "-1" || b == "-1") + return "-1"; + int64_t va = 1, vb = 1; + (void)llvm::to_integer(a, va); + (void)llvm::to_integer(b, vb); + return std::to_string(va * vb); + }; + // Padded leading dims get stride[i] = shape[i+1] * stride[i+1]; dims that + // came from the memref keep their own stride. + for (int i = 3; i >= 0; --i) { + if (i >= shift) + continue; + finalStride[i] = mulOrDyn(finalShape[i + 1], finalStride[i + 1]); + } + + auto joinParams = [](llvm::ArrayRef vec) { + std::string out; + for (size_t i = 0; i < vec.size(); ++i) { + if (i > 0) out += ", "; + out += vec[i]; + } + return out; + }; + + std::string shapeParams = joinParams(finalShape); + std::string strideParams = joinParams(finalStride); + + rewriter.create( + loc, "using " + shapeTypeName + " = pto::Shape<" + shapeParams + ">;"); + rewriter.create( + loc, "using " + strideTypeName + " = pto::Stride<" + strideParams + ">;"); + + // Layout: prefer the attribute from InferPTOLayout; only fall back to local + // inference when the pass is disabled. 
+ std::string layoutEnum = "pto::Layout::ND"; + bool hasLayoutAttr = false; + if (auto layout = resolveLayoutForGlobalTensor(anchor, basePtr)) { + layoutEnum = layoutToEmitCString(*layout); + hasLayoutAttr = true; + } + if (!hasLayoutAttr) { + SmallVector shapeInt(5, -1), strideInt(5, -1); + for (int i = 0; i < 5; ++i) { + (void)llvm::to_integer(finalShape[i], shapeInt[i]); + (void)llvm::to_integer(finalStride[i], strideInt[i]); + } + int layoutTag = 0; // ND + // Element size in bytes, keyed on the exact names getElemTypeStringForGT + // returns so that bfloat16_t and the 8/16/64-bit integer types get their + // real width in the 512-byte NZ test below. + int elemBytes = 4; // float, int32_t, uint32_t + if (elemTypeStr == "int8_t" || elemTypeStr == "uint8_t") + elemBytes = 1; + else if (elemTypeStr == "half" || elemTypeStr == "bfloat16_t" || + elemTypeStr == "int16_t" || elemTypeStr == "uint16_t") + elemBytes = 2; + else if (elemTypeStr == "double" || elemTypeStr == "int64_t" || + elemTypeStr == "uint64_t") + elemBytes = 8; + if (shapeInt[2] == 16 && shapeInt[2] * shapeInt[3] * elemBytes == 512 && + strideInt[4] == 1 && strideInt[3] == shapeInt[4]) { + layoutTag = 2; // NZ + } else { + bool isRow = strideInt[4] == 1; + for (int i = 3; i >= 0; --i) + isRow &= (strideInt[i] == strideInt[i + 1] * shapeInt[i + 1]); + bool isCol = strideInt[0] == 1; + for (int i = 0; i < 4; ++i) + isCol &= (strideInt[i + 1] == strideInt[i] * shapeInt[i]); + if (isCol) layoutTag = 1; // DN + else layoutTag = isRow ? 
0 : 0; // fallback ND + } + if (layoutTag == 1) + layoutEnum = "pto::Layout::DN"; + else if (layoutTag == 2) + layoutEnum = "pto::Layout::NZ"; + } + std::string layoutConstName = gtTypeName + "_layout"; + rewriter.create( + loc, "constexpr pto::Layout " + layoutConstName + " = " + layoutEnum + ";"); + + auto shapeTypeOpaque = emitc::OpaqueType::get(ctx, shapeTypeName); + auto strideTypeOpaque = emitc::OpaqueType::get(ctx, strideTypeName); + auto shapeInstOp = rewriter.create( + loc, shapeTypeOpaque, shapeTypeName, ArrayAttr{}, ArrayAttr{}, + ValueRange{}); + auto strideInstOp = rewriter.create( + loc, strideTypeOpaque, strideTypeName, ArrayAttr{}, ArrayAttr{}, + ValueRange{}); + + rewriter.create( + loc, "using " + gtTypeName + " = GlobalTensor<" + elemTypeStr + ", " + + shapeTypeName + ", " + strideTypeName + ", " + + layoutConstName + ">;"); + auto gtType = emitc::OpaqueType::get(ctx, gtTypeName); + + SmallVector gtArgs; + gtArgs.push_back(ptr); + gtArgs.push_back(shapeInstOp.getResult(0)); + gtArgs.push_back(strideInstOp.getResult(0)); + + auto gtInst = rewriter.create( + loc, gtType, gtTypeName, ArrayAttr{}, ArrayAttr{}, ValueRange(gtArgs)); + + return gtInst.getResult(0); +} + +//===----------------------------------------------------------------------===// +// pto.pointer_cast lowering +//===----------------------------------------------------------------------=== +struct PointerCastConversion : public OpConversionPattern { + static bool getIndexConst(Value v, int64_t &out) { + if (auto cst = v.getDefiningOp()) { + if (auto ia = dyn_cast(cst.getValue())) { + out = ia.getValue().getSExtValue(); + return true; + } + } + return false; + } + + using OpConversionPattern::OpConversionPattern; + + enum class TileRole { Vec, Mat, Left, Right, Acc, Bias, Scaling }; + + static void collectUserOpsThroughCasts(Value v, SmallVectorImpl &out) { + for (Operation *u : v.getUsers()) { + if (auto castOp = dyn_cast(u)) { + for (Value r : castOp.getResults()) + 
collectUserOpsThroughCasts(r, out); + continue; + } + out.push_back(u); + } + } + + // Follows defining cast ops upward (taking operand 0 each time) and returns + // the value underneath them. + static Value peelUnrealized(Value v) { + while (auto castOp = v.getDefiningOp()) { + v = castOp.getOperand(0); + } + return v; + } + + // Picks the tile role for the cast result: a PTO address space on the result + // memref decides first; otherwise the role is inferred from which operand + // (dst / acc_in / lhs / rhs) of a user op the value feeds; Vec is the default. + static TileRole inferRole(pto::PointerCastOp op) { + // 1. Check the AddressSpace first + if (auto memRefTy = dyn_cast(op.getType())) { + Attribute memorySpace = memRefTy.getMemorySpace(); + if (auto ptoAttr = dyn_cast_or_null(memorySpace)) { + switch (ptoAttr.getAddressSpace()) { + case pto::AddressSpace::LEFT: return TileRole::Left; + case pto::AddressSpace::RIGHT: return TileRole::Right; + case pto::AddressSpace::ACC: return TileRole::Acc; + case pto::AddressSpace::BIAS: return TileRole::Bias; + case pto::AddressSpace::MAT: return TileRole::Mat; + case pto::AddressSpace::SCALING: return TileRole::Scaling; + default: break; + } + } + } + + // 2. 
Infer from usage (fallback) + SmallVector users; + collectUserOpsThroughCasts(op.getResult(), users); + + for (Operation *user : users) { + if (auto mm = dyn_cast(user)) { + if (mm.getDst() && peelUnrealized(mm.getDst()) == op.getResult()) return TileRole::Acc; + if (peelUnrealized(mm.getLhs()) == op.getResult()) return TileRole::Left; + if (peelUnrealized(mm.getRhs()) == op.getResult()) return TileRole::Right; + } + if (auto mmacc = dyn_cast(user)) { + if (mmacc.getDst() && peelUnrealized(mmacc.getDst()) == op.getResult()) return TileRole::Acc; + if (peelUnrealized(mmacc.getAccIn()) == op.getResult()) return TileRole::Acc; + if (peelUnrealized(mmacc.getLhs()) == op.getResult()) return TileRole::Left; + if (peelUnrealized(mmacc.getRhs()) == op.getResult()) return TileRole::Right; + } + } + + return TileRole::Vec; + } + + // [Added] Helper: check whether a Value originates from arith.constant + static bool isConstant(Value v, int64_t &outVal) { + if (!v) return false; + if (auto cst = v.getDefiningOp()) { + if (auto attr = dyn_cast(cst.getValue())) { + outVal = attr.getInt(); + return true; + } + } + return false; + } + + // Emits a `Tile<...>` value for the cast result and binds it to the first + // address operand with a TASSIGN call. + LogicalResult 
matchAndRewrite(pto::PointerCastOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + auto selfType = mlir::cast(op.getType()); + ArrayRef shape = selfType.getShape(); + Type elemType = selfType.getElementType(); + + // 1. Infer the tile role + TileRole role = inferRole(op); + + // 2. Build the type strings (elemTypeStr, dimStr) + std::string elemTypeStr = "T"; + if (elemType.isF16()) elemTypeStr = "half"; + else if (elemType.isBF16()) elemTypeStr = "bfloat16_t"; + else if (elemType.isF32()) elemTypeStr = "float"; + else if (elemType.isInteger(8)) elemTypeStr = cast(elemType).isUnsigned() ? "uint8_t" : "int8_t"; + else if (elemType.isInteger(16)) elemTypeStr = cast(elemType).isUnsigned() ? "uint16_t" : "int16_t"; + else if (elemType.isInteger(32)) elemTypeStr = cast(elemType).isUnsigned() ? "uint32_t" : "int32_t"; + else if (elemType.isInteger(64)) elemTypeStr = cast(elemType).isUnsigned() ? "uint64_t" : "int64_t"; + + std::string dimStr; + // A dynamic dimension is spelled as the given symbol name instead of a number. + auto dimToString = [](int64_t dim, const char* symbol) -> std::string { + return (dim == ShapedType::kDynamic) ? std::string(symbol) : std::to_string(dim); + }; + + // NOTE(review): shape[0]/shape[1] are read without a rank check here; + // presumably the op verifier guarantees a rank-2 result; confirm. + if (role == TileRole::Left) dimStr = dimToString(shape[0], "M") + ", " + dimToString(shape[1], "K"); + else if (role == TileRole::Right) dimStr = dimToString(shape[0], "K") + ", " + dimToString(shape[1], "N"); + else if (role == TileRole::Bias) dimStr = "1, " + dimToString(shape[1], "N"); + else dimStr = dimToString(shape[0], "M") + ", " + dimToString(shape[1], "N"); + + // 3. 
Role Token + const char *roleTok = "TileType::Vec"; + switch (role) { + case TileRole::Left: roleTok = "TileType::Left"; break; + case TileRole::Right: roleTok = "TileType::Right"; break; + case TileRole::Acc: roleTok = "TileType::Acc"; break; + case TileRole::Bias: roleTok = "TileType::Bias"; break; + case TileRole::Mat: roleTok = "TileType::Mat"; break; + case TileRole::Vec: roleTok = "TileType::Vec"; break; + case TileRole::Scaling: roleTok = "TileType::Scaling"; break; + } + + // 4. Config & Layout (support BLayoutAttr/SLayoutAttr/PadValueAttr after namespace change) + std::string layoutParams = "BLayout::RowMajor"; + std::string extraParams = ""; + if (auto configOpt = op.getConfig()) { + auto config = *configOpt; + int32_t blVal = 0; + if (auto attr = dyn_cast(config.getBLayout())) + blVal = static_cast(attr.getValue()); + + if (blVal == 1) layoutParams = "BLayout::ColMajor"; + + int32_t slVal = 0; + if (auto attr = dyn_cast(config.getSLayout())) + slVal = static_cast(attr.getValue()); + + std::string slStr = (slVal == 1) ? "SLayout::RowMajor" : (slVal == 2) ? 
"SLayout::ColMajor" : "SLayout::NoneBox"; + + int32_t frVal = 0; + if (auto attr = dyn_cast(config.getSFractalSize())) frVal = attr.getInt(); + + int32_t padVal = 0; + if (auto attr = dyn_cast(config.getPad())) + padVal = static_cast(attr.getValue()); + + std::string padStr = "PadValue::Null"; + switch (padVal) { + case 1: padStr = "PadValue::Zero"; break; + case 2: padStr = "PadValue::Max"; break; + case 3: padStr = "PadValue::Min"; break; + } + + if (!slStr.empty()) { + extraParams += ", " + slStr + ", " + std::to_string(frVal) + ", " + padStr; + } + } + + // [Core change] Valid-dims handling (supports a mix of static and dynamic dims) + std::string vrowTok, vcolTok; + bool useConstructor = false; + + // Flags that explicitly record which dimension is dynamic + bool rowIsDynamic = false; + bool colIsDynamic = false; + + SmallVector constructorArgs; + + Value vRow = op.getValidRow(); + Value vCol = op.getValidCol(); + Value vRowEmitC = adaptor.getValidRow(); + Value vColEmitC = adaptor.getValidCol(); + + int64_t cRow, cCol; + + // --- Row logic --- + if (vRow && isConstant(vRow, cRow)) { + // Case A: static constant (e.g., 32) + vrowTok = std::to_string(cRow); + } else if (vRow) { + // Case B: dynamic value (e.g., %arg0) + vrowTok = "-1"; + rowIsDynamic = true; // mark as dynamic + useConstructor = true; + } else { + // Case C: default to the static shape + vrowTok = std::to_string(shape[0]); + } + + // --- Col logic --- + if (vCol && isConstant(vCol, cCol)) { + // Case A: static constant + vcolTok = std::to_string(cCol); + } else if (vCol) { + // Case B: dynamic value + vcolTok = "-1"; + colIsDynamic = true; // mark as dynamic + useConstructor = true; + } else { + // Case C: default to the static shape + vcolTok = std::to_string(shape[1]); + } + + // --- Collect constructor arguments --- + // [Fix] Only collect values for the dimensions marked as dynamic + if (useConstructor) { + if (rowIsDynamic && vRowEmitC) constructorArgs.push_back(vRowEmitC); + if (colIsDynamic && 
vColEmitC) constructorArgs.push_back(vColEmitC); + } + + // 5. 
Build the Tile type string + std::string tileTypeStr = + std::string("Tile<") + roleTok + ", " + elemTypeStr + ", " + dimStr + ", " + + layoutParams + ", " + vrowTok + ", " + vcolTok + extraParams + ">"; + + auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); + Value resultValue; + + if (useConstructor) { + // Use a CallOpaqueOp to emit the constructor call (Tile v = Tile(...)) + auto ctorOp = rewriter.create( + loc, + tileType, // Result Type + tileTypeStr, // Callee Name (the class name) + ArrayAttr{}, // args + ArrayAttr{}, // template_args + ValueRange(constructorArgs) // operands + ); + resultValue = ctorOp.getResult(0); + } else { + // Static case (Tile v;) + auto varOp = rewriter.create( + loc, + tileType, + emitc::OpaqueAttr::get(ctx, "") + ); + resultValue = varOp.getResult(); + } + + // TASSIGN: pto-isa expects an integral address. + Value addr = adaptor.getAddrs()[0]; + if (isa(addr.getType()) || + (isa(addr.getType()) && + cast(addr.getType()).getValue().ends_with("*"))) { + auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); + auto rcU64 = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "uint64_t")}); + addr = rewriter.create( + loc, u64Ty, "reinterpret_cast", + /*args=*/ArrayAttr{}, /*templateArgs=*/rcU64, + /*operands=*/ValueRange{addr}) + .getResult(0); + } + + rewriter.create( + loc, TypeRange{}, "TASSIGN", + ArrayAttr{}, ArrayAttr{}, + ValueRange{resultValue, addr}); + + rewriter.replaceOp(op, resultValue); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.load_dps / pto.store_dps lowering (FIX: keep optional result) +//===----------------------------------------------------------------------=== + +// Lowers pto.tload to a TLOAD(dst, src) call. When the source is a memref in +// global memory and buildGlobalTensorFromMemref returns a value, that +// GlobalTensor is passed instead of the raw source. 
+struct PTOTLoadToTLOAD : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TLoadOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!op.getDst()) + return rewriter.notifyMatchFailure(op, "expected outs(dst) on pto.tload"); + 
+ Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value srcArg = src; + if (auto srcMrTy = dyn_cast(op.getSrc().getType())) { + // A memref with no PTO address-space attribute is treated as global. + bool isGlobal = true; + if (auto asAttr = dyn_cast_or_null(srcMrTy.getMemorySpace())) { + auto as = asAttr.getAddressSpace(); + isGlobal = (as == pto::AddressSpace::GM || as == pto::AddressSpace::Zero); + } + if (isGlobal) { + if (Value gt = buildGlobalTensorFromMemref(rewriter, op.getLoc(), src, srcMrTy, + op.getOperation())) + srcArg = gt; + } + } + + rewriter.create( + op.getLoc(), TypeRange{}, "TLOAD", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, srcArg}); + + // Keep the optional result: forward dst when the op has one, else erase. + if (op->getNumResults() == 1) { + rewriter.replaceOp(op, dst); + } else { + rewriter.eraseOp(op); + } + return success(); + } +}; + +// Lowers pto.tstore to a TSTORE(dst, src) call. When the destination is a +// memref in global memory and buildGlobalTensorFromMemref returns a value, +// that GlobalTensor is passed instead of the raw destination. +struct PTOTStoreToTSTORE : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TStoreOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!op.getDst()) + return rewriter.notifyMatchFailure(op, "expected outs(dst) on pto.tstore"); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value dstArg = dst; + if (auto dstMrTy = dyn_cast(op.getDst().getType())) { + // A memref with no PTO address-space attribute is treated as global. 
+ bool isGlobal = true; + if (auto asAttr = dyn_cast_or_null(dstMrTy.getMemorySpace())) { + auto as = asAttr.getAddressSpace(); + isGlobal = (as == pto::AddressSpace::GM || as == pto::AddressSpace::Zero); + } + if (isGlobal) { + if (Value gt = buildGlobalTensorFromMemref(rewriter, op.getLoc(), dst, dstMrTy, + op.getOperation())) + dstArg = gt; + } + } + + rewriter.create( + op.getLoc(), TypeRange{}, "TSTORE", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dstArg, src}); + + // Keep the optional result: forward dst when the op has one, else erase. + if (op->getNumResults() == 1) { + rewriter.replaceOp(op, dst); + } else { + rewriter.eraseOp(op); + } + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.matmul_dps 
lowering (Simplified: No internal copy/sync) +//===----------------------------------------------------------------------===// +struct PTOTMatmulToTMATMUL : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMatmulOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // 1. Fetch the operands (peeling off casts) + Value lhs = peelUnrealized(adaptor.getLhs()); // A (Left) + Value rhs = peelUnrealized(adaptor.getRhs()); // B (Right) + Value dst = peelUnrealized(adaptor.getDst()); // C (Acc) + + // 2. Emit the call TMATMUL(dst, lhs, rhs) directly + // Assumes the inputs are already in their corresponding L0 buffers + rewriter.create( + op.getLoc(), TypeRange{}, "TMATMUL", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, lhs, rhs}); + + // 3. Replace or erase the op + if (op->getNumResults() == 1) { + rewriter.replaceOp(op, dst); + } else { + rewriter.eraseOp(op); + } + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.tgemv lowering +//===----------------------------------------------------------------------===// +struct PTOTGemvToTGEMV : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TGemvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // 1. Fetch the operands (peeling off casts) + Value lhs = peelUnrealized(adaptor.getLhs()); // A (Matrix) + Value rhs = peelUnrealized(adaptor.getRhs()); // B (Vector) + Value dst = peelUnrealized(adaptor.getDst()); // C (Result) + + // 2. Emit the call TGEMV(dst, lhs, rhs) directly + rewriter.create( + op.getLoc(), TypeRange{}, "TGEMV", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, lhs, rhs}); + + // 3. 
Replace or erase the op + if (op->getNumResults() == 1) { + rewriter.replaceOp(op, dst); + } else { + rewriter.eraseOp(op); + } + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.tgemv.acc lowering +//===----------------------------------------------------------------------===// +struct PTOTGemvAccToTGEMVACC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TGemvAccOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!op.getDst()) + return rewriter.notifyMatchFailure(op, "expected outs(dst) for pto.tgemv.acc"); + + // 1. Fetch the operands + Value accIn = peelUnrealized(adaptor.getAccIn()); // AccOld + Value lhs = peelUnrealized(adaptor.getLhs()); // A (Matrix) + Value rhs = peelUnrealized(adaptor.getRhs()); // B (Vector) + Value dst = peelUnrealized(adaptor.getDst()); // AccNew + + // 2. Emit the call TGEMV_ACC(dst, accIn, lhs, rhs) directly + rewriter.create( + op.getLoc(), TypeRange{}, "TGEMV_ACC", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, accIn, lhs, rhs}); + + // 3. Replace or erase the op + if (op->getNumResults() == 1) { + rewriter.replaceOp(op, dst); + } else { + rewriter.eraseOp(op); + } + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.matmul_acc_dps lowering (Simplified: No internal copy/sync) +//===----------------------------------------------------------------------===// +struct PTOTMatmulAccToTMATMULACC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMatmulAccOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!op.getDst()) + return rewriter.notifyMatchFailure(op, "expected outs(dst) for pto.tmatmul.acc"); + + // 1. 
Fetch the operands + Value accIn = peelUnrealized(adaptor.getAccIn()); // AccOld + Value lhs = peelUnrealized(adaptor.getLhs()); // A (Left) + Value rhs = peelUnrealized(adaptor.getRhs()); // B (Right) + Value dst = peelUnrealized(adaptor.getDst()); // AccNew + + // 2. Emit the call TMATMUL_ACC(dst, accIn, lhs, rhs) directly + rewriter.create( + op.getLoc(), TypeRange{}, "TMATMUL_ACC", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, accIn, lhs, rhs}); + + // 3. Replace or erase the op + if (op->getNumResults() == 1) { + rewriter.replaceOp(op, dst); + } else { + rewriter.eraseOp(op); + } + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// Return lowering +//===----------------------------------------------------------------------=== + +// Lowers func.return with zero or one operand; a return of several values is +// reported as a match failure. +struct ReturnToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(func::ReturnOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto vals = adaptor.getOperands(); + if (vals.empty()) { + rewriter.replaceOpWithNewOp(op, Value{}); + return success(); + } + if (vals.size() == 1) { + rewriter.replaceOpWithNewOp(op, vals[0]); + return success(); + } + return rewriter.notifyMatchFailure(op, "EmitC cannot return multiple values"); + } +}; + +//===----------------------------------------------------------------------===// +// Sync lowering +//===----------------------------------------------------------------------=== + +// Spells a pto::PIPE enumerator as its identifier; values not listed here +// fall back to "PIPE_ALL". 
+static std::string getPipeName(pto::PIPE pipe) { + switch (pipe) { + case pto::PIPE::PIPE_S: return "PIPE_S"; + case pto::PIPE::PIPE_V: return "PIPE_V"; + case pto::PIPE::PIPE_M: return "PIPE_M"; + case pto::PIPE::PIPE_MTE1: return "PIPE_MTE1"; + case pto::PIPE::PIPE_MTE2: return "PIPE_MTE2"; + case pto::PIPE::PIPE_MTE3: return "PIPE_MTE3"; + case pto::PIPE::PIPE_ALL: return "PIPE_ALL"; + case pto::PIPE::PIPE_MTE4: return "PIPE_MTE4"; + case pto::PIPE::PIPE_MTE5: return "PIPE_MTE5"; + case 
pto::PIPE::PIPE_V2: return "PIPE_V2"; + case pto::PIPE::PIPE_FIX: return "PIPE_FIX"; + case pto::PIPE::VIRTUAL_PIPE_MTE2_L1A: return "VIRTUAL_PIPE_MTE2_L1A"; + case pto::PIPE::VIRTUAL_PIPE_MTE2_L1B: return "VIRTUAL_PIPE_MTE2_L1B"; + // Default fallback + default: return "PIPE_ALL"; + } +} + +//===----------------------------------------------------------------------===// +// pto.barrier lowering -> pipe_barrier(...) +//===----------------------------------------------------------------------===// +struct PTOBarrierToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::BarrierOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto *ctx = rewriter.getContext(); + + // [FIX] op.getPipe() returns PipeAttr. + // We must call .getPipe() on the attribute to get the actual Enum value. + pto::PIPE pipeEnum = op.getPipe().getPipe(); + + // Convert Enum to String (e.g., PIPE_ALL -> "PIPE_ALL") + std::string pipeStr = pto::stringifyPIPE(pipeEnum).str(); + + auto args = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, pipeStr) + }); + + rewriter.replaceOpWithNewOp( + op, + TypeRange{}, // void return + "pipe_barrier", // function name + args, // arguments + ArrayAttr{}, // template args + ValueRange{} // operands + ); + + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// Sync lowering (robust for bracket form pto.set_flag[...] / pto.wait_flag[...]) +// Replace your PTOSyncToRuntimeCall with the code below. 
+//===----------------------------------------------------------------------===// + +// Best-effort extraction of the (source pipe, destination pipe, event) tokens +// from the attributes of `op`. Tries, in order: well-known attribute names, an +// array attribute with at least three entries under a few known keys, and +// finally a scan over every attribute. The three output strings are written +// only when a strategy succeeds; otherwise a match failure is reported through +// `rewriter`. +static LogicalResult extractSyncTripletTokens(Operation *op, + std::string &srcTok, + std::string &dstTok, + std::string &evtTok, + ConversionPatternRewriter &rewriter) { + // Turn a pipe attribute (or an already-spelled string) into its token. + auto pipeToTok = [](mlir::Attribute a, std::string &out) -> bool { + if (!a) return false; + if (auto p = dyn_cast(a)) { + out = mlir::pto::stringifyPIPE(p.getPipe()).str(); + return true; + } + if (auto s = dyn_cast(a)) { + out = s.getValue().str(); // expects already like "PIPE_MTE2" + return true; + } + return false; + }; + + // Turn an event attribute (or an already-spelled string) into its token. + auto evtToTok = [](mlir::Attribute a, std::string &out) -> bool { + if (!a) return false; + if (auto e = dyn_cast(a)) { + out = mlir::pto::stringifyEVENT(e.getEvent()).str(); + return true; + } + if (auto s = dyn_cast(a)) { + out = s.getValue().str(); // expects already like "EVENT_ID0" + return true; + } + return false; + }; + + // Read the triplet from three named attributes; commits only if all convert. 
+ auto tryNamed = [&](StringRef s0, StringRef s1, StringRef e0) -> bool { + std::string st, dt, et; + if (!pipeToTok(op->getAttr(s0), st)) return false; + if (!pipeToTok(op->getAttr(s1), dt)) return false; + if (!evtToTok(op->getAttr(e0), et)) return false; + srcTok = std::move(st); + dstTok = std::move(dt); + evtTok = std::move(et); + return true; + }; + + // 1) Most common named-attr encodings + if (tryNamed("src_pipe", "dst_pipe", "event_id")) return success(); + if (tryNamed("srcPipe", "dstPipe", "eventId")) return success(); + if (tryNamed("src", "dst", "event")) return success(); + + // 2) Bracket-form / custom-asm often packs them into an ArrayAttr under some key + auto tryArrayKey = [&](StringRef key) -> bool { + auto arr = op->getAttrOfType(key); + if (!arr || arr.size() < 3) return false; + + std::string st, dt, et; + if (!pipeToTok(arr[0], st)) return false; + if (!pipeToTok(arr[1], dt)) return false; + if (!evtToTok(arr[2], et)) return false; + srcTok = std::move(st); + dstTok = std::move(dt); + evtTok = 
std::move(et); + return true; + }; + + if (tryArrayKey("args") || tryArrayKey("pipes") || tryArrayKey("sync") || + tryArrayKey("triplet") || tryArrayKey("attrs")) + return success(); + + // 3) Last resort: scan everything and pick 2 Pipe + 1 Event in encounter order. + // NOTE(review): MLIR keeps an op's attribute dictionary sorted by name, so + // "encounter order" here is presumably alphabetical by attribute name, not + // textual order; confirm src/dst cannot end up swapped. + // NOTE(review): pipeToTok is tried first and also accepts string-like + // attributes, so a string-valued event looks like it would be collected as + // a pipe and never reach evtToTok; confirm this is intended. + std::vector pipes; + std::string event; + for (auto &na : op->getAttrs()) { + Attribute a = na.getValue(); + std::string tok; + if (pipeToTok(a, tok)) { + pipes.push_back(std::move(tok)); + continue; + } + if (evtToTok(a, tok)) { + event = std::move(tok); + continue; + } + } + + if (pipes.size() >= 2 && !event.empty()) { + srcTok = pipes[0]; + dstTok = pipes[1]; + evtTok = event; + return success(); + } + + return rewriter.notifyMatchFailure(op, "cannot extract PIPE/PIPE/EVENT tokens from pto.{set,wait}_flag"); +} +// Token helpers: spell a PIPE / EVENT enumerator, or the enumerator held by a +// PipeAttr / EventAttr, as its identifier string. 
+static inline std::string pipeTokFromPipeEnum(mlir::pto::PIPE p) { + return mlir::pto::stringifyPIPE(p).str(); +} +static inline std::string evtTokFromEventEnum(mlir::pto::EVENT e) { + return mlir::pto::stringifyEVENT(e).str(); +} +static inline std::string pipeTokFromPipeAttr(mlir::pto::PipeAttr a) { + return mlir::pto::stringifyPIPE(a.getPipe()).str(); +} +static inline std::string evtTokFromEventAttr(mlir::pto::EventAttr a) { + return mlir::pto::stringifyEVENT(a.getEvent()).str(); +} + +// Detection traits: each HasGetX is true when the op type provides getX(). +template +struct HasGetSrcPipe : std::false_type {}; +template +struct HasGetSrcPipe().getSrcPipe())>> : std::true_type {}; + +template +struct HasGetDstPipe : std::false_type {}; +template +struct HasGetDstPipe().getDstPipe())>> : std::true_type {}; + +template +struct HasGetEventId : std::false_type {}; +template +struct HasGetEventId().getEventId())>> : std::true_type {}; + +template +struct HasGetSrcPipeAttr : std::false_type {}; +template +struct HasGetSrcPipeAttr().getSrcPipeAttr())>> : std::true_type {}; + +template +struct HasGetDstPipeAttr : std::false_type {}; +template +struct HasGetDstPipeAttr().getDstPipeAttr())>> : std::true_type {}; + +template +struct HasGetEventIdAttr : std::false_type 
{}; +template +struct HasGetEventIdAttr().getEventIdAttr())>> : std::true_type {}; + +// Extracts the sync triplet tokens from `op`, preferring its typed accessors +// (getSrcPipe/getDstPipe/getEventId, then the *Attr variants) when the op type +// provides them, and falling back to extractSyncTripletTokens otherwise. +template +static LogicalResult extractSyncTokens(SyncOpT op, + std::string &srcTok, + std::string &dstTok, + std::string &evtTok, + ConversionPatternRewriter &rewriter) { + if constexpr (HasGetSrcPipe::value && + HasGetDstPipe::value && + HasGetEventId::value) { + auto s = op.getSrcPipe(); + auto d = op.getDstPipe(); + auto e = op.getEventId(); + + // The accessors may return either the enum or the attribute wrapper. + if constexpr (std::is_same::value) srcTok = pipeTokFromPipeEnum(s); + else srcTok = pipeTokFromPipeAttr(s); + + if constexpr (std::is_same::value) dstTok = pipeTokFromPipeEnum(d); + else dstTok = pipeTokFromPipeAttr(d); + + if constexpr (std::is_same::value) evtTok = evtTokFromEventEnum(e); + else evtTok = evtTokFromEventAttr(e); + + return success(); + } + + if constexpr (HasGetSrcPipeAttr::value && + HasGetDstPipeAttr::value && + HasGetEventIdAttr::value) { + auto s = op.getSrcPipeAttr(); + auto d = op.getDstPipeAttr(); + auto e = op.getEventIdAttr(); + srcTok = pipeTokFromPipeAttr(s); + dstTok = pipeTokFromPipeAttr(d); + evtTok = evtTokFromEventAttr(e); + return success(); + } + + return extractSyncTripletTokens(op.getOperation(), srcTok, dstTok, evtTok, rewriter); +} +// Lowers pto.set_flag to a `set_flag` call whose three arguments are the +// source pipe, destination pipe and event tokens. 
+struct PTOSetFlagToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::SetFlagOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + auto *ctx = rewriter.getContext(); + + std::string srcTok, dstTok, evtTok; + if (failed(extractSyncTokens(op, srcTok, dstTok, evtTok, rewriter))) + return failure(); + + auto argsAttr = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, srcTok), + emitc::OpaqueAttr::get(ctx, dstTok), + emitc::OpaqueAttr::get(ctx, evtTok), + }); + + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "set_flag", + /*args=*/argsAttr, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{}); + return success(); + } +}; + +struct 
PTOWaitFlagToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::WaitFlagOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + auto *ctx = rewriter.getContext(); + + std::string srcTok, dstTok, evtTok; + if (failed(extractSyncTokens(op, srcTok, dstTok, evtTok, rewriter))) + return failure(); + + auto argsAttr = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, srcTok), + emitc::OpaqueAttr::get(ctx, dstTok), + emitc::OpaqueAttr::get(ctx, evtTok), + }); + + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "wait_flag", + /*args=*/argsAttr, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{}); + return success(); + } +}; + +struct PTOGetBufToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::GetBufOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + auto *ctx = rewriter.getContext(); + + std::string pipeTok = pipeTokFromPipeAttr(op.getPipe()); + auto argsAttr = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, pipeTok), + op.getBufIdAttr(), + op.getModeAttr(), + }); + + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "get_buf", + /*args=*/argsAttr, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{}); + return success(); + } +}; + +struct PTORlsBufToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::RlsBufOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + auto *ctx = rewriter.getContext(); + + std::string pipeTok = pipeTokFromPipeAttr(op.getPipe()); + auto argsAttr = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, pipeTok), + op.getBufIdAttr(), + op.getModeAttr(), + }); + + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "rls_buf", + /*args=*/argsAttr, + 
/*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{}); + return success(); + } +}; + +struct PTOSyncSetToEmitC : public OpConversionPattern { + PTOSyncSetToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + + LogicalResult + matchAndRewrite(mlir::pto::SyncSetOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + auto *ctx = rewriter.getContext(); + auto loc = op->getLoc(); + + std::string pipeTok = pipeTokFromPipeAttr(op.getPipe()); + auto argsAttr = rewriter.getArrayAttr( + {emitc::OpaqueAttr::get(ctx, pipeTok), op.getEventIdAttr()}); + const char *kSyncSetCallee = (targetArch == PTOArch::A3) + ? "ffts_cross_core_sync" + : "set_intra_block"; + rewriter.create(loc, TypeRange{}, kSyncSetCallee, + /*args=*/argsAttr, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{}); + + rewriter.eraseOp(op); + return success(); + } + + PTOArch targetArch; +}; + +struct PTOSyncWaitToEmitC : public OpConversionPattern { + PTOSyncWaitToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + + LogicalResult + matchAndRewrite(mlir::pto::SyncWaitOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + auto *ctx = rewriter.getContext(); + auto loc = op->getLoc(); + + std::string pipeTok = pipeTokFromPipeAttr(op.getPipe()); + auto argsAttr = rewriter.getArrayAttr( + {emitc::OpaqueAttr::get(ctx, pipeTok), op.getEventIdAttr()}); + const char *kSyncWaitCallee = + (targetArch == PTOArch::A3) ? 
"wait_flag_dev" : "wait_intra_block"; + rewriter.create(loc, TypeRange{}, kSyncWaitCallee, + argsAttr, ArrayAttr{}, ValueRange{}); + + rewriter.eraseOp(op); + return success(); + } + + PTOArch targetArch; +}; + +// GetBlockIdxOp Lowering (pto.get_block_idx -> get_block_idx()) +struct PTOGetBlockIdxToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(mlir::pto::GetBlockIdxOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + rewriter.replaceOpWithNewOp( + op, op.getType(), "get_block_idx", ValueRange{}, ArrayAttr{}, + ArrayAttr{}); + + return success(); + } +}; + +// GetBlockNumOp Lowering (pto.get_block_num -> get_block_num()) +struct PTOGetBlockNumToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(mlir::pto::GetBlockNumOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + rewriter.replaceOpWithNewOp( + op, op.getType(), "get_block_num", ValueRange{}, ArrayAttr{}, + ArrayAttr{}); + + return success(); + } +}; + +// GetSubBlockIdxOp Lowering (pto.get_block_idx -> get_subblockid()) +struct PTOGetSubBlockIdxToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(mlir::pto::GetSubBlockIdxOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + rewriter.replaceOpWithNewOp( + op, op.getType(), "get_subblockid", ValueRange{}, ArrayAttr{}, + ArrayAttr{}); + + return success(); + } +}; + +// GetSubBlockNumOp Lowering (pto.get_block_num -> get_subblockdim()) +struct PTOGetSubBlockNumToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(mlir::pto::GetSubBlockNumOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + rewriter.replaceOpWithNewOp( + op, op.getType(), 
"get_subblockdim", ValueRange{}, ArrayAttr{}, + ArrayAttr{}); + + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.mscatter lowering -> MSCATTER(mem, src, idx) +// pto.mscatter %src, %mem, %idx : memref<...>, memref<...>, memref<...> +//===----------------------------------------------------------------------===// + +struct PTOMScatterToMSCATTER : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::MScatterOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value src = peelUnrealized(adaptor.getSrc()); + Value mem = peelUnrealized(adaptor.getMem()); + + // pto-isa currently has no NPU implementation for MGATHER/MSCATTER. + // Fallback to a smoke-friendly lowering to keep compile/run coverage. + rewriter.create( + op.getLoc(), TypeRange{}, "TSTORE", + ArrayAttr{}, ArrayAttr{}, + ValueRange{mem, src}); + + rewriter.eraseOp(op); + return success(); + } +}; +struct PTOSetValToSETVAL : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSetValOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value dst = peelUnrealized(adaptor.getDst()); + Value val = peelUnrealized(adaptor.getVal()); + + // ---- offset: SSA index operand ---- + Value offset = peelUnrealized(adaptor.getOffset()); + + // NOTE: EmitC has no direct member-call op today. 
We emit a marker call + // and post-process ptoas output to rewrite it into: + // dst.SetValue(offset, val); + rewriter.create( + op.getLoc(), TypeRange{}, "PTOAS__TILE_SET_VALUE", + ArrayAttr{}, ArrayAttr{}, ValueRange{dst, offset, val}); + + rewriter.eraseOp(op); + return success(); + } +}; +struct PTOGetValToGETVAL : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TGetValOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value src = peelUnrealized(adaptor.getSrc()); + + // ---- offset: SSA index operand ---- + Value offset = peelUnrealized(adaptor.getOffset()); + + // NOTE: EmitC has no direct member-call op today. We emit a marker call + // and post-process ptoas output to rewrite it into: + // auto x = src.GetValue(offset); + Type dstTy = getTypeConverter()->convertType(op.getDst().getType()); + if (!dstTy) + return failure(); + auto call = rewriter.create( + op.getLoc(), + TypeRange{dstTy}, + "PTOAS__TILE_GET_VALUE", + ArrayAttr{}, ArrayAttr{}, + ValueRange{src, offset}); + + rewriter.replaceOp(op, call.getResults()); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.load_scalar / pto.store_scalar lowering -> ptr[offset] +//===----------------------------------------------------------------------===// + +struct PTOLoadScalarToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::LoadScalarOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value ptr = peelUnrealized(adaptor.getPtr()); + Value offset = peelUnrealized(adaptor.getOffset()); + + Type dstTy = getTypeConverter()->convertType(op.getValue().getType()); + if (!dstTy) + return failure(); + + auto call = rewriter.create( + op.getLoc(), TypeRange{dstTy}, "PTOAS__PTR_LOAD", + ArrayAttr{}, ArrayAttr{}, ValueRange{ptr, offset}); + + 
rewriter.replaceOp(op, call.getResults()); + return success(); + } +}; + +struct PTOStoreScalarToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::StoreScalarOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value ptr = peelUnrealized(adaptor.getPtr()); + Value offset = peelUnrealized(adaptor.getOffset()); + Value val = peelUnrealized(adaptor.getValue()); + + rewriter.create( + op.getLoc(), TypeRange{}, "PTOAS__PTR_STORE", + ArrayAttr{}, ArrayAttr{}, ValueRange{ptr, offset, val}); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.tabs lowering -> TABS(dst, src) +//===----------------------------------------------------------------------===// + +struct PTOTAbsToTABS : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAbsOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + // intrinsic: TABS(dst, src) + rewriter.create( + op.getLoc(), TypeRange{}, "TABS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.tadd lowering -> TADD(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTOTAddToTADD : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAddOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + rewriter.create( + 
op.getLoc(), TypeRange{}, "TADD", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src0, src1}); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// populate patterns +//===----------------------------------------------------------------------=== +struct ReinterpretCastToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(memref::ReinterpretCastOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + auto resMrTy = dyn_cast(op.getType()); + if (!resMrTy) + return failure(); + + auto asAttr = dyn_cast_or_null(resMrTy.getMemorySpace()); + const bool isGm = (!asAttr || asAttr.getAddressSpace() == pto::AddressSpace::GM); + + bool emitAddPtrTrace = op->hasAttr("pto.addptr_trace"); + Value source = peelUnrealized(adaptor.getSource()); + auto offsets = adaptor.getOffsets(); + Value offsetVal = offsets.empty() ? Value() : offsets[0]; + + // GM: keep pointer arithmetic. + if (isGm) { + if (!offsetVal) { + rewriter.replaceOp(op, source); + return success(); + } + + Type resultType = getTypeConverter()->convertType(op.getType()); + if (!resultType) + return failure(); + + auto addOp = rewriter.create(loc, resultType, source, offsetVal); + if (emitAddPtrTrace) { + rewriter.setInsertionPointAfter(addOp); + rewriter.create( + loc, TypeRange{}, "PTOAS__ADDPTR_TRACE", + ArrayAttr{}, ArrayAttr{}, + ValueRange{addOp.getResult(), source, offsetVal}); + } + rewriter.replaceOp(op, addOp.getResult()); + return success(); + } + + // UB/L1/L0 tiles: materialize a new Tile view by assigning an adjusted + // underlying pointer (in elements). + pto::AddressSpace as = asAttr.getAddressSpace(); + + // Element type token. 
+ std::string elemTok = "float"; + Type elemTy = resMrTy.getElementType(); + int64_t elemBytes = 4; + if (elemTy.isF16()) + elemBytes = 2, + elemTok = "half"; + else if (elemTy.isBF16()) + elemBytes = 2, + elemTok = "bfloat16_t"; + else if (elemTy.isF32()) + elemBytes = 4, + elemTok = "float"; + else if (elemTy.isInteger(8)) + elemBytes = 1, + elemTok = cast(elemTy).isUnsigned() ? "uint8_t" : "int8_t"; + else if (elemTy.isInteger(16)) + elemBytes = 2, + elemTok = cast(elemTy).isUnsigned() ? "uint16_t" : "int16_t"; + else if (elemTy.isInteger(32)) + elemBytes = 4, + elemTok = cast(elemTy).isUnsigned() ? "uint32_t" : "int32_t"; + else if (elemTy.isInteger(64)) + elemBytes = 8, + elemTok = cast(elemTy).isUnsigned() ? "uint64_t" : "int64_t"; + + // Tile role. + const char *roleTok = "TileType::Vec"; + switch (as) { + case pto::AddressSpace::VEC: + roleTok = "TileType::Vec"; + break; + case pto::AddressSpace::MAT: + roleTok = "TileType::Mat"; + break; + case pto::AddressSpace::LEFT: + roleTok = "TileType::Left"; + break; + case pto::AddressSpace::RIGHT: + roleTok = "TileType::Right"; + break; + case pto::AddressSpace::ACC: + roleTok = "TileType::Acc"; + break; + case pto::AddressSpace::BIAS: + roleTok = "TileType::Bias"; + break; + case pto::AddressSpace::GM: + roleTok = "TileType::Vec"; + break; + } + + // Shape (fallback to 32x32). + int64_t rows = 32, cols = 32; + if (resMrTy.getRank() >= 2 && resMrTy.hasStaticShape()) { + rows = resMrTy.getDimSize(0); + cols = resMrTy.getDimSize(1); + } + + // Keep a conservative default config for now. 
+ std::string tileTypeStr = + std::string("Tile<") + roleTok + ", " + elemTok + ", " + + std::to_string(rows) + ", " + std::to_string(cols) + + ", BLayout::RowMajor, " + std::to_string(rows) + ", " + + std::to_string(cols) + ", SLayout::NoneBox, 512, PadValue::Null>"; + + auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); + Value tile = rewriter + .create(loc, tileType, + emitc::OpaqueAttr::get(ctx, "")) + .getResult(); + + // Compute an integer address and assign it to the new tile. + // NOTE: pto-isa TASSIGN requires an integral address (not a pointer). + auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); + auto rcU64 = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "uint64_t")}); + + // Non-GM reinterpret_cast operands come from UB/L1/L0 tiles. + // We need the underlying address, but `__cce_get_tile_ptr()` is only valid + // inside `__tf__` functions. Use `tile.data()` (via a post-processed marker) + // and compute the adjusted address in bytes. + Value rawPtr = source; + if (auto ot = dyn_cast(source.getType())) { + // Only Tiles have a `.data()` member. For plain address-space pointers + // (e.g. `__ubuf__ float*`), use the pointer value directly. 
+ if (ot.getValue().starts_with("Tile<")) { + std::string rawPtrTok = + std::string(addrSpaceQualifier(as)) + " " + elemTok + "*"; + auto rawPtrTy = emitc::OpaqueType::get(ctx, rawPtrTok); + rawPtr = rewriter + .create(loc, rawPtrTy, + "PTOAS__TILE_DATA", ArrayAttr{}, + ArrayAttr{}, ValueRange{source}) + .getResult(0); + } + } + + Value baseAddr = rewriter + .create(loc, u64Ty, "reinterpret_cast", + /*args=*/ArrayAttr{}, + /*templateArgs=*/rcU64, + /*operands=*/ValueRange{rawPtr}) + .getResult(0); + + Value addr = baseAddr; + if (offsetVal) { + Value offU64 = offsetVal; + if (offU64.getType() != u64Ty) + offU64 = rewriter.create(loc, u64Ty, offU64).getResult(); + + auto bytesAttr = emitc::OpaqueAttr::get(ctx, std::to_string(elemBytes)); + Value bytesVal = rewriter.create(loc, u64Ty, bytesAttr); + Value byteOff = rewriter.create(loc, u64Ty, offU64, bytesVal); + addr = rewriter.create(loc, u64Ty, baseAddr, byteOff); + } + + rewriter.create(loc, TypeRange{}, "TASSIGN", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{tile, addr}); + + rewriter.replaceOp(op, tile); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.taddc lowering -> TADDC(dst, src0, src1, src2) +//===----------------------------------------------------------------------===// + +struct PTOTAddCToTADDC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAddCOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value src2 = peelUnrealized(adaptor.getSrc2()); + Value dst = peelUnrealized(adaptor.getDst()); + + // pto-isa does not provide NPU implementation for TADDC yet. 
+ // Decompose: dst = src0 + src1 + src2 + rewriter.create( + loc, TypeRange{}, "TADD", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src0, src1}); + rewriter.create( + loc, TypeRange{}, "TADD", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, dst, src2}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.tadds lowering -> TADDS(dst, src, scalar) +//===----------------------------------------------------------------------===// + +struct PTOAddSToTADDS : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAddSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value scalar = peelUnrealized(adaptor.getScalar()); + + rewriter.create( + op.getLoc(), TypeRange{}, "TADDS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src, scalar}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.taddsc lowering -> TADDSC(dst, src0, scalar, src1) +//===----------------------------------------------------------------------===// + +struct PTOAddSCToTADDSC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAddSCOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value scalar = peelUnrealized(adaptor.getScalar()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + // pto-isa does not provide NPU implementation for TADDSC yet. 
+ // Decompose: dst = src0 + scalar + src1 + rewriter.create( + loc, TypeRange{}, "TADDS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src0, scalar}); + rewriter.create( + loc, TypeRange{}, "TADD", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, dst, src1}); + + rewriter.eraseOp(op); + return success(); + } +}; +struct PTOTAndToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAndOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value a = peelUnrealized(adaptor.getSrc0()); + Value b = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + rewriter.create( + op.getLoc(), TypeRange{}, "TAND", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, a, b}); + + rewriter.eraseOp(op); + return success(); + } +}; +struct PTOAndSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAndSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value src = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + Value dst = peelUnrealized(adaptor.getDst()); + + rewriter.create( + op.getLoc(), TypeRange{}, "TANDS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src, scalar}); + + rewriter.eraseOp(op); + return success(); + } +}; + + +struct PTOTCIToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TCIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value dst = peelUnrealized(adaptor.getDst()); + Value S = peelUnrealized(adaptor.getS()); + + // scalar cpp type token + std::string scalarTok = "int32_t"; + if (auto it = S.getType().dyn_cast()) { + scalarTok = (it.getWidth() == 16) ? 
"int16_t" : "int32_t"; + } + + // descending -> "0"/"1" + std::string descTok = op.getDescending() ? "1" : "0"; + + ArrayAttr targs; + if (auto ot = dst.getType().dyn_cast()) { + std::string tileTok = ot.getValue().str(); // "Tile<...>" + targs = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, tileTok), + emitc::OpaqueAttr::get(ctx, scalarTok), + emitc::OpaqueAttr::get(ctx, descTok), + }); + } else { + targs = rewriter.getArrayAttr({}); + } + + rewriter.create( + loc, TypeRange{}, "TCI", + /*args=*/ArrayAttr{}, + /*templateArgs=*/targs, + /*operands=*/ValueRange{dst, S}); + + rewriter.eraseOp(op); + return success(); + } +}; +static std::string cmpModeTok(pto::CmpModeAttr a) { + // 生成 "CmpMode::GT" 这种 token + auto m = a.getValue(); // 取 enum + switch (m) { + case pto::CmpMode::EQ: return "CmpMode::EQ"; + case pto::CmpMode::NE: return "CmpMode::NE"; + case pto::CmpMode::LT: return "CmpMode::LT"; + case pto::CmpMode::LE: return "CmpMode::LE"; + case pto::CmpMode::GT: return "CmpMode::GT"; + case pto::CmpMode::GE: return "CmpMode::GE"; + } + return "CmpMode::EQ"; +} +struct PTOColExpandToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TColExpandOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value dst = peelUnrealized(adaptor.getDst()); + Value src = peelUnrealized(adaptor.getSrc()); + + rewriter.create( + loc, TypeRange{}, "TCOLEXPAND", + /*args=*/ArrayAttr(), + /*templateArgs=*/ArrayAttr(), + /*operands=*/ValueRange{dst, src}); + + rewriter.eraseOp(op); + return success(); + } +}; + +struct PTOCmpToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TCmpOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value dst = 
peelUnrealized(adaptor.getDst()); + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + + std::string tok = "CmpMode::EQ"; + if (auto a = op.getCmpModeAttr()) + tok = cmpModeTok(a); + + auto modeTy = emitc::OpaqueType::get(ctx, "CmpMode"); + Value modeVal = rewriter.create( + loc, modeTy, emitc::OpaqueAttr::get(ctx, tok)); + + auto argsAttr = rewriter.getArrayAttr({}); + + rewriter.create( + loc, + TypeRange{}, + "TCMP", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src0, src1, modeVal}); + + rewriter.eraseOp(op); + return success(); + } +}; + +struct PTOCmpSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TCmpSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value dst = peelUnrealized(adaptor.getDst()); + Value src = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + + // cmpMode -> token + auto cmpAttr = op.getCmpModeAttr(); // PTO_CmpModeAttr + std::string tok = cmpModeTok(cmpAttr); + + auto modeTy = emitc::OpaqueType::get(ctx, "CmpMode"); + Value modeVal = rewriter.create( + loc, modeTy, emitc::OpaqueAttr::get(ctx, tok)); + + rewriter.create( + loc, + TypeRange{}, + "TCMPS", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src, scalar, modeVal}); + + rewriter.eraseOp(op); + return success(); + } +}; + + +struct PTOColMaxToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TColMaxOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + // intrinsic: TCOLMAX(dst, 
src) + rewriter.create( + loc, TypeRange{}, "TCOLMAX", + /*args=*/ArrayAttr{}, // default: print all operands + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src}); + + rewriter.eraseOp(op); + return success(); + } +}; +struct PTOColMinToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TColMinOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); // NOTE(review): ctx is not referenced again in this pattern + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + // Emit the intrinsic call TCOLMIN(dst, src); the original op is erased below + rewriter.create( + loc, TypeRange{}, "TCOLMIN", + /*args=*/ArrayAttr{}, // default: print all operands + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src}); + + rewriter.eraseOp(op); + return success(); + } +}; +struct PTOColSumToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TColSumOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + // tmp is optional: only take the 4-operand form when the op carries one + if (op.getTmp()) { + // Format 2: TCOLSUM(dst, src, tmp, isBinary); isBinary stays false when the attribute is absent + Value tmp = peelUnrealized(adaptor.getTmp()); + bool isBinary = false; + if (auto a = op.getIsBinaryAttr()) + isBinary = a.getValue(); + + auto boolTy = emitc::OpaqueType::get(ctx, "bool"); + auto tok = isBinary ? 
"true" : "false"; + Value isBinaryVal = rewriter.create( + loc, boolTy, emitc::OpaqueAttr::get(ctx, tok)); + + rewriter.create( + loc, TypeRange{}, "TCOLSUM", + /*args=*/ArrayAttr(), + /*templateArgs=*/ArrayAttr(), + /*operands=*/ValueRange{dst, src, tmp, isBinaryVal}); + } else { + // Format 1: TCOLSUM(dst, src), without tmp and isBinary + rewriter.create( + loc, TypeRange{}, "TCOLSUM", + /*args=*/ArrayAttr(), + /*templateArgs=*/ArrayAttr(), + /*operands=*/ValueRange{dst, src}); + } + + rewriter.eraseOp(op); + return success(); + } +}; +static std::string roundModeTok(mlir::pto::RoundModeAttr attr) { // maps a RoundMode enum value to the "RoundMode::CAST_*" token text + using RM = mlir::pto::RoundMode; + switch (attr.getValue()) { + case RM::NONE: return "RoundMode::CAST_NONE"; + case RM::RINT: return "RoundMode::CAST_RINT"; + case RM::ROUND: return "RoundMode::CAST_ROUND"; + case RM::FLOOR: return "RoundMode::CAST_FLOOR"; + case RM::CEIL: return "RoundMode::CAST_CEIL"; + case RM::TRUNC: return "RoundMode::CAST_TRUNC"; + case RM::ODD: return "RoundMode::CAST_ODD"; + case RM::CAST_RINT: return "RoundMode::CAST_RINT"; // same token as RM::RINT + } + return "RoundMode::CAST_RINT"; // fallback for any value not handled by the switch +} +struct PTOCvtToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TCvtOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + // When the op has no rmode attribute, fall back to RoundMode::CAST_RINT + pto::RoundModeAttr rmAttr = op.getRmodeAttr(); + std::string rmTok = rmAttr ? 
roundModeTok(rmAttr) + : std::string("RoundMode::CAST_RINT"); + + // Emit: TCVT(dst, src, RoundMode::XXX); the round mode is a value built from an emitc::OpaqueAttr + auto rmodeTy = emitc::OpaqueType::get(ctx, "RoundMode"); + Value rmodeVal = rewriter.create( + loc, rmodeTy, emitc::OpaqueAttr::get(ctx, rmTok)); + + // args is left empty here; only operands are passed: dst, src and rmode + rewriter.create( + loc, TypeRange{}, "TCVT", + /*args=*/ArrayAttr{}, // args is not used + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src, rmodeVal}); // pass dst, src and rmode + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.tdiv lowering -> TDIV(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTODivToTDIV : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TDivOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + rewriter.create( + op.getLoc(), TypeRange{}, "TDIV", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src0, src1}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.tdivs lowering -> TDIVS(dst, src, scalar) or TDIVS(dst, scalar, src) +// Order is determined by operand types: if src is tile_buf, order is (tile, scalar) +// Otherwise, order is (scalar, tile) +//===----------------------------------------------------------------------===// + +struct PTODivSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TDivSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc 
= op.getLoc(); + + // Check types BEFORE conversion (using original op types, not 
adaptor types) + // The adaptor types may already be converted to emitc.opaque + Value origSrc = op.getSrc(); + Value origScalar = op.getScalar(); + + // Determine order based on original operand types + // Check if src is memref/tensor/partition_tensor_view/tile (not scalar) + bool srcIsMemref = (isa(origSrc.getType()) || + isa(origSrc.getType()) || + isa(origSrc.getType()) || + isa(origSrc.getType())); + // Check if scalar is memref/tensor/partition_tensor_view/tile (not scalar) + bool scalarIsMemref = (isa(origScalar.getType()) || + isa(origScalar.getType()) || + isa(origScalar.getType()) || + isa(origScalar.getType())); + + Value src = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + Value dst = peelUnrealized(adaptor.getDst()); + + if (srcIsMemref && !scalarIsMemref) { + // memref/scalar: TDIVS(dst, src, scalar) - normal order + rewriter.create( + loc, TypeRange{}, "TDIVS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src, scalar}); + } else if (!srcIsMemref && scalarIsMemref) { + // scalar/memref: TDIVS(dst, scalar, src) - swapped order + rewriter.create( + loc, TypeRange{}, "TDIVS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, scalar, src}); + } else { + // Should not happen if the op verifier is correct; report an error instead of guessing an operand order + return op.emitError("TDivSOp: expected exactly one memref/tensor operand and one scalar operand"); + } + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto.tdivs (TDivSOp) lowering -> TDIVS(dst, src, scalar) or TDIVS(dst, scalar, src) +// Order is determined by operand types: if src is tile_buf, order is (tile, scalar) +// Otherwise, order is (scalar, tile). 
NOTE(review): PTODivSToEmitC above also matches pto::TDivSOp +//===----------------------------------------------------------------------===// + +struct PTOTDivSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TDivSOp op, 
OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + Value dst = peelUnrealized(adaptor.getDst()); + + // Determine order based on the (already converted) operand types + bool srcIsTile = isa(src.getType()); + bool scalarIsTile = isa(scalar.getType()); + + if (srcIsTile && !scalarIsTile) { + // tile/scalar: TDIVS(dst, src, scalar) + rewriter.create( + loc, TypeRange{}, "TDIVS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src, scalar}); + } else if (!srcIsTile && scalarIsTile) { + // scalar/tile: TDIVS(dst, scalar, src) + rewriter.create( + loc, TypeRange{}, "TDIVS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, scalar, src}); + } else { + // Default: assume src is tile (should not happen if types are correct) + rewriter.create( + loc, TypeRange{}, "TDIVS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src, scalar}); + } + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.texp lowering -> TEXP(dst, src) +//===----------------------------------------------------------------------===// + +struct PTOExpToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TExpOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + rewriter.create( + loc, TypeRange{}, "TEXP", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, src}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.texpands lowering -> TEXPANDS(dst, scalar) +//===----------------------------------------------------------------------===// + +struct PTOExpandsToEmitC : public 
OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TExpandsOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value scalar = peelUnrealized(adaptor.getScalar()); + Value dst = peelUnrealized(adaptor.getDst()); + + rewriter.create( + loc, TypeRange{}, "TEXPANDS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, scalar}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.textract lowering -> TEXTRACT(dst, src, indexRow, indexCol) +//===----------------------------------------------------------------------===// + +struct PTOExtractToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TExtractOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value r0 = peelUnrealized(adaptor.getIndexRow()); + Value c0 = peelUnrealized(adaptor.getIndexCol()); + + rewriter.create( + loc, TypeRange{}, "TEXTRACT", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src, r0, c0}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.tfillpad lowering -> TFILLPAD_EXPAND(dst, src) +//===----------------------------------------------------------------------===// + +struct PTOFillPadToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TFillPadOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + 
rewriter.create( + loc, TypeRange{}, "TFILLPAD_EXPAND", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto.tgather lowering +// - Index form: TGATHER(dst, src0, indices) +// - Mask form : TGATHER(dst, src0) +//===----------------------------------------------------------------------===// + +static std::string maskPatternTok(mlir::pto::MaskPatternAttr a) { + + auto v = a.getValue(); // enum value; NOTE(review): PTOGatherToEmitC below builds its token inline with "MaskPattern::" (no pto:: prefix) + return (std::string("pto::MaskPattern::") + mlir::pto::stringifyMaskPattern(v).str()); +} + +struct PTOGatherToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TGatherOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value dst = peelUnrealized(adaptor.getDst()); + Value src0 = peelUnrealized(adaptor.getSrc()); + + // Case 1: index-based TGATHER(dst, src0, indices) + if (Value idx = adaptor.getIndices()) { + idx = peelUnrealized(idx); + + rewriter.create( + loc, TypeRange{}, "TGATHER", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src0, idx}); + + rewriter.eraseOp(op); + return success(); + } + + // Case 2: mask-pattern TGATHER(dst, src0) + auto mp = op.getMaskPatternAttr(); + if (!mp) + return rewriter.notifyMatchFailure(op, "expected maskPattern when indices is absent"); + + auto getOpaqueTok = [&](Value v, StringRef name) -> FailureOr { + if (auto ot = v.getType().dyn_cast()) + return ot.getValue().str(); + return rewriter.notifyMatchFailure(op, (name + " must be emitc::OpaqueType (tile)").str()); + }; + + auto dstTokOr = getOpaqueTok(dst, "dst"); + auto srcTokOr = getOpaqueTok(src0, 
"src0"); + if (failed(dstTokOr) || failed(srcTokOr)) + return failure(); + + // mp is an EnumAttr; 
stringify name is "P0101" etc. + // We emit MaskPattern::P0101 (because generated C++ has `using namespace pto;`) + std::string mpTok = std::string("MaskPattern::") + + mlir::pto::stringifyMaskPattern(mp.getValue()).str(); + + auto targs = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, *dstTokOr), + emitc::OpaqueAttr::get(ctx, *srcTokOr), + emitc::OpaqueAttr::get(ctx, mpTok), + }); + + rewriter.create( + loc, TypeRange{}, "TGATHER", + /*args=*/ArrayAttr{}, + /*templateArgs=*/targs, + /*operands=*/ValueRange{dst, src0}); + + rewriter.eraseOp(op); + return success(); + } +}; + + +struct PTOGatherbToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TGatherBOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value offsets = peelUnrealized(adaptor.getOffsets()); + Value dst = peelUnrealized(adaptor.getDst()); + + rewriter.create( + loc, TypeRange{}, "TGATHERB", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src, offsets}); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TLogOp lowering -> TLOG(dst, src) +//===----------------------------------------------------------------------===// + +struct PTOLogToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TLogOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TLOG", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + + 
+ +//===----------------------------------------------------------------------===// +// pto::TLReluOp lowering -> TLRELU(dst, src, slope) +//===----------------------------------------------------------------------===// + + struct PTOLReluToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TLReluOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value slope = peelUnrealized(adaptor.getSlope()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src, slope}; + + rewriter.create( + loc, TypeRange{}, "TLRELU", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TMaxOp lowering -> TMAX(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTOMaxToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMaxOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TMAX", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TMaxSOp lowering -> TMAXS(dst, src, scalar) +//===----------------------------------------------------------------------===// + + struct PTOMaxSToEmitC : public OpConversionPattern { + using 
OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMaxSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, scalar}; + rewriter.create( + loc, TypeRange{}, "TMAXS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + + +//===----------------------------------------------------------------------===// +// pto::TMinOp lowering -> TMIN(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTOMinToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMinOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TMIN", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TMinSOp lowering -> TMINS(dst, src, scalar) +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The scalar is forwarded as a plain call operand (no APFloat/FloatAttr handling in this pattern) +//===----------------------------------------------------------------------===// + +struct PTOMinsToEmitC : public OpConversionPattern { + using 
OpConversionPattern::OpConversionPattern; + + LogicalResult 
matchAndRewrite(pto::TMinSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value scalar = peelUnrealized(adaptor.getScalar()); + + SmallVector operands{dst, src, scalar}; + rewriter.create( + loc, TypeRange{}, "TMINS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TMovOp lowering -> TMOV(dst, src) +//===----------------------------------------------------------------------===// + +struct PTOMovToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMovOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TMOV", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TMovFPOp lowering -> TMOV_FP(dst, src, fp); the three opaque type strings become template args when all are available +//===----------------------------------------------------------------------===// + +struct PTOMovFPToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMovFPOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + Value dst = peelUnrealized(adaptor.getDst()); + Value src = peelUnrealized(adaptor.getSrc()); + Value fp = 
peelUnrealized(adaptor.getFp()); + + // 
TMOV_FP(dstTileData, cTile, fbTile) + ArrayAttr templateArgs; + auto dstOT = dst.getType().dyn_cast(); + auto srcOT = src.getType().dyn_cast(); + auto fpOT = fp.getType().dyn_cast(); + if (dstOT && srcOT && fpOT) { + templateArgs = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, dstOT.getValue().str()), + emitc::OpaqueAttr::get(ctx, srcOT.getValue().str()), + emitc::OpaqueAttr::get(ctx, fpOT.getValue().str()), + }); + } else { + templateArgs = ArrayAttr{}; + } + + SmallVector operands{dst, src, fp}; + rewriter.create( + loc, TypeRange{}, "TMOV_FP", + /*args=*/ArrayAttr{}, /*templateArgs=*/templateArgs, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TMrgSortOp lowering -> TMRGSORT(dst, src, blockLen) for format1, or the templated 7-operand form for format2 +//===----------------------------------------------------------------------===// + +struct PTOMrgSortToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMrgSortOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + if (op.isFormat1()) { + Value src = peelUnrealized(adaptor.getSrcs().front()); + Value dst = peelUnrealized(adaptor.getDsts().front()); + Value blockLen = peelUnrealized(adaptor.getBlockLen()); + + SmallVector operands{dst, src, blockLen}; + rewriter.create( + loc, TypeRange{}, "TMRGSORT", + ArrayAttr{}, ArrayAttr{}, operands); + } else if (op.isFormat2()) { + // pto-isa API: + // TMRGSORT( + // dst, executedNumList, tmp, src0, src1, src2, src3); + auto *ctx = rewriter.getContext(); + + Value dst = peelUnrealized(adaptor.getDsts()[0]); + Value tmp = peelUnrealized(adaptor.getDsts()[1]); + Value excuted = peelUnrealized(adaptor.getExcuted()); + + SmallVector srcs; + srcs.reserve(4); + for (Value v : adaptor.getSrcs()) + 
srcs.push_back(peelUnrealized(v)); + + auto dstOT = 
dst.getType().dyn_cast(); + auto tmpOT = tmp.getType().dyn_cast(); + if (!dstOT || !tmpOT || srcs.size() != 4) + return op.emitOpError("format2 expects (dst,tmp) tilebufs and exactly 4 srcs"); + + SmallVector targs; + targs.reserve(7); + targs.push_back(emitc::OpaqueAttr::get(ctx, dstOT.getValue().str())); + targs.push_back(emitc::OpaqueAttr::get(ctx, tmpOT.getValue().str())); + for (Value v : srcs) { + auto ot = v.getType().dyn_cast(); + if (!ot) + return op.emitOpError("format2 expects tilebuf srcs"); + targs.push_back(emitc::OpaqueAttr::get(ctx, ot.getValue().str())); + } + targs.push_back(emitc::OpaqueAttr::get(ctx, op.getExhausted() ? "true" : "false")); + ArrayAttr templateArgs = rewriter.getArrayAttr(targs); + + SmallVector operands{dst, excuted, tmp}; + operands.append(srcs.begin(), srcs.end()); + + rewriter.create( + loc, TypeRange{}, "TMRGSORT", + /*args=*/ArrayAttr{}, /*templateArgs=*/templateArgs, operands); + } else { + return op.emitOpError("unsupported mrgsort_dps format"); + } + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TMulOp lowering -> TMUL(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTOMulToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMulOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TMUL", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; 
+//===----------------------------------------------------------------------===// +// pto::TMulSOp lowering -> TMULS(dst, src0, scalar) +//===----------------------------------------------------------------------===// + +struct PTOMulsToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMulSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc0()); + Value dst = peelUnrealized(adaptor.getDst()); + Value scalar = peelUnrealized(adaptor.getScalar()); + + SmallVector operands{dst, src, scalar}; + rewriter.create( + loc, TypeRange{}, "TMULS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TNegOp lowering -> TNEG(dst, src) +//===----------------------------------------------------------------------===// + +struct PTONegToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TNegOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TNEG", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TNotOp lowering -> TNOT(dst, src) +//===----------------------------------------------------------------------===// + +struct PTONotToEmitC : public OpConversionPattern { + using 
OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TNotOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TNOT", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TOrOp lowering -> TOR(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTOOrToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TOrOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TOR", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TOrSOp lowering -> TORS(dst, src, scalar); the scalar is passed without peelUnrealized +//===----------------------------------------------------------------------===// + +struct PTOOrsToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TOrSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + // NOTE: The conversion type system 
may materialize integers as emitc.opaque + // (e.g. "int32_t"). For EmitC call emission we can pass the scalar through + // directly without arith casts here. + Value s = adaptor.getScalar(); + + SmallVector operands{dst, src0, s}; + rewriter.create( + loc, TypeRange{}, "TORS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TPartAddOp lowering -> TPARTADD(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTOPartAddToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TPartAddOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TPARTADD", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TPartMaxOp lowering -> TPARTMAX(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTOPartMaxToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TPartMaxOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + 
rewriter.create( + loc, TypeRange{}, "TPARTMAX", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TPartMinOp lowering -> TPARTMIN(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTOPartMinToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TPartMinOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TPARTMIN", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TPReluOp lowering -> TPRELU(dst, src0, src1, tmp), with dst reused as tmp +//===----------------------------------------------------------------------===// + +struct PTOPreluToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TPReluOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + // pto-isa TPRELU requires a tmp tile argument. Current NPU implementation + // does not use tmp, so we safely pass dst as tmp for compatibility. 
+ SmallVector operands{dst, src0, src1, dst}; + rewriter.create( + loc, TypeRange{}, "TPRELU", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TRecipOp lowering -> TRECIP(dst, src) +//===----------------------------------------------------------------------===// + +struct PTORecipToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRecipOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TRECIP", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TReluOp lowering -> TRELU(dst, src) +//===----------------------------------------------------------------------===// + +struct PTOReluToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TReluOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TRELU", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TRemOp lowering -> TREM(dst, src0, src1) (a 
DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTORemToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRemOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TREM", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TRemSOp lowering -> TREMS(dst, src, scalar) +//===----------------------------------------------------------------------===// + +struct PTORemSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRemSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value scalar = peelUnrealized(adaptor.getScalar()); + + SmallVector operands{dst, src, scalar}; + rewriter.create( + loc, TypeRange{}, "TREMS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// pto::TRowExpandOp lowering -> TROWEXPAND(dst, src) +//===----------------------------------------------------------------------===// + +struct PTORowExpandToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult 
matchAndRewrite(pto::TRowExpandOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TROWEXPAND", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// Shared opaque-call helper and GEMV/matmul lowerings; the pto::TRowExpandDivOp -> TROWEXPANDDIV(dst, src0, src1) pattern follows them +//===----------------------------------------------------------------------===// +// Helper: emit a call to callee with args; erase op when it has no results, otherwise replace it with the call results. +static void replaceOrEraseWithOpaqueCall(Operation *op, + StringRef callee, + ArrayRef args, + ConversionPatternRewriter &rewriter) { + TypeRange resultTypes = op->getResultTypes(); + auto call = rewriter.create( + op->getLoc(), resultTypes, callee, ArrayAttr{}, ArrayAttr{}, ValueRange(args)); + if (resultTypes.empty()) + rewriter.eraseOp(op); + else + rewriter.replaceOp(op, call.getResults()); +} + +// ---------- GEMV / matmul ops (lowered through the helper above, dst passed first) ---------- +struct PTOTGemvBiasToTGEMV_BIAS + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TGemvBiasOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value a = peelUnrealized(adaptor.getA()); + Value b = peelUnrealized(adaptor.getB()); + Value bias = peelUnrealized(adaptor.getBias()); + Value dst = peelUnrealized(adaptor.getDst()); + + replaceOrEraseWithOpaqueCall(op.getOperation(), "TGEMV_BIAS", + {dst, a, b, bias}, rewriter); + return success(); + } +}; + +struct PTOTMatmulBiasToTMATMUL_BIAS + : public 
OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMatmulBiasOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) 
const override { + Value a = peelUnrealized(adaptor.getA()); + Value b = peelUnrealized(adaptor.getB()); + Value bias = peelUnrealized(adaptor.getBias()); + Value dst = peelUnrealized(adaptor.getDst()); + + replaceOrEraseWithOpaqueCall(op.getOperation(), "TMATMUL_BIAS", + {dst, a, b, bias}, rewriter); + return success(); + } +}; + +struct PTOTMatmulMXToTMATMUL_MX + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMatmulMxOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value a = peelUnrealized(adaptor.getA()); + Value aScale = peelUnrealized(adaptor.getAScale()); + Value b = peelUnrealized(adaptor.getB()); + Value bScale = peelUnrealized(adaptor.getBScale()); + Value dst = peelUnrealized(adaptor.getDst()); + + replaceOrEraseWithOpaqueCall(op.getOperation(), "TMATMUL_MX", + {dst, a, aScale, b, bScale}, rewriter); + return success(); + } +}; + +struct PTOTMatmulMXAccToTMATMUL_MX_ACC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMatmulMxAccOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value cIn = peelUnrealized(adaptor.getCIn()); + Value a = peelUnrealized(adaptor.getA()); + Value aScale = peelUnrealized(adaptor.getAScale()); + Value b = peelUnrealized(adaptor.getB()); + Value bScale = peelUnrealized(adaptor.getBScale()); + Value dst = peelUnrealized(adaptor.getDst()); + + replaceOrEraseWithOpaqueCall(op.getOperation(), "TMATMUL_MX_ACC", + {dst, cIn, a, aScale, b, bScale}, rewriter); + return success(); + } +}; + +struct PTOTMatmulMXBiasToTMATMUL_MX_BIAS + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TMatmulMxBiasOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value a = peelUnrealized(adaptor.getA()); + Value aScale = 
peelUnrealized(adaptor.getAScale()); + Value b = peelUnrealized(adaptor.getB()); + Value bScale = peelUnrealized(adaptor.getBScale()); + Value bias = peelUnrealized(adaptor.getBias()); + Value dst = peelUnrealized(adaptor.getDst()); + + replaceOrEraseWithOpaqueCall(op.getOperation(), "TMATMUL_MX_BIAS", + {dst, a, aScale, b, bScale, bias}, rewriter); + return success(); + } +}; + +struct PTORowExpandDivToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRowExpandDivOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TROWEXPANDDIV", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// pto::TRowExpandMulOp lowering -> TROWEXPANDMUL(dst, src0, src1) +//===----------------------------------------------------------------------===// + +struct PTORowExpandMulToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRowExpandMulOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TROWEXPANDMUL", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + 
+//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TROWEXPANDSUB DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTORowExpandSubToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRowExpandSubOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TROWEXPANDSUB", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TROWMAX DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTORowMaxToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRowMaxOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value tmp = peelUnrealized(adaptor.getTmp()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src, tmp}; + rewriter.create( + loc, TypeRange{}, "TROWMAX", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TROWMIN DPS/memref op) 
+//===----------------------------------------------------------------------===// + +struct PTORowMinToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRowMinOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value tmp = peelUnrealized(adaptor.getTmp()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src, tmp}; + rewriter.create( + loc, TypeRange{}, "TROWMIN", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TROWSUM DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTORowSumToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRowSumOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value tmp = peelUnrealized(adaptor.getTmp()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src, tmp}; + rewriter.create( + loc, TypeRange{}, "TROWSUM", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TRSQRT DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTORsqrtToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TRsqrtOp op, OpAdaptor 
adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TRSQRT", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSCATTER DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOScatterToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TScatterOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value idx = peelUnrealized(adaptor.getIndexes()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src, idx}; + rewriter.create( + loc, TypeRange{}, "TSCATTER", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSEL DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSelToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSelOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value mask = peelUnrealized(adaptor.getMask()); + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector 
operands{dst, mask, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TSEL", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSELS DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSelSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSelSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value selectMode = peelUnrealized(adaptor.getSelectMode()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1, selectMode}; + rewriter.create( + loc, TypeRange{}, "TSELS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSHL DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOShlSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TShlOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TSHL", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + 
return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSHR DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOShrSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TShrOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TSHR", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering for TSHLS/TSHRS DPS: shift by scalar) +//===----------------------------------------------------------------------===// + +struct PTOShlSConstToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TShlSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Value dst = peelUnrealized(adaptor.getDst()); + Value src = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + SmallVector operands{dst, src, scalar}; + rewriter.create( + loc, TypeRange{}, "TSHLS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + rewriter.eraseOp(op); + return success(); + } +}; + +struct PTOShrSConstToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TShrSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override 
{ + auto loc = op.getLoc(); + Value dst = peelUnrealized(adaptor.getDst()); + Value src = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + SmallVector operands{dst, src, scalar}; + rewriter.create( + loc, TypeRange{}, "TSHRS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSORT32 DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSORT32SToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSort32Op op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value idx = peelUnrealized(adaptor.getIdx()); + + SmallVector operands{dst, src, idx}; + rewriter.create( + loc, TypeRange{}, "TSORT32", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSQRT DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSqrtSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSqrtOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src}; + rewriter.create( + loc, TypeRange{}, "TSQRT", + /*args=*/ArrayAttr{}, 
/*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSTORE_FP DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOStoreFPSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TStoreFPOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value fp = peelUnrealized(adaptor.getFp()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src, fp}; + rewriter.create( + loc, TypeRange{}, "TSTORE_FP", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSUB DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSubSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSubOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src0, src1}; + rewriter.create( + loc, TypeRange{}, "TSUB", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSUBC 
DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSubCSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSubCOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value src2 = peelUnrealized(adaptor.getSrc2()); + Value dst = peelUnrealized(adaptor.getDst()); + + // pto-isa does not provide NPU implementation for TSUBC yet. + // Decompose: dst = src0 - src1 + src2 + rewriter.create( + loc, TypeRange{}, "TSUB", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src0, src1}); + rewriter.create( + loc, TypeRange{}, "TADD", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, dst, src2}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSUBS DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSubSSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSubSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src, scalar}; + rewriter.create( + loc, TypeRange{}, "TSUBS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + 
patterns.add for TSUBSC DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSubSCToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSubSCOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value scalar = peelUnrealized(adaptor.getScalar()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + // pto-isa does not provide NPU implementation for TSUBSC yet. + // Decompose: dst = src0 - scalar + src1 + rewriter.create( + loc, TypeRange{}, "TSUBS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src0, scalar}); + rewriter.create( + loc, TypeRange{}, "TADD", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, dst, src1}); + + rewriter.eraseOp(op); + return success(); + } +}; + + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TXOR DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOXORToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TXorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src0 = peelUnrealized(adaptor.getSrc0()); + Value src1 = peelUnrealized(adaptor.getSrc1()); + Value dst = peelUnrealized(adaptor.getDst()); + + // pto-isa TXOR requires a tmp tile argument. Current NPU implementation + // does not use tmp, so we safely pass dst as tmp for compatibility. 
+ SmallVector operands{dst, src0, src1, dst}; + rewriter.create( + loc, TypeRange{}, "TXOR", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +struct PTOTTransToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TTransOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value tmp = peelUnrealized(adaptor.getTmp()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, src, tmp}; + rewriter.create( + loc, TypeRange{}, "TTRANS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TXORS DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOXORSToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TXorSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value scalar = peelUnrealized(adaptor.getScalar()); + Value dst = peelUnrealized(adaptor.getDst()); + + // pto-isa TXORS requires a tmp tile argument. Current NPU implementation + // does not use tmp, so we safely pass dst as tmp for compatibility. 
+ SmallVector operands{dst, src, scalar, dst}; + rewriter.create( + loc, TypeRange{}, "TXORS", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + struct PTOPrintToTPRINT : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TPrintOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + + SmallVector operands{src}; + rewriter.create( + loc, TypeRange{}, "TPRINT", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +// pto.print "format", %scalar -> cce::printf("format", scalar). The format is emitted as a C string literal: quotes and backslashes are backslash-escaped, newline and tab become \n and \t, and an empty format defaults to "%f". +struct PTOPrintOpToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::PrintOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + + std::string fmt = op.getFormat().str(); + if (fmt.empty()) + fmt = "%f"; + std::string quoted = "\""; + for (char c : fmt) { + if (c == '"' || c == '\\') + quoted += std::string{'\\', c}; // emit the backslash AND the escaped character (previously the character was dropped) + else if (c == '\n') + quoted += "\\n"; + else if (c == '\t') + quoted += "\\t"; + else + quoted += c; + } + quoted += "\""; + + Value scalar = peelUnrealized(adaptor.getScalar()); + auto argsAttr = rewriter.getArrayAttr( + {emitc::OpaqueAttr::get(ctx, quoted), + IntegerAttr::get(IndexType::get(ctx), 0)}); + rewriter.create( + loc, TypeRange{}, "cce::printf", + /*args=*/argsAttr, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{scalar}); + + rewriter.eraseOp(op); + return success(); + } +}; + +// pto.trap -> trap() +struct PTOTrapOpToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TrapOp op, OpAdaptor adaptor, +
ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + rewriter.create( + loc, TypeRange{}, "trap", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{}); + + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// PTOConvert.cpp (add lowering + patterns.add for TSYNC DPS/memref op) +//===----------------------------------------------------------------------===// + +struct PTOSYNCToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TSyncOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value events = peelUnrealized(adaptor.getEvents()); + Value dst = peelUnrealized(adaptor.getDst()); + + SmallVector operands{dst, events}; + rewriter.create( + loc, TypeRange{}, "TSYNC", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/operands); + + rewriter.eraseOp(op); + return success(); + } +}; + +// ============================================================================= +// 2. 
BindTileOp Lowering (FIX: Trace back to physical address) +// ============================================================================= +struct PTOBindTileToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + static bool getIndexConst(Value v, int64_t &out) { + if (!v) + return false; + if (auto cst = v.getDefiningOp()) { + if (auto ia = dyn_cast(cst.getValue())) { + out = ia.getValue().getSExtValue(); + return true; + } + } + return false; + } + + LogicalResult matchAndRewrite(pto::BindTileOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); + auto configAttr = op.getConfigAttr(); + auto viewSemantics = op->getAttrOfType("pto.view_semantics"); + + auto peelAllCasts = [](Value v) { + while (auto castOp = v.getDefiningOp()) + v = castOp.getOperand(0); + if (auto castOp = v.getDefiningOp()) + v = castOp.getOperand(); + return v; + }; + auto isTileLike = [](Value v) -> bool { + auto ot = dyn_cast(v.getType()); + if (!ot) + return false; + StringRef s = ot.getValue(); + return s.contains("Tile<") || s.contains("ConvTile<"); + }; + + auto buildTileValue = [&]() -> FailureOr { + auto resMrTy = dyn_cast(op.getType()); + if (!resMrTy) + return failure(); + + const char *roleTok = "TileType::Vec"; + if (auto asAttr = + dyn_cast_or_null(resMrTy.getMemorySpace())) { + switch (asAttr.getAddressSpace()) { + case pto::AddressSpace::VEC: + roleTok = "TileType::Vec"; + break; + case pto::AddressSpace::MAT: + roleTok = "TileType::Mat"; + break; + case pto::AddressSpace::LEFT: + roleTok = "TileType::Left"; + break; + case pto::AddressSpace::RIGHT: + roleTok = "TileType::Right"; + break; + case pto::AddressSpace::ACC: + roleTok = "TileType::Acc"; + break; + case pto::AddressSpace::BIAS: + roleTok = "TileType::Bias"; + break; + case pto::AddressSpace::SCALING: + roleTok = "TileType::Scaling"; + break; + case pto::AddressSpace::GM: + case 
pto::AddressSpace::Zero: + roleTok = "TileType::Vec"; + break; + } + } + + Type elemTy = resMrTy.getElementType(); + Type emitElemTy = getTypeConverter()->convertType(elemTy); + if (!emitElemTy) + return failure(); + auto emitElemOpaque = dyn_cast(emitElemTy); + if (!emitElemOpaque) + return failure(); + std::string elemTypeStr = emitElemOpaque.getValue().str(); + + if (resMrTy.getRank() < 2) + return failure(); + int64_t rows = resMrTy.getDimSize(0); + int64_t cols = resMrTy.getDimSize(1); + if (rows == ShapedType::kDynamic || cols == ShapedType::kDynamic) + return failure(); + + std::string blTok = "BLayout::RowMajor"; + if (auto blAttr = dyn_cast(configAttr.getBLayout())) { + if (static_cast(blAttr.getValue()) == 1) + blTok = "BLayout::ColMajor"; + } + + std::string slTok = "SLayout::NoneBox"; + if (auto slAttr = dyn_cast(configAttr.getSLayout())) { + int32_t slVal = static_cast(slAttr.getValue()); + slTok = (slVal == 1) ? "SLayout::RowMajor" + : (slVal == 2) ? "SLayout::ColMajor" + : "SLayout::NoneBox"; + } + + int32_t fractal = 512; + if (auto frAttr = dyn_cast(configAttr.getSFractalSize())) + fractal = frAttr.getInt(); + + std::string padTok = "PadValue::Null"; + if (auto padAttr = dyn_cast(configAttr.getPad())) { + switch (static_cast(padAttr.getValue())) { + case 1: + padTok = "PadValue::Zero"; + break; + case 2: + padTok = "PadValue::Max"; + break; + case 3: + padTok = "PadValue::Min"; + break; + default: + padTok = "PadValue::Null"; + break; + } + } + + std::string vrowTok, vcolTok; + bool useConstructor = false; + bool rowIsDynamic = false; + bool colIsDynamic = false; + SmallVector constructorArgs; + + Value vRow = op.getValidRow(); + Value vCol = op.getValidCol(); + Value vRowEmitC = adaptor.getValidRow(); + Value vColEmitC = adaptor.getValidCol(); + int64_t cRow = 0, cCol = 0; + + if (vRow && getIndexConst(vRow, cRow)) { + vrowTok = std::to_string(cRow); + } else if (vRow) { + vrowTok = "-1"; + rowIsDynamic = true; + useConstructor = true; + } else { 
+ vrowTok = std::to_string(rows); + } + + if (vCol && getIndexConst(vCol, cCol)) { + vcolTok = std::to_string(cCol); + } else if (vCol) { + vcolTok = "-1"; + colIsDynamic = true; + useConstructor = true; + } else { + vcolTok = std::to_string(cols); + } + + if (useConstructor) { + if (rowIsDynamic && vRowEmitC) + constructorArgs.push_back(vRowEmitC); + if (colIsDynamic && vColEmitC) + constructorArgs.push_back(vColEmitC); + } + + std::string tileTypeStr = std::string("Tile<") + roleTok + ", " + + elemTypeStr + ", " + std::to_string(rows) + + ", " + std::to_string(cols) + ", " + blTok + + ", " + vrowTok + ", " + vcolTok + ", " + slTok + + ", " + std::to_string(fractal) + ", " + padTok + + ">"; + + auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); + if (useConstructor) { + return rewriter + .create(loc, tileType, tileTypeStr, ArrayAttr{}, + ArrayAttr{}, ValueRange(constructorArgs)) + .getResult(0); + } + + return rewriter + .create(loc, tileType, emitc::OpaqueAttr::get(ctx, "")) + .getResult(); + }; + + auto emitElemTypeToString = [&](Type elemTy) -> std::string { + if (elemTy.isF16()) + return "half"; + if (elemTy.isBF16()) + return "bfloat16_t"; + if (elemTy.isF32()) + return "float"; + if (elemTy.isF64()) + return "double"; + if (elemTy.isInteger(8)) { + if (elemTy.isSignlessInteger(8) || elemTy.isSignedInteger(8)) + return "int8_t"; + return "uint8_t"; + } + if (elemTy.isInteger(16)) { + if (elemTy.isSignlessInteger(16) || elemTy.isSignedInteger(16)) + return "int16_t"; + return "uint16_t"; + } + if (elemTy.isInteger(32)) { + if (elemTy.isSignlessInteger(32) || elemTy.isSignedInteger(32)) + return "int32_t"; + return "uint32_t"; + } + if (elemTy.isInteger(64)) { + return cast(elemTy).isUnsigned() ? 
"uint64_t" : "int64_t"; + } + return "float"; + }; + + auto buildIntegralAddress = [&](Value sourceValue) -> FailureOr { + auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); + auto rcU64 = + rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "uint64_t")}); + + Value rawPtr = sourceValue; + if (auto ot = dyn_cast(sourceValue.getType())) { + StringRef tyStr = ot.getValue(); + if (tyStr.contains("Tile<") || tyStr.contains("ConvTile<")) { + auto srcMrTy = dyn_cast(op.getSource().getType()); + if (!srcMrTy) + return failure(); + std::string elemTok = emitElemTypeToString(srcMrTy.getElementType()); + pto::AddressSpace as = pto::AddressSpace::GM; + if (auto asAttr = + dyn_cast_or_null(srcMrTy.getMemorySpace())) + as = asAttr.getAddressSpace(); + std::string rawPtrTok = + std::string(addrSpaceQualifier(as)) + " " + elemTok + "*"; + auto rawPtrTy = emitc::OpaqueType::get(ctx, rawPtrTok); + rawPtr = rewriter + .create( + loc, rawPtrTy, "PTOAS__TILE_DATA", ArrayAttr{}, + ArrayAttr{}, ValueRange{sourceValue}) + .getResult(0); + } + } + + if (isa(rawPtr.getType()) || + (isa(rawPtr.getType()) && + cast(rawPtr.getType()).getValue().ends_with("*"))) { + return rewriter + .create(loc, u64Ty, "reinterpret_cast", + ArrayAttr{}, rcU64, ValueRange{rawPtr}) + .getResult(0); + } + + if (rawPtr.getType() == u64Ty) + return rawPtr; + return rewriter.create(loc, u64Ty, rawPtr).getResult(); + }; + + Value tileCandidate = peelAllCasts(adaptor.getSource()); + if (viewSemantics && viewSemantics.getValue() == "bitcast" && + isTileLike(tileCandidate)) { + FailureOr dstTile = buildTileValue(); + if (failed(dstTile)) + return failure(); + FailureOr addr = buildIntegralAddress(tileCandidate); + if (failed(addr)) + return failure(); + + rewriter.create(loc, TypeRange{}, "TASSIGN", + ArrayAttr{}, ArrayAttr{}, + ValueRange{*dstTile, *addr}); + rewriter.replaceOp(op, *dstTile); + return success(); + } + + if (viewSemantics && viewSemantics.getValue() == "treshape" && + isTileLike(tileCandidate)) { + 
FailureOr dstTile = buildTileValue(); + if (failed(dstTile)) + return failure(); + + rewriter.create(loc, TypeRange{}, "TRESHAPE", + ArrayAttr{}, ArrayAttr{}, + ValueRange{*dstTile, tileCandidate}); + rewriter.replaceOp(op, *dstTile); + return success(); + } + + SmallVector physAddrs; + Value source = op.getSource(); + + while (auto castOp = source.getDefiningOp()) + source = castOp.getOperand(0); + + if (auto upstreamCast = source.getDefiningOp()) { + auto upstreamOperands = upstreamCast.getAddrs(); + physAddrs.append(upstreamOperands.begin(), upstreamOperands.end()); + } else { + physAddrs.push_back(adaptor.getSource()); + } + + Value vRow = op.getValidRow(); + Value vCol = op.getValidCol(); + + rewriter.replaceOpWithNewOp( + op, op.getType(), physAddrs, vRow ? vRow : Value(), + vCol ? vCol : Value(), configAttr); + + return success(); + } +}; + +// ============================================================================= +// Arith CmpI -> EmitC Cmp +// ============================================================================= +/* Conversion pattern for arith.cmpi. Signed and unsigned predicates both map onto the sign-agnostic emitc::CmpPredicate values (eq/ne/lt/le/gt/ge); for the unsigned ones the operands are first passed through castSignlessIntToUnsignedSameWidth (skipped when the bit width is 1). Fails when the predicate is not handled, the result type cannot be converted, or an unsigned compare has operands that are neither integer nor index. */ class ArithCmpIToEmitC : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(arith::CmpIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + // Convert arith.cmpi to emitc.cmp. + // Map the predicate: eq -> equal, slt -> less, etc. 
+ emitc::CmpPredicate emitcPred; + /* Remember whether the predicate is one of the four unsigned orderings; those need the operand cast further down. */ const bool isUnsignedPred = + op.getPredicate() == arith::CmpIPredicate::ult || + op.getPredicate() == arith::CmpIPredicate::ule || + op.getPredicate() == arith::CmpIPredicate::ugt || + op.getPredicate() == arith::CmpIPredicate::uge; + switch (op.getPredicate()) { + case arith::CmpIPredicate::eq: emitcPred = emitc::CmpPredicate::eq; break; + case arith::CmpIPredicate::ne: emitcPred = emitc::CmpPredicate::ne; break; + case arith::CmpIPredicate::slt: emitcPred = emitc::CmpPredicate::lt; break; + case arith::CmpIPredicate::sle: emitcPred = emitc::CmpPredicate::le; break; + case arith::CmpIPredicate::sgt: emitcPred = emitc::CmpPredicate::gt; break; + case arith::CmpIPredicate::sge: emitcPred = emitc::CmpPredicate::ge; break; + // Unsigned comparisons (ult, ule, etc.): same emitc predicate as the signed form; the operands are cast below. + case arith::CmpIPredicate::ult: emitcPred = emitc::CmpPredicate::lt; break; + case arith::CmpIPredicate::ule: emitcPred = emitc::CmpPredicate::le; break; + case arith::CmpIPredicate::ugt: emitcPred = emitc::CmpPredicate::gt; break; + case arith::CmpIPredicate::uge: emitcPred = emitc::CmpPredicate::ge; break; + default: return failure(); + } + + Type resTy = getTypeConverter()->convertType(op.getType()); + if (!resTy) + return failure(); + + Value lhs = adaptor.getLhs(); + Value rhs = adaptor.getRhs(); + if (isUnsignedPred) { + /* The width is taken from the ORIGINAL operand type (op.getLhs()), not from the converted adaptor value. */ Type opTy = op.getLhs().getType(); + auto intTy = dyn_cast(opTy); + const bool isIndex = isa(opTy); + if (!intTy && !isIndex) + return rewriter.notifyMatchFailure( + op, "expected scalar integer or index operands"); + + /* Index operands use kPTOIndexBitWidth as their width. */ const unsigned bitWidth = + intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); + if (bitWidth != 1) { + lhs = castSignlessIntToUnsignedSameWidth(rewriter, loc, lhs, bitWidth); + rhs = castSignlessIntToUnsignedSameWidth(rewriter, loc, rhs, bitWidth); + } + } + + rewriter.replaceOpWithNewOp( + op, + /*resultType=*/resTy, // i1 -> bool/i1 + emitcPred, + lhs, + rhs + ); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// Section Op Lowering +//===----------------------------------------------------------------------===// +/* Conversion pattern, templated on the section op type, that replaces a section op by its body wrapped in a preprocessor guard. In order it emits the text "#if defined(MACRO)", for one section kind two mask-reset statements, the body block inlined in front of the op, and the text "#endif // MACRO"; the section op is then erased. The ops used to emit the text are created through rewriter.create with a type that is not visible in this text. */ template +struct SectionToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + /* Returns the guard macro for SectionOpTy: "__DAV_CUBE__" or "__DAV_VEC__" for the two recognised section types, "UNKNOWN_MACRO" for anything else. */ std::string getMacroName() const { + if (std::is_same::value) + return "__DAV_CUBE__"; + if (std::is_same::value) + return "__DAV_VEC__"; + return "UNKNOWN_MACRO"; + } + + LogicalResult + matchAndRewrite(SectionOpTy op, typename SectionOpTy::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + + /* The leading newline keeps the directive at the start of a line in the generated text. */ std::string startMacro = "\n#if defined(" + getMacroName() + ")"; + rewriter.create(loc, startMacro); + + if constexpr (std::is_same_v) { + // Vector mask is a global HW state and may be modified by previous kernels + // (or earlier sections). 
Reset it to a well-defined state for deterministic + // execution of VEC ops. + rewriter.create(loc, "set_mask_norm();"); + rewriter.create(loc, "set_vector_mask(-1, -1);"); + } + + /* Only the front block of the body is inlined, and only when it is non-empty. */ Block &innerBlock = op.getBody().front(); + if (!innerBlock.empty()) { + rewriter.inlineBlockBefore(&innerBlock, op.getOperation(), ValueRange{}); + } + + std::string endMacro = "#endif // " + getMacroName() + "\n"; + rewriter.create(loc, endMacro); + + rewriter.eraseOp(op); + + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// SCF Control-Flow Pre-Lowering +// +// EmitC translation supports `emitc.for`/`emitc.if` plus CFG-style +// `cf.br`/`cf.cond_br`. 
Upstream SCFToEmitC patterns only cover `scf.for` and +// `scf.if`, so we pre-lower some SCF ops into those supported forms. +//===----------------------------------------------------------------------===// + +namespace { + +/* Returns true when the execute_region has exactly one block and that block's terminator passes the isa_and_nonnull check (the checked op type is not visible in this text; presumably scf.yield - TODO confirm). */ static bool isTriviallyInlineableExecuteRegion(scf::ExecuteRegionOp op) { + Region &r = op.getRegion(); + if (!r.hasOneBlock()) + return false; + Block &b = r.front(); + return isa_and_nonnull(b.getTerminator()); +} + +/* Walks func and reports whether any op selected by the isa filter below sits directly under a parent that carries the checked trait (trait name not visible in this text; the surrounding comments describe it as the single-block constraint). An execute_region under such a parent only counts when it is not trivially inlineable. The walk stops at the first hit. */ static bool needsWholeFunctionSCFToCF(func::FuncOp func) { + bool needs = false; + func.walk([&](Operation *op) { + if (!isa(op)) + return WalkResult::advance(); + Operation *parentOp = op->getParentOp(); + + // `scf.execute_region` can legally appear in single-block parents. Only + // require whole-function SCFToCF if we need to lower it into CFG blocks + // (multi-block region / non-trivial terminators). + if (auto exec = dyn_cast(op)) { + if (parentOp && parentOp->hasTrait() && + !isTriviallyInlineableExecuteRegion(exec)) { + needs = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + } + + if (parentOp && parentOp->hasTrait()) { + needs = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + return needs; +} + +// scf.execute_region is semantically just an inlined region producing results +// via scf.yield. Inline it to the parent block to avoid extra lowering needs. 
+/* NOTE(review): this pattern inlines only the region's front block and does not itself check hasOneBlock(); it declines when the region is empty or the front block's terminator is not the expected yield. */ struct SCFExecuteRegionInline + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(scf::ExecuteRegionOp op, + PatternRewriter &rewriter) const override { + if (op.getRegion().empty()) + return rewriter.notifyMatchFailure(op, "expected non-empty region"); + + Block &innerBlock = op.getRegion().front(); + auto yield = dyn_cast(innerBlock.getTerminator()); + if (!yield) + return rewriter.notifyMatchFailure(op, "expected scf.yield terminator"); + + // Move the body operations before the execute_region op. 
+ rewriter.inlineBlockBefore(&innerBlock, op.getOperation(), ValueRange{}); + + // Replace execute_region results with yielded values, then erase the yield. + rewriter.replaceOp(op, yield.getOperands()); + rewriter.eraseOp(yield); + return success(); + } +}; + +// Lower scf.execute_region into CFG blocks with cf.br/cf.cond_br by inlining the +// region blocks into the parent region and rewriting scf.yield to branch into a +// continuation block carrying results. +// +// Note: This requires the parent region to allow multiple blocks (e.g. the +// function body CFG region). For execute_region nested in single-block regions +// (scf.for/scf.if), run SCFToCF first to eliminate the single-block constraint. +// +// Declines (before touching the IR) when the op is trivially inlineable, when +// the parent op carries the checked trait, or when the region is empty. +struct SCFExecuteRegionToCF : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(scf::ExecuteRegionOp op, + PatternRewriter &rewriter) const override { + /* All three guards run before any IR is modified. */ if (isTriviallyInlineableExecuteRegion(op)) + return rewriter.notifyMatchFailure(op, "trivially inlineable"); + + Operation *parentOp = op->getParentOp(); + if (parentOp && parentOp->hasTrait()) { + return rewriter.notifyMatchFailure( + op, "cannot lower scf.execute_region inside a single-block parent region"); + } + + if (op.getRegion().empty()) + return rewriter.notifyMatchFailure(op, "expected non-empty region"); + + Location loc = op.getLoc(); + Block *curBlock = op->getBlock(); + Region *parentRegion = curBlock->getParent(); + + // Split the parent block so we can branch to a continuation block with phi + // arguments for the execute_region results. 
+ /* The split point is the op AFTER the execute_region, so the execute_region stays as the last op of curBlock. */ auto execIt = Block::iterator(op.getOperation()); + Block *continueBlock = rewriter.splitBlock(curBlock, std::next(execIt)); + + /* One continuation-block argument per result, in result order. */ SmallVector contArgs; + contArgs.reserve(op.getNumResults()); + for (Type t : op.getResultTypes()) + contArgs.push_back(continueBlock->addArgument(t, loc)); + + /* NOTE(review): uses are rewired with Value::replaceAllUsesWith directly, not through a rewriter call. */ for (auto it : llvm::enumerate(op.getResults())) + it.value().replaceAllUsesWith(contArgs[it.index()]); + + // Capture blocks before moving the region. + SmallVector movedBlocks; + movedBlocks.reserve(op.getRegion().getBlocks().size()); + for (Block &b : op.getRegion()) + movedBlocks.push_back(&b); + Block *entryBlock = &op.getRegion().front(); + + // Inline the execute_region blocks into the parent region right before the + // continuation block. + rewriter.inlineRegionBefore(op.getRegion(), *parentRegion, + continueBlock->getIterator()); + + // Replace all scf.yield terminators with a branch to the continuation. + // Blocks that end in any other terminator are left unchanged. + for (Block *b : movedBlocks) { + auto yield = dyn_cast(b->getTerminator()); + if (!yield) + continue; + rewriter.setInsertionPoint(yield); + rewriter.create(loc, continueBlock, yield.getOperands()); + rewriter.eraseOp(yield); + } + + // Replace execute_region itself with a branch to the inlined entry block. + rewriter.setInsertionPoint(op); + rewriter.create(loc, entryBlock, ValueRange{}); + rewriter.eraseOp(op); + return success(); + } +}; + +// Lower scf.index_switch into CFG blocks with cf.cond_br/cf.br so that we can +// avoid `scf.if` result materialization quirks (and avoid relying on cf.switch, +// which is not supported by EmitC C++ translation). 
+struct SCFIndexSwitchToCF : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + /* Helper: clones every non-terminator op of srcBlock to the end of destBlock, then appends a branch to continueBlock whose operands are the yield operands remapped through the clone mapping. Returns failure when srcBlock's terminator is not the expected yield; note that the clones have already been created at that point. */ static LogicalResult cloneYieldingBlockAndBranchTo( + PatternRewriter &rewriter, Location loc, Block &srcBlock, Block *destBlock, + Block *continueBlock) { + rewriter.setInsertionPointToEnd(destBlock); + + IRMapping mapping; + for (Operation &inner : srcBlock.without_terminator()) + rewriter.clone(inner, mapping); + + auto yield = dyn_cast(srcBlock.getTerminator()); + if (!yield) + return failure(); + + /* lookupOrDefault keeps values defined outside srcBlock as they are. */ SmallVector yieldOperands; + yieldOperands.reserve(yield.getNumOperands()); + for (Value v : yield.getOperands()) + yieldOperands.push_back(mapping.lookupOrDefault(v)); + + rewriter.create(loc, continueBlock, yieldOperands); + return success(); + } + + /* Resulting block order in the parent region: curBlock, check_0..check_{n-1}, default, case_0..case_{n-1}, continueBlock. */ LogicalResult matchAndRewrite(scf::IndexSwitchOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + Operation *parentOp = op->getParentOp(); + if (parentOp && parentOp->hasTrait()) { + return rewriter.notifyMatchFailure( + op, "cannot lower scf.index_switch inside a single-block parent region"); + } + + Block *curBlock = op->getBlock(); + Region *parentRegion = curBlock->getParent(); + + // Split the parent block so we can branch to a continuation block with phi + // arguments for the switch results. 
+ auto switchIt = Block::iterator(op.getOperation()); + Block *continueBlock = rewriter.splitBlock(curBlock, std::next(switchIt)); + + SmallVector contArgs; + contArgs.reserve(op.getNumResults()); + for (Type t : op.getResultTypes()) + contArgs.push_back(continueBlock->addArgument(t, loc)); + + for (auto it : llvm::enumerate(op.getResults())) + it.value().replaceAllUsesWith(contArgs[it.index()]); + + unsigned numCases = op.getCases().size(); + auto insertPt = continueBlock->getIterator(); + + SmallVector checkBlocks; + SmallVector caseBlocks; + checkBlocks.reserve(numCases); + caseBlocks.reserve(numCases); + + // Create check blocks for each case: check_i compares selector to case_i. 
+ for (unsigned i = 0; i < numCases; ++i) + checkBlocks.push_back(rewriter.createBlock(parentRegion, insertPt)); + + // Create one block for default and one block per case to execute the body. + Block *defaultBlock = rewriter.createBlock(parentRegion, insertPt); + for (unsigned i = 0; i < numCases; ++i) + caseBlocks.push_back(rewriter.createBlock(parentRegion, insertPt)); + + Value selector = op.getArg(); + auto cases = op.getCases(); + + // Fill check blocks with chained comparisons. + // A failed comparison falls through to the next check block; the last one + // falls through to the default block. + for (unsigned i = 0; i < numCases; ++i) { + rewriter.setInsertionPointToEnd(checkBlocks[i]); + Value caseVal = rewriter.create(loc, cases[i]); + Value cond = rewriter.create( + loc, arith::CmpIPredicate::eq, selector, caseVal); + Block *falseDest = (i + 1 < numCases) ? checkBlocks[i + 1] : defaultBlock; + rewriter.create(loc, cond, caseBlocks[i], ValueRange{}, + falseDest, ValueRange{}); + } + + // Fill case blocks and default block with cloned bodies + branch to cont. + // NOTE(review): the failure returns below happen after the parent block was + // split and new blocks were created, i.e. after the IR was already modified. + for (unsigned i = 0; i < numCases; ++i) { + if (failed(cloneYieldingBlockAndBranchTo( + rewriter, loc, op.getCaseBlock(i), caseBlocks[i], continueBlock))) + return rewriter.notifyMatchFailure(op, "expected scf.yield terminator"); + } + if (failed(cloneYieldingBlockAndBranchTo(rewriter, loc, op.getDefaultBlock(), + defaultBlock, continueBlock))) + return rewriter.notifyMatchFailure(op, "expected scf.yield terminator"); + + // Replace the original switch op with a branch into the check chain. + // With zero cases the branch goes straight to the default block. + Block *entryDest = numCases ? 
checkBlocks[0] : defaultBlock; + rewriter.setInsertionPointAfter(op); + rewriter.create(loc, entryDest, ValueRange{}); + rewriter.eraseOp(op); + return success(); + } +}; + +// Lower scf.while into CFG blocks with cf.br/cf.cond_br. +// +// Note: This requires the parent region to allow multiple blocks. In +// particular, scf.if/scf.for regions are single-block and cannot contain this +// lowering. 
+struct SCFWhileToCF : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + /* Produces: curBlock -> header (former "before" region) -> body (former "after" region) -> back to header, with the header's false edge leaving to the continuation block that carries the loop results as block arguments. */ LogicalResult matchAndRewrite(scf::WhileOp op, + PatternRewriter &rewriter) const override { + Operation *parentOp = op->getParentOp(); + if (parentOp && parentOp->hasTrait()) { + return rewriter.notifyMatchFailure( + op, "cannot lower scf.while inside a single-block parent region"); + } + + Block *curBlock = op->getBlock(); + + // Only support the common structured form where the while results are used + // in the same block after the op. + // This check runs before any IR is modified. + for (Value res : op.getResults()) { + for (auto &use : res.getUses()) { + if (use.getOwner()->getBlock() != curBlock) + return rewriter.notifyMatchFailure( + op, "unsupported: while results used outside the parent block"); + } + } + + auto loc = op.getLoc(); + auto whileIt = Block::iterator(op.getOperation()); + Block *afterWhileBlock = rewriter.splitBlock(curBlock, std::next(whileIt)); + + // Add block args to carry while results into the continuation block. + SmallVector exitArgs; + exitArgs.reserve(op.getNumResults()); + for (Type t : op.getResultTypes()) + exitArgs.push_back(afterWhileBlock->addArgument(t, loc)); + + for (auto it : llvm::enumerate(op.getResults())) + it.value().replaceAllUsesWith(exitArgs[it.index()]); + + // Create the CFG blocks before the continuation block. + Region *parentRegion = curBlock->getParent(); + auto insertPt = afterWhileBlock->getIterator(); + + // Header block arguments match the while init operands. 
+ SmallVector headerArgTypes; + for (Value v : op.getInits()) + headerArgTypes.push_back(v.getType()); + SmallVector headerArgLocs(headerArgTypes.size(), loc); + Block *headerBlock = + rewriter.createBlock(parentRegion, insertPt, headerArgTypes, + headerArgLocs); + + // Body block arguments match the "after" region arguments. 
+ Block &afterRegionBlock = op.getAfter().front(); + SmallVector bodyArgTypes(afterRegionBlock.getArgumentTypes().begin(), + afterRegionBlock.getArgumentTypes().end()); + SmallVector bodyArgLocs(bodyArgTypes.size(), loc); + insertPt = afterWhileBlock->getIterator(); + Block *bodyBlock = + rewriter.createBlock(parentRegion, insertPt, bodyArgTypes, bodyArgLocs); + + // Move the before/after region bodies into the new CFG blocks. + // The new blocks' own arguments replace the region blocks' arguments. + rewriter.mergeBlocks(&op.getBefore().front(), headerBlock, + headerBlock->getArguments()); + rewriter.mergeBlocks(&afterRegionBlock, bodyBlock, bodyBlock->getArguments()); + + // Replace scf.condition in the header with cf.cond_br. + // Both edges forward the same condition args: to the body when the + // condition holds, to the continuation block (as loop results) otherwise. + { + auto condOp = cast(headerBlock->getTerminator()); + rewriter.setInsertionPoint(condOp); + rewriter.create(loc, condOp.getCondition(), + /*trueDest=*/bodyBlock, + /*trueOperands=*/condOp.getArgs(), + /*falseDest=*/afterWhileBlock, + /*falseOperands=*/condOp.getArgs()); + rewriter.eraseOp(condOp); + } + + // Replace scf.yield in the body with cf.br back to the header. + { + auto yieldOp = cast(bodyBlock->getTerminator()); + rewriter.setInsertionPoint(yieldOp); + rewriter.create(loc, headerBlock, yieldOp.getOperands()); + rewriter.eraseOp(yieldOp); + } + + // Replace scf.while itself with a branch to the header. + rewriter.setInsertionPoint(op); + rewriter.create(loc, headerBlock, op.getInits()); + rewriter.eraseOp(op); + return success(); + } +}; + +// Lower cf.switch into chained comparisons and cf.cond_br/cf.br. +// +// EmitC C++ translation currently supports cf.br/cf.cond_br, but not cf.switch. 
+struct CFSwitchToCondBr : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + /* Builds one check block per case, placed right after the switch's block. check_i compares the flag with case value i and jumps to that case's original destination (with its original operands) on equality, otherwise to the next check block, or to the default destination after the last one. A switch with no cases becomes a plain branch to the default destination. */ LogicalResult matchAndRewrite(cf::SwitchOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + Operation *parentOp = op->getParentOp(); + if (parentOp && parentOp->hasTrait()) { + return rewriter.notifyMatchFailure( + op, "cannot lower cf.switch inside a single-block parent region"); + } + + Block *curBlock = op->getBlock(); + Region *parentRegion = curBlock->getParent(); + + Value flag = op.getFlag(); + auto flagTy = dyn_cast(flag.getType()); + if (!flagTy) + return rewriter.notifyMatchFailure(op, "expected integer switch flag"); + + /* Destinations and operands are copied into local vectors before the op is replaced. */ SmallVector defaultOperands(op.getDefaultOperands().begin(), + op.getDefaultOperands().end()); + Block *defaultDest = op.getDefaultDestination(); + + SmallVector caseDests(op.getCaseDestinations().begin(), + op.getCaseDestinations().end()); + SmallVector> caseOperands; + caseOperands.reserve(caseDests.size()); + for (auto range : op.getCaseOperands()) + caseOperands.emplace_back(range.begin(), range.end()); + + if (caseDests.empty()) { + rewriter.replaceOpWithNewOp(op, defaultDest, defaultOperands); + return success(); + } + + std::optional caseValuesAttr = op.getCaseValues(); + if (!caseValuesAttr) + return rewriter.notifyMatchFailure(op, "missing case_values"); + + SmallVector caseValues; + for (APInt v : caseValuesAttr->getValues()) + caseValues.push_back(v); + + /* Consistency checks; both run before any block is created. 
*/ if (caseValues.size() != caseDests.size()) + return rewriter.notifyMatchFailure(op, "case_values/destinations mismatch"); + if (caseOperands.size() != caseDests.size()) + return rewriter.notifyMatchFailure(op, "case_operands/destinations mismatch"); + + // Insert check blocks right after the current block. 
+ auto insertPt = std::next(curBlock->getIterator()); + SmallVector checkBlocks; + checkBlocks.reserve(caseDests.size()); + for (size_t i = 0; i < caseDests.size(); ++i) + checkBlocks.push_back(rewriter.createBlock(parentRegion, insertPt)); + + // Fill each check block with: + // if (flag == caseVal_i) goto caseDest_i else goto nextCheck/default. + for (size_t i = 0; i < caseDests.size(); ++i) { + rewriter.setInsertionPointToEnd(checkBlocks[i]); + + APInt caseVal = caseValues[i]; + /* NOTE(review): this failure is returned after the check blocks above were already created. */ if (caseVal.getBitWidth() != flagTy.getWidth()) { + return rewriter.notifyMatchFailure( + op, "case value bitwidth doesn't match flag type"); + } + + Value caseConst = rewriter.create( + loc, flagTy, rewriter.getIntegerAttr(flagTy, caseVal)); + Value cond = rewriter.create( + loc, arith::CmpIPredicate::eq, flag, caseConst); + + /* Only the last check block forwards the default operands; intermediate fall-through edges carry none. */ Block *falseDest = + (i + 1 < checkBlocks.size()) ? checkBlocks[i + 1] : defaultDest; + ValueRange falseOperands = + (i + 1 < checkBlocks.size()) ? ValueRange{} : ValueRange(defaultOperands); + + rewriter.create(loc, cond, + /*trueDest=*/caseDests[i], + /*trueOperands=*/caseOperands[i], + /*falseDest=*/falseDest, + /*falseOperands=*/falseOperands); + } + + // Replace the switch terminator with a branch into the first check block. 
+ rewriter.setInsertionPoint(op); + rewriter.replaceOpWithNewOp(op, checkBlocks.front(), + ValueRange{}); + return success(); + } +}; + +} // namespace + +static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, + TypeConverter &typeConverter, + MLIRContext *ctx, + DataFlowSolver &solver, + PTOArch targetArch) { + (void)solver; + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + 
patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add>( + typeConverter, ctx); + patterns.add>( + typeConverter, ctx); + patterns.add>( + typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add>(typeConverter, + ctx); + 
patterns.add>(typeConverter, + ctx); + patterns.add>(typeConverter, + ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); + patterns.add(typeConverter, ctx, targetArch); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add< + PTOTMatmulBiasToTMATMUL_BIAS, + PTOTMatmulMXToTMATMUL_MX, + PTOTMatmulMXAccToTMATMUL_MX_ACC, + PTOTMatmulMXBiasToTMATMUL_MX_BIAS, + 
PTOTMatmulBiasToTMATMUL_BIAS, + PTOTMatmulMXToTMATMUL_MX, + PTOTMatmulMXAccToTMATMUL_MX_ACC, + PTOTMatmulMXBiasToTMATMUL_MX_BIAS, + PTOTGemvBiasToTGEMV_BIAS, + PTOBarrierToEmitC + >(typeConverter, ctx); + + patterns.add(typeConverter, ctx); + + populateSCFToEmitCConversionPatterns(patterns); + // Keep CFG-style branches type-consistent when block argument types are + // converted (e.g. after lowering scf.while to cf.br/cf.cond_br). + populateBranchOpInterfaceTypeConversionPattern(patterns, typeConverter); + populateCallOpTypeConversionPattern(patterns, typeConverter); +} + +//===----------------------------------------------------------------------===// +// Pass +//===----------------------------------------------------------------------===// + +namespace { +struct EmitPTOManualPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(EmitPTOManualPass) + + PTOArch targetArch; + + EmitPTOManualPass() : targetArch(PTOArch::A3) {} + + explicit EmitPTOManualPass(PTOArch arch) : targetArch(arch) {} + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + llvm::errs() << "DEBUG: Start PTOToEmitC Pass\n"; + MLIRContext *ctx = &getContext(); + ModuleOp mop = getOperation(); + + // 1. 插入头文件 + auto loc = mop->getLoc(); + OpBuilder builder(ctx); + builder.setInsertionPointToStart(mop.getBody()); + builder.create( + loc, builder.getStringAttr("pto/pto-inst.hpp"), /*isAngled=*/nullptr); + builder.create( + loc, builder.getStringAttr("using namespace pto;")); + + // Only inject the bitcast helper when we actually lower ops that need it + // (e.g. arith.bitcast or arith.maximumf/minimumf tie-breaking on zeros). 
+ bool needsBitcastHelper = false; + mop.walk([&](Operation *op) { + if (isa(op)) { + needsBitcastHelper = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (needsBitcastHelper) { + builder.create( + loc, builder.getStringAttr(R"cpp( + template + static inline To ptoas_bitcast(From from) { + static_assert(sizeof(To) == sizeof(From), "ptoas_bitcast: size mismatch"); + To to; + __builtin_memcpy(&to, &from, sizeof(To)); + return to; + } + )cpp")); + } + + // 1.5 Pre-lower SCF constructs not handled by SCFToEmitC. + { + // scf.while / scf.index_switch are lowered via CFG blocks. This is not + // possible inside ops that require single-block regions (e.g. scf.for / + // scf.if). If we see such nesting, lower the entire function to the + // ControlFlow dialect first. + bool needsAnySCFToCF = false; + for (auto func : mop.getOps()) { + if (needsWholeFunctionSCFToCF(func)) { + needsAnySCFToCF = true; + break; + } + } + if (needsAnySCFToCF) { + RewritePatternSet scfToCfPatterns(ctx); + populateSCFToControlFlowConversionPatterns(scfToCfPatterns); + FrozenRewritePatternSet frozenSCFToCF(std::move(scfToCfPatterns)); + + ConversionTarget scfToCfTarget(*ctx); + // Only eliminate the single-block SCF constructs; we'll pre-lower + // scf.while/index_switch/execute_region ourselves afterwards. 
+ scfToCfTarget.addIllegalOp(); + scfToCfTarget.markUnknownOpDynamicallyLegal( + [](Operation *) { return true; }); + + for (auto func : mop.getOps()) { + if (!needsWholeFunctionSCFToCF(func)) + continue; + if (failed(applyPartialConversion(func, scfToCfTarget, + frozenSCFToCF))) { + func.emitError() + << "failed to lower nested SCF to ControlFlow (SCFToCF)"; + return signalPassFailure(); + } + } + } + + RewritePatternSet scfLoweringPatterns(ctx); + scfLoweringPatterns.add(ctx); + (void)applyPatternsAndFoldGreedily(mop, std::move(scfLoweringPatterns)); + + bool hasUnsupportedSCF = false; + mop.walk([&](Operation *op) { + if (isa(op)) { + hasUnsupportedSCF = true; + op->emitError() << "Unsupported SCF op remained after pre-lowering"; + return WalkResult::interrupt(); + } + if (isa(op)) { + hasUnsupportedSCF = true; + op->emitError() + << "Unsupported CF op remained after pre-lowering: cf.switch"; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (hasUnsupportedSCF) + return signalPassFailure(); + } + + // 2. 配置转换目标 + PTOToEmitCTypeConverter typeConverter(ctx); + ConversionTarget target(*ctx); + + target.addIllegalDialect(); + target.addIllegalDialect(); + target.addIllegalDialect(); + target.addIllegalDialect(); + + // If we introduced CFG branches (e.g. from scf.while), make sure they are + // updated to use legalized operand types. 
+ target.addDynamicallyLegalOp( + [&](Operation *op) { + return isLegalForBranchOpInterfaceTypeConversionPattern(op, + typeConverter); + }); + + // [关键] 允许 Cast 存在,最后统一清理 + target.addLegalOp(); + + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + + target.addLegalDialect(); + target.addLegalOp(); + + auto solver = std::make_unique(); + solver->load(); + solver->load(); + if (failed(solver->initializeAndRun(getOperation()))) + return signalPassFailure(); + + RewritePatternSet patterns(ctx); + populatePTOToEmitCPatterns(patterns, typeConverter, ctx, *solver, targetArch); + populateCallOpTypeConversionPattern(patterns, typeConverter); + + // 3. 执行转换 + if (failed(applyPartialConversion(mop, target, std::move(patterns)))) { + llvm::errs() << "Conversion FAILED! Rolling back executed.\n"; + return signalPassFailure(); + } + + // ========================================================================= + // 4. [终极清理] + // 顺序至关重要: + // Step A: 先移除所有 Cast,让 Loop 的 Operand 类型变成底层类型 (如 int32) + // Step B: 再根据新的 Operand 类型,修复 Loop IV 的类型 + // ========================================================================= + + // --- Step A: 清理 UnrealizedConversionCastOp --- + // Prefer dropping redundant/unused casts; otherwise lower to emitc.cast + // so the C++ emitter can print it. 
+ llvm::SmallVector castsToErase; + bool castCleanupFailed = false; + mop.walk([&](UnrealizedConversionCastOp cast) { + if (castCleanupFailed) + return; + + if (cast->getNumOperands() != 1 || cast->getNumResults() != 1) { + cast.emitError() << "unsupported unrealized_conversion_cast shape"; + castCleanupFailed = true; + return; + } + + Value input = cast.getOperand(0); + Value output = cast.getResult(0); + Type inTy = input.getType(); + Type outTy = output.getType(); + + if (output.use_empty()) { + castsToErase.push_back(cast); + return; + } + + if (inTy == outTy) { + output.replaceAllUsesWith(input); + castsToErase.push_back(cast); + return; + } + + if (emitc::isSupportedEmitCType(inTy) && emitc::isSupportedEmitCType(outTy)) { + OpBuilder builder(cast); + auto c = builder.create(cast.getLoc(), outTy, input); + output.replaceAllUsesWith(c.getResult()); + castsToErase.push_back(cast); + return; + } + + cast.emitError() << "cannot lower unrealized_conversion_cast(" << inTy + << " -> " << outTy << ") to emitc.cast"; + castCleanupFailed = true; + }); + + for (auto cast : castsToErase) + cast.erase(); + + if (castCleanupFailed) + return signalPassFailure(); + + // --- Step A2: Sink casts of emitc.variable "reads" to their use sites --- + // + // SCFToEmitC lowers scf.if/scf.for results via mutable `emitc.variable` and + // `emitc.assign`. During type conversion, casts from the variable handle to + // the converted type may be materialized right after the variable + // declaration, effectively snapshotting the value *before* assignments. That + // produces wrong C++ (use-before-init / stale reads). + // + // Fix by re-materializing the cast at each use site so it reads the variable + // at the point of use. 
+ { + SmallVector castOpsToSink; + mop.walk([&](emitc::CastOp castOp) { + if (castOp.getSource().getDefiningOp()) + castOpsToSink.push_back(castOp); + }); + + for (emitc::CastOp castOp : castOpsToSink) { + Value src = castOp.getSource(); + Type dstTy = castOp.getResult().getType(); + Value oldRes = castOp.getResult(); + + // Replace each use with a freshly inserted cast right before the user. + for (OpOperand &use : llvm::make_early_inc_range(oldRes.getUses())) { + Operation *user = use.getOwner(); + OpBuilder b(user); + b.setInsertionPoint(user); + auto newCast = b.create(castOp.getLoc(), dstTy, src); + use.set(newCast.getResult()); + } + + castOp.erase(); + } + } + + // --- Step B: 修复 Loop 归纳变量 (IV) --- + // 此时 emitc.for 的 operand 已经是 int32 了,我们检查 IV 是否匹配,不匹配则修正 + mop.walk([&](emitc::ForOp forOp) { + Type boundTy = forOp.getLowerBound().getType(); + BlockArgument iv = forOp.getBody()->getArgument(0); + + if (iv.getType() != boundTy) { + iv.setType(boundTy); // 强制将 IV 类型 (index) 修改为与边界一致 (int32) + } + }); + + // --- Step C: 消除冗余 Tile 变量 (Dead Code Elimination) [新增] --- + // 逻辑:如果一个 emitc.variable 没有被读取(use_empty), + // 那么它自己,以及给它赋值的 TASSIGN 都可以删除。 + // 注意:TASSIGN(v15, v9) 会把 v15 作为 Operand 0 使用,所以 v15 不是严格的 use_empty。 + // 我们需要检查:v15 是否除了 TASSIGN 之外没有其他 User。 + + llvm::SmallVector deadVars; + mop.walk([&](emitc::VariableOp varOp) { + // 检查该变量的所有 User + bool isRead = false; + for (Operation* user : varOp.getResult().getUsers()) { + // 如果 User 是 TASSIGN 且变量是第0个参数(dst),不算"读取" + if (auto call = dyn_cast(user)) { + if (call.getCallee() == "TASSIGN" && call.getOperand(0) == varOp.getResult()) { + continue; // 这是一个赋值操作,不算有效使用 + } + } + // 如果还有其他用途(如 TLOAD, TMOV, TMATMUL),则该变量有用 + isRead = true; + break; + } + + if (!isRead) { + deadVars.push_back(varOp); + } + }); + + for (auto varOp : deadVars) { + // 1. 
先删除所有使用该变量的 TASSIGN + llvm::SmallVector usersToErase; + for (Operation* user : varOp.getResult().getUsers()) { + // 我们上面已经确认过,剩下的 user 只能是 TASSIGN + usersToErase.push_back(user); + } + for (auto u : usersToErase) u->erase(); + + // 2. 删除变量定义本身 + varOp.erase(); + } + + // ========================================================================= + } + }; +} // namespace + +std::unique_ptr mlir::pto::createEmitPTOManualPass() { + return std::make_unique(); +} + +std::unique_ptr mlir::pto::createEmitPTOManualPass(PTOArch arch) { + return std::make_unique(arch); +} diff --git a/.agent/skills/translate_cpp2py/references/ptoas_source/README.md b/.agent/skills/translate_cpp2py/references/ptoas_source/README.md new file mode 100644 index 00000000..e4690280 --- /dev/null +++ b/.agent/skills/translate_cpp2py/references/ptoas_source/README.md @@ -0,0 +1,4 @@ +Copy critical references +- https://github.com/huawei-csl/PTOAS/blob/20260309/include/PTO/IR/PTOOps.td +- https://github.com/huawei-csl/PTOAS/blob/20260309/python/pto/dialects/pto.py +- https://github.com/huawei-csl/PTOAS/blob/20260309/lib/PTO/Transforms/PTOToEmitC.cpp diff --git a/.agent/skills/translate_cpp2py/references/ptoas_source/pto.py b/.agent/skills/translate_cpp2py/references/ptoas_source/pto.py new file mode 100644 index 00000000..68e85bf5 --- /dev/null +++ b/.agent/skills/translate_cpp2py/references/ptoas_source/pto.py @@ -0,0 +1,280 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from . 
import _pto_ops_gen as _pto_ops_gen +from ._pto_ops_gen import * +from mlir import ir as _ods_ir + + +def _load_local_pto_ext(): + import importlib.util + from pathlib import Path + + lib_dir = Path(__file__).resolve().parent.parent / "_mlir_libs" + for suffix in ("*.so", "*.pyd", "*.dll", "*.dylib"): + for so_path in lib_dir.glob(f"_pto{suffix}"): + spec = importlib.util.spec_from_file_location( + "mlir._mlir_libs._pto", so_path + ) + if spec and spec.loader: + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + raise ImportError("cannot locate local _pto extension in _mlir_libs") + + +try: + _pto_mod = _load_local_pto_ext() +except Exception: + from .._mlir_libs import _pto as _pto_mod + +register_dialect = _pto_mod.register_dialect +PtrType = _pto_mod.PtrType +TensorViewType = _pto_mod.TensorViewType +PartitionTensorViewType = _pto_mod.PartitionTensorViewType +TileType = _pto_mod.TileType +TileBufType = _pto_mod.TileBufType +AddressSpace = _pto_mod.AddressSpace +AddressSpaceAttr = _pto_mod.AddressSpaceAttr +TileBufConfigAttr = _pto_mod.TileBufConfigAttr +BLayout = _pto_mod.BLayout +BLayoutAttr = _pto_mod.BLayoutAttr +SLayout = _pto_mod.SLayout +SLayoutAttr = _pto_mod.SLayoutAttr +PadValue = _pto_mod.PadValue +PadValueAttr = _pto_mod.PadValueAttr +RoundMode = _pto_mod.RoundMode +RoundModeAttr = _pto_mod.RoundModeAttr +CmpMode = _pto_mod.CmpMode +CmpModeAttr = _pto_mod.CmpModeAttr +PIPE = _pto_mod.PIPE +PipeAttr = _pto_mod.PipeAttr +Layout = _pto_mod.Layout +LayoutAttr = _pto_mod.LayoutAttr +SyncOpType = _pto_mod.SyncOpType +SyncOpTypeAttr = _pto_mod.SyncOpTypeAttr +EVENT = _pto_mod.EVENT +EventAttr = _pto_mod.EventAttr +MaskPattern = _pto_mod.MaskPattern +MaskPatternAttr = _pto_mod.MaskPatternAttr + +__all__ = [ + # Dialect utilities + "register_dialect", + # Types + "PtrType", + "TensorViewType", + "PartitionTensorViewType", + "TileType", + "TileBufType", + "AddressSpace", + "AddressSpaceAttr", + "BLayout", + "BLayoutAttr", 
+ "SLayout", + "SLayoutAttr", + "PadValue", + "PadValueAttr", + "RoundMode", + "RoundModeAttr", + "CmpMode", + "CmpModeAttr", + "PIPE", + "PipeAttr", + "Layout", + "LayoutAttr", + "SyncOpType", + "SyncOpTypeAttr", + "EVENT", + "EventAttr", + "MaskPattern", + "MaskPatternAttr", + "TileBufConfigAttr", + "TileConfig", + # High-level sync helpers + "record_event", + "wait_event", + "barrier", + # Scalar pointer helpers + "load_scalar", + "store_scalar" + # Aliases for SyncOpType enums (for terse calls) + , + "TLOAD", + "TSTORE_ACC", + "TSTORE_VEC", + "TMOV_M2L", + "TMOV_M2S", + "TMOV_M2B", + "TMOV_M2V", + "TMOV_V2M", + "TMATMUL", + "TVEC", + "TVECWAIT_EVENT" + # Aliases for EVENT enums + , + "EVENT_ID0", + "EVENT_ID1", + "EVENT_ID2", + "EVENT_ID3", + "EVENT_ID4", + "EVENT_ID5", + "EVENT_ID6", + "EVENT_ID7", +] + +# ----------------------------------------------------------------------------- +# Convenience wrappers for high-level sync to allow passing enums directly +# ----------------------------------------------------------------------------- + + +def _ensure_sync_attr(val, ctx): + # Accept SyncOpType enum, SyncOpTypeAttr, or string name ("TMATMUL"/"tmatmul"). 
+ if isinstance(val, SyncOpType): + return SyncOpTypeAttr.get(val, ctx) + if isinstance(val, str): + name = val.upper() + try: + enum_val = getattr(SyncOpType, name) + except AttributeError: + raise ValueError(f"Unknown SyncOpType name: {val}") + return SyncOpTypeAttr.get(enum_val, ctx) + return val + + +def _ensure_event_attr(val, ctx): + if isinstance(val, EVENT): + return EventAttr.get(val, ctx) + if isinstance(val, str): + name = val.upper() + try: + enum_val = getattr(EVENT, name) + except AttributeError: + raise ValueError(f"Unknown EVENT name: {val}") + return EventAttr.get(enum_val, ctx) + return val + + +def record_event(src_op, dst_op, event_id, *, loc=None, ip=None): + ctx = loc.context if loc else _ods_ir.Context.current + return _pto_ops_gen.record_event( + _ensure_sync_attr(src_op, ctx), + _ensure_sync_attr(dst_op, ctx), + _ensure_event_attr(event_id, ctx), + loc=loc, + ip=ip, + ) + + +def wait_event(src_op, dst_op, event_id, *, loc=None, ip=None): + ctx = loc.context if loc else _ods_ir.Context.current + return _pto_ops_gen.wait_event( + _ensure_sync_attr(src_op, ctx), + _ensure_sync_attr(dst_op, ctx), + _ensure_event_attr(event_id, ctx), + loc=loc, + ip=ip, + ) + + +def barrier(op, *, loc=None, ip=None): + ctx = loc.context if loc else _ods_ir.Context.current + # If user passes SyncOpType/Attr, route to barrier_sync (maps to PIPE) + if isinstance(op, (SyncOpType, SyncOpTypeAttr, str)): + op_attr = _ensure_sync_attr(op, ctx) + return _pto_ops_gen.barrier_sync(op_attr, loc=loc, ip=ip) + # Otherwise fall back to low-level barrier expecting PipeAttr + return _pto_ops_gen.barrier(op, loc=loc, ip=ip) + + +# ----------------------------------------------------------------------------- +# Scalar pointer helpers (manual wrappers until python ops are regenerated) +# ----------------------------------------------------------------------------- +def load_scalar(result_type, ptr, offset, *, loc=None, ip=None): + operands = [ + 
_pto_ops_gen._get_op_result_or_value(ptr), + _pto_ops_gen._get_op_result_or_value(offset), + ] + op = _ods_ir.Operation.create( + "pto.load_scalar", + results=[result_type], + operands=operands, + loc=loc, + ip=ip, + ) + return op.results[0] + + +def store_scalar(ptr, offset, value, *, loc=None, ip=None): + operands = [ + _pto_ops_gen._get_op_result_or_value(ptr), + _pto_ops_gen._get_op_result_or_value(offset), + _pto_ops_gen._get_op_result_or_value(value), + ] + return _ods_ir.Operation.create( + "pto.store_scalar", + operands=operands, + loc=loc, + ip=ip, + ) + + +# ----------------------------------------------------------------------------- +# Export enum aliases for terse calls: pto.record_event(TLOAD, TLOAD, EVENT_ID0) +# ----------------------------------------------------------------------------- +TLOAD = SyncOpType.TLOAD +TSTORE_ACC = SyncOpType.TSTORE_ACC +TSTORE_VEC = SyncOpType.TSTORE_VEC +TMOV_M2L = SyncOpType.TMOV_M2L +TMOV_M2S = SyncOpType.TMOV_M2S +TMOV_M2B = SyncOpType.TMOV_M2B +TMOV_M2V = SyncOpType.TMOV_M2V +TMOV_V2M = SyncOpType.TMOV_V2M +TMATMUL = SyncOpType.TMATMUL +TVEC = SyncOpType.TVEC +TVECWAIT_EVENT = SyncOpType.TVECWAIT_EVENT + +EVENT_ID0 = EVENT.EVENT_ID0 +EVENT_ID1 = EVENT.EVENT_ID1 +EVENT_ID2 = EVENT.EVENT_ID2 +EVENT_ID3 = EVENT.EVENT_ID3 +EVENT_ID4 = EVENT.EVENT_ID4 +EVENT_ID5 = EVENT.EVENT_ID5 +EVENT_ID6 = EVENT.EVENT_ID6 +EVENT_ID7 = EVENT.EVENT_ID7 + + +class TileConfig: + alignedSize = 32 + fixedRowSize = 16 + fixedColSize = 16 + fixedMxRowSize = 16 + fixedMxColSize = 2 + fractalABSize = 512 + fractalCSize = 1024 + fractalMxSize = 32 + + +# ----------------------------------------------------------------------------- +# Op aliases without "Op" suffix (user-facing) +# ----------------------------------------------------------------------------- +def _install_op_aliases(): + added = [] + for name, obj in _pto_ops_gen.__dict__.items(): + if not isinstance(obj, type): + continue + if not issubclass(obj, _ods_ir.OpView): + continue + 
alias = None + if name.endswith("Op_DPS"): + alias = f"{name[:-6]}_DPS" + elif name.endswith("Op"): + alias = name[:-2] + if not alias or alias in globals(): + continue + globals()[alias] = obj + added.append(alias) + return added + + +__all__.extend(_install_op_aliases()) diff --git a/.agent/skills/translate_cpp2py/references/ptoisa_source/README.md b/.agent/skills/translate_cpp2py/references/ptoisa_source/README.md new file mode 100644 index 00000000..972afcec --- /dev/null +++ b/.agent/skills/translate_cpp2py/references/ptoisa_source/README.md @@ -0,0 +1,6 @@ +If only need one file, use: +- https://gitcode.com/cann/pto-isa/blob/8.5.0/include/pto/common/pto_instr.hpp + +Full references (not put to local dir yet): +- https://gitcode.com/cann/pto-isa/tree/8.5.0/include/pto/npu/a2a3 +- https://gitcode.com/cann/pto-isa/tree/8.5.0/include/pto/common diff --git a/.agent/skills/translate_cpp2py/references/ptoisa_source/pto-inst.hpp b/.agent/skills/translate_cpp2py/references/ptoisa_source/pto-inst.hpp new file mode 100644 index 00000000..ab9b961e --- /dev/null +++ b/.agent/skills/translate_cpp2py/references/ptoisa_source/pto-inst.hpp @@ -0,0 +1,830 @@ +/** +Copyright (c) 2025 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_INSTR_HPP +#define PTO_INSTR_HPP + +#include "pto/common/debug.h" +#include "pto/common/pto_instr_impl.hpp" + +#define MAP_INSTR_IMPL(API, ...) 
API##_IMPL(__VA_ARGS__) + +namespace pto { +template +PTO_INST void TASSIGN(T &obj, AddrType addr) { + MAP_INSTR_IMPL(TASSIGN, obj, addr); +} + +#ifndef __CPU_SIM + template + PTO_INST void TSYNC() { + TSYNC_IMPL(); + } +#endif + +template +PTO_INST void TSYNC(WaitEvents&... events) { + WaitAllEvents(events...); +} + +template +PTO_INST RecordEvent TADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TADD, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TABS(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TABS, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSUB, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMUL, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMIN, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMAX, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TEXPANDS(TileData &dst, typename TileData::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TEXPANDS, dst, scalar); + return {}; +} + +template +PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TLOAD, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TCMPS(TileDataDst &dst, TileDataSrc0 &src0, T src1, CmpMode cmpMode, WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCMPS, dst, src0, src1, cmpMode); + return {}; +} + +template +PTO_INST RecordEvent TCMP(TileDataDst &dst, TileDataSrc &src0, TileDataSrc &src1, CmpMode cmpMode, + WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCMP, dst, src0, src1, cmpMode); + return {}; +} + +template +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) +{ + TSYNC(events...); + TSTORE_IMPL(dst, src); + return {}; +} + +template +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) +{ + TSYNC(events...); + TSTORE_IMPL(dst, src); + return {}; +} + +template +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) +{ + TSYNC(events...); + TSTORE_IMPL(dst, src); + return {}; +} + +template +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, uint64_t preQuantScalar, WaitEvents &...events) +{ + TSYNC(events...); + TSTORE_IMPL(dst, src, preQuantScalar); + return {}; +} + +template +PTO_INST RecordEvent TSTORE_FP(GlobalData &dst, TileData &src, FpTileData &fp, WaitEvents &...events) +{ + TSYNC(events...); + TSTORE_IMPL(dst, src, fp); + return {}; +} + +template +PTO_INST RecordEvent TDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TDIV, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TREM(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TREM, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TSHL(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSHL, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TSHR(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSHR, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TAND(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TAND, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TOR(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TOR, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TXOR(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TXOR, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TLOG(TileData &dst, TileData &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TLOG, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TNEG(TileData &dst, TileData &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TNEG, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TNOT(TileData &dst, TileData &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TNOT, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TRECIP(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TDIVS, dst, 1, src); + return {}; +} + +template +PTO_INST RecordEvent TRELU(TileData &dst, TileData &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TRELU, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TPRELU(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TPRELU, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TPRINT(TileData &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TPRINT, src); + return {}; +} + +template +PTO_INST RecordEvent TADDC(TileData &dst, TileData &src0, TileData &src1, TileData &src2, WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TADDC, dst, src0, src1, src2); + return {}; +} + +template +PTO_INST RecordEvent TSUBC(TileData &dst, TileData &src0, TileData &src1, TileData &src2, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSUBC, dst, src0, src1, src2); + return {}; +} + +template +PTO_INST RecordEvent TMATMUL_MX( + TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, TileRightScale &bScaleMatrix, WaitEvents&... events) +{ + TSYNC(events...); + MAP_INSTR_IMPL(TMATMUL_MX, cMatrix, aMatrix, aScaleMatrix, bMatrix, bScaleMatrix); + return {}; +} + +template +PTO_INST RecordEvent TMATMUL_MX(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, + TileRight &bMatrix, TileRightScale &bScaleMatrix, WaitEvents&... events) +{ + TSYNC(events...); + MAP_INSTR_IMPL(TMATMUL_MX, cOutMatrix, cInMatrix, aMatrix, aScaleMatrix, bMatrix, bScaleMatrix); + return {}; +} + +template +PTO_INST RecordEvent TMATMUL_MX(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix, TileBias &biasData, WaitEvents&... events) +{ + TSYNC(events...); + MAP_INSTR_IMPL(TMATMUL_MX, cMatrix, aMatrix, aScaleMatrix, bMatrix, bScaleMatrix, biasData); + return {}; +} + +template +PTO_INST RecordEvent TMATMUL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMATMUL, cMatrix, aMatrix, bMatrix); + return {}; +} + +template +PTO_INST RecordEvent TMATMUL_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix, + WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMATMUL_ACC, cOutMatrix, cInMatrix, aMatrix, bMatrix); + return {}; +} + +template +PTO_INST RecordEvent TMATMUL_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasData, + WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMATMUL_BIAS, cMatrix, aMatrix, bMatrix, biasData); + return {}; +} + +template +PTO_INST RecordEvent +TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executedNumList, + TmpTileData &tmp, Src0TileData &src0, Src1TileData &src1, + Src2TileData &src2, Src3TileData &src3, WaitEvents&... events) { + TSYNC(events...); + TMRGSORT_IMPL( + dst, executedNumList, tmp, src0, src1, src2, src3); + return {}; +} + +template +PTO_INST RecordEvent TMRGSORT(DstTileData &dst, + MrgSortExecutedNumList &executedNumList, + TmpTileData &tmp, Src0TileData &src0, + Src1TileData &src1, Src2TileData &src2, WaitEvents&... events) { + TSYNC(events...); + TMRGSORT_IMPL(dst, executedNumList, tmp, src0, src1, + src2); + return {}; +} + +template +PTO_INST RecordEvent +TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executedNumList, + TmpTileData &tmp, Src0TileData &src0, Src1TileData &src1, WaitEvents&... events) { + TSYNC(events...); + TMRGSORT_IMPL(dst, executedNumList, tmp, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TMRGSORT(DstTileData &dst, SrcTileData &src, + uint32_t blockLen, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMRGSORT, dst, src, blockLen); + return {}; +} + +template +PTO_INST RecordEvent TEXTRACT(DstTileData &dst, SrcTileData &src, + uint16_t indexRow = 0, uint16_t indexCol = 0, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TEXTRACT, dst, src, indexRow, indexCol); + return {}; +} + +template +PTO_INST RecordEvent TFILLPAD(DstTileData &dst, SrcTileData &src, + WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TFILLPAD, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TFILLPAD_INPLACE(DstTileData &dst, SrcTileData &src, + WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TFILLPAD_INPLACE, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TFILLPAD_EXPAND(DstTileData &dst, SrcTileData &src, + WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TFILLPAD_EXPAND, dst, src); + return {}; +} + +// TSORT32不自动实现wait, 需手动TSYNC(events...) +template +PTO_INST RecordEvent TSORT32(DstTileData &dst, SrcTileData &src, IdxTileData &idx) { + MAP_INSTR_IMPL(TSORT32, dst, src, idx); + return {}; +} + +template +PTO_INST RecordEvent TSORT32(DstTileData &dst, SrcTileData &src, IdxTileData &idx, TmpTileData &tmp) { + MAP_INSTR_IMPL(TSORT32, dst, src, idx, tmp); + return {}; +} + +template +PTO_INST RecordEvent TGATHER(TileDataD &dst, TileDataS0 &src0, TileDataS1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TGATHER, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TCI(TileData &dst, T start, WaitEvents&... events) { + TSYNC(events...); + TCI_IMPL(dst, start); + return {}; +} + +template +PTO_INST RecordEvent TTRI(TileData &dst, WaitEvents&... events) { + TSYNC(events...); + TTRI_IMPL(dst); + return {}; +} + +template +PTO_INST RecordEvent TGATHER(DstTileData &dst, SrcTileData &src, WaitEvents&... events) { + TSYNC(events...); + TGATHER_IMPL(dst, src); + return {}; +} + +template +PTO_INST RecordEvent TPARTADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TPARTADD, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TPARTMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TPARTMAX, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TPARTMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TPARTMIN, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, RoundMode mode, WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCVT, dst, src, mode); + return {}; +} + +template +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMOV, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents&... events) { + TSYNC(events...); + TMOV_IMPL(dst, src); + return {}; +} + +template +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents&... events) { + TSYNC(events...); + TMOV_IMPL(dst, src); + return {}; +} + +template +PTO_INST RecordEvent TMOV_FP(DstTileData &dst, SrcTileData &src, FpTileData &fp, WaitEvents&... events) { + TSYNC(events...); + TMOV_IMPL(dst, src, fp); + return {}; +} + +template +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, FpTileData &fp, WaitEvents&... events) { + TSYNC(events...); + TMOV_IMPL(dst, src, fp); + return {}; +} + +template +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, WaitEvents&... events) { + TSYNC(events...); + TMOV_IMPL(dst, src, preQuantScalar); + return {}; +} + +template +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, WaitEvents&... events) { + TSYNC(events...); + TMOV_IMPL(dst, src, preQuantScalar); + return {}; +} + +template +PTO_INST RecordEvent TROWSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWSUM, dst, src, tmp); + return {}; +} + +template +PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCOLSUM, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, bool isBinary, + WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCOLSUM, dst, src, tmp, isBinary); + return {}; +} + +template +PTO_INST RecordEvent TCOLMAX(TileDataOut &dst, TileDataIn &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCOLMAX, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TROWMAX(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWMAX, dst, src, tmp); + return {}; +} + +template +PTO_INST RecordEvent TRESHAPE(TileDataOut &dst, TileDataIn &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TRESHAPE, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TROWMIN(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWMIN, dst, src, tmp); + return {}; +} + +template +PTO_INST RecordEvent TSELS(TileData &dst, TileData &src0, TileData &src1, uint8_t selectMode, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSELS, dst, src0, src1, selectMode); + return {}; +} + +template +PTO_INST RecordEvent TSEL(TileData &dst, MaskTile &selMask, TileData &src0, TileData &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSEL, dst, selMask, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TTRANS(TileDataDst &dst, TileDataSrc &src, TileDataTmp &tmp, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TTRANS, dst, src, tmp); + return {}; +} + +template +PTO_INST RecordEvent TMINS(TileDataDst &dst, TileDataSrc &src, typename TileDataSrc::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMINS, dst, src, scalar); + return {}; +} + +template +PTO_INST RecordEvent TROWEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWEXPAND, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TROWEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWEXPANDDIV, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TROWEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWEXPANDMUL, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TROWEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWEXPANDSUB, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TROWEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWEXPANDADD, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TROWEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWEXPANDMAX, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TROWEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TROWEXPANDMIN, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TRSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TRSQRT, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSQRT, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TEXP(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TEXP, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TGATHERB(TileDataDst &dst, TileDataSrc &src, TileDataOffset &offset, WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TGATHERB, dst, src, offset); + return {}; +} + +template +PTO_INST RecordEvent TADDS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TADDS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TSUBS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSUBS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TDIVS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TDIVS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TMULS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMULS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TDIVS(TileDataDst &dst, typename TileDataDst::DType scalar, TileDataSrc &src0, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TDIVS, dst, scalar, src0); + return {}; +} + +template +PTO_INST RecordEvent TREMS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TREMS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TMAXS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TMAXS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TANDS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TANDS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TORS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(TORS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TXORS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TXORS, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TLRELU(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TLRELU, dst, src0, scalar); + return {}; +} + +template +PTO_INST RecordEvent TADDSC(TileData &dst, TileData &src0, typename TileData::DType scalar, TileData &src1, + WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TADDSC, dst, src0, scalar, src1); + return {}; +} + +template +PTO_INST RecordEvent TSUBSC(TileData &dst, TileData &src0, typename TileData::DType scalar, TileData &src1, + WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSUBSC, dst, src0, scalar, src1); + return {}; +} + +template +PTO_INST RecordEvent TCOLMIN(TileDataOut &dst, TileDataIn &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCOLMIN, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TSCATTER(TileDataD &dst, TileDataS &src, TileDataI &indexes, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TSCATTER, dst, src, indexes); + return {}; +} + +template +PTO_INST RecordEvent TCOLEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCOLEXPAND, dst, src); + return {}; +} + +template +PTO_INST RecordEvent MGATHER(TileDst &dst, GlobalData &src, TileInd &indexes, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(MGATHER, dst, src, indexes); + return {}; +} + +template +PTO_INST RecordEvent MSCATTER(GlobalData &dst, TileSrc &src, TileInd &indexes, WaitEvents&... 
events) { + TSYNC(events...); + MAP_INSTR_IMPL(MSCATTER, dst, src, indexes); + return {}; +} + +template +PTO_INST RecordEvent TNEG(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TNEG, dst, src); + return {}; +} + +template +PTO_INST RecordEvent TCOLEXPANDDIV(TileDataDst &dst, TileDataDst &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCOLEXPANDDIV, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TCOLEXPANDMUL(TileDataDst &dst, TileDataDst &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCOLEXPANDMUL, dst, src0, src1); + return {}; +} + +template +PTO_INST RecordEvent TCOLEXPANDSUB(TileDataDst &dst, TileDataDst &src0, TileDataSrc1 &src1, WaitEvents&... events) { + TSYNC(events...); + MAP_INSTR_IMPL(TCOLEXPANDSUB, dst, src0, src1); + return {}; +} + +} // namespace pto +#endif diff --git a/.agent/skills/translate_cpp2py/scripts/collect_example_translate.py b/.agent/skills/translate_cpp2py/scripts/collect_example_translate.py new file mode 100644 index 00000000..a65e4a55 --- /dev/null +++ b/.agent/skills/translate_cpp2py/scripts/collect_example_translate.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +"""Collect python->pto->cpp translation examples into a reference directory. 
+ +Usage: + python collect_example_translate.py + python collect_example_translate.py --aot-dir /path/to/examples/aot --out-dir /tmp/example_translation +""" +import json +import argparse +import os +import shutil +import subprocess +from pathlib import Path + + +def unique_dir(base: Path) -> Path: + if not base.exists(): + return base + idx = 2 + while True: + candidate = Path(f"{base}_{idx}") + if not candidate.exists(): + return candidate + idx += 1 + + +REQUIRED_FIELDS = { + "example_dir", + "compile_script", + "py_source", + "py_command", + "ptoas_command", + "pto_file", + "cpp_file", +} +OPTIONAL_FIELDS = {"dependency"} + + +def load_example_list(config_path: Path) -> list[dict[str, object]]: + if not config_path.exists(): + raise FileNotFoundError(f"example config not found: {config_path}") + raw = json.loads(config_path.read_text(encoding="utf-8")) + if not isinstance(raw, list): + raise ValueError("example config root must be a list") + + examples: list[dict[str, object]] = [] + for idx, item in enumerate(raw): + if not isinstance(item, dict): + raise ValueError(f"entry #{idx} must be an object") + missing = REQUIRED_FIELDS - set(item.keys()) + if missing: + raise ValueError(f"entry #{idx} missing fields: {sorted(missing)}") + unknown = set(item.keys()) - REQUIRED_FIELDS - OPTIONAL_FIELDS + if unknown: + raise ValueError(f"entry #{idx} has unknown fields: {sorted(unknown)}") + + normalized: dict[str, str | list[str]] = {} + for key in REQUIRED_FIELDS: + value = item[key] + if not isinstance(value, str) or not value.strip(): + raise ValueError( + f"entry #{idx} field '{key}' must be a non-empty string" + ) + normalized[key] = value + + dependency = item.get("dependency", []) + if not isinstance(dependency, list): + raise ValueError( + f"entry #{idx} field 'dependency' must be a list of strings" + ) + dep_list: list[str] = [] + for dep_idx, dep in enumerate(dependency): + if not isinstance(dep, str) or not dep.strip(): + raise ValueError( + f"entry #{idx} 
dependency[{dep_idx}] must be a non-empty string" + ) + dep_list.append(dep) + normalized["dependency"] = dep_list + examples.append(normalized) + return examples + + +def parse_args() -> argparse.Namespace: + script_dir = Path(__file__).resolve().parent + default_repo_root = (script_dir / "../../../..").resolve() + parser = argparse.ArgumentParser( + description="Collect python->pto->cpp translation examples." + ) + parser.add_argument( + "--repo-root", + type=Path, + default=default_repo_root, + help="Repository root path (default: script_dir/../../../..).", + ) + parser.add_argument( + "--aot-dir", + type=Path, + default=default_repo_root / "examples/aot", + help="AOT examples directory (default: /examples/aot).", + ) + parser.add_argument( + "--out-dir", + type=Path, + default=(script_dir / "../references/example_translation").resolve(), + help="Output directory (default: script_dir/../references/example_translation).", + ) + parser.add_argument( + "--example-config", + type=Path, + default=script_dir / "example_list.json", + help="Example list json path (default: script_dir/example_list.json).", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + repo_root = args.repo_root.resolve() + aot_dir = args.aot_dir.resolve() + out_dir = args.out_dir.resolve() + example_config = args.example_config.resolve() + example_list = load_example_list(example_config) + + if not aot_dir.is_dir(): + raise FileNotFoundError(f"AOT examples directory not found: {aot_dir}") + + if out_dir.exists(): + shutil.rmtree(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + copied = 0 + failed = 0 + found = len(example_list) + results: list[dict[str, str]] = [] + + for idx, example in enumerate(example_list, start=1): + rel_dir = Path(str(example["example_dir"])) + example_dir = aot_dir / rel_dir + py_rel = Path(str(example["py_source"])) + py_source = example_dir / py_rel + py_cmd = str(example["py_command"]) + ptoas_cmd = str(example["ptoas_command"]) + 
example_name = f"{example['example_dir']}:{example['pto_file']}" + progress_name = py_rel.stem + dependencies = example.get("dependency", []) + print(f"[{idx}/{found}] collecting {rel_dir}/{progress_name}") + + if not py_source.exists(): + failed += 1 + results.append( + { + "name": example_name, + "status": "FAIL", + "reason": f"python source does not exist: {py_source}", + } + ) + continue + + dst = unique_dir(out_dir / rel_dir / Path(str(example["pto_file"])).stem) + dst.mkdir(parents=True, exist_ok=True) + + py_dst = dst / py_rel + py_dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(py_source, py_dst) + dep_copy_failed = False + for dep in dependencies: + dep_src = example_dir / dep + if not dep_src.exists(): + failed += 1 + results.append( + { + "name": example_name, + "status": "FAIL", + "reason": f"dependency does not exist: {dep_src}", + } + ) + dep_copy_failed = True + break + dep_dst = dst / dep + dep_dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(dep_src, dep_dst) + if dep_copy_failed: + continue + + run_env = os.environ.copy() + + py_run = subprocess.run( + py_cmd, + shell=True, + cwd=dst, + env=run_env, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + if py_run.returncode != 0: + failed += 1 + output = (py_run.stdout or "").strip() + results.append( + { + "name": example_name, + "status": "FAIL", + "reason": f"python command failed: {py_cmd}" + + (f" | {output}" if output else ""), + } + ) + continue + + ptoas_run = subprocess.run( + ptoas_cmd, + shell=True, + cwd=dst, + env=run_env, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + if ptoas_run.returncode != 0: + failed += 1 + output = (ptoas_run.stdout or "").strip() + results.append( + { + "name": example_name, + "status": "FAIL", + "reason": f"ptoas command failed: {ptoas_cmd}" + + (f" | {output}" if output else ""), + } + ) + continue + + pto_dst = dst / 
str(example["pto_file"]) + cpp_dst = dst / str(example["cpp_file"]) + if not (pto_dst.exists() and cpp_dst.exists()): + failed += 1 + results.append( + { + "name": example_name, + "status": "FAIL", + "reason": ( + "expected outputs missing after compile: " + f"{example['pto_file']}, {example['cpp_file']}" + ), + } + ) + continue + + commands = [ + "#!/usr/bin/env bash", + "set -e", + py_cmd, + ptoas_cmd, + "", + ] + (dst / "compile.sh").write_text("\n".join(commands), encoding="utf-8") + + copied += 1 + results.append( + { + "name": example_name, + "status": "OK", + "reason": f"collected to {dst.relative_to(repo_root)}", + } + ) + + print(f"Discovered {found} python->pto candidates under {aot_dir}") + for item in results: + print(f"[{item['status']}] {item['name']} - {item['reason']}") + print(f"Collected {copied} translation examples into {out_dir}") + print(f"Failed to collect {failed} examples") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.agent/skills/translate_cpp2py/scripts/example_list.json b/.agent/skills/translate_cpp2py/scripts/example_list.json new file mode 100644 index 00000000..8af6ab1a --- /dev/null +++ b/.agent/skills/translate_cpp2py/scripts/example_list.json @@ -0,0 +1,141 @@ +[ + { + "example_dir": "activations/geglu_dynamic_multicore", + "compile_script": "compile.sh", + "py_source": "geglu_builder.py", + "py_command": "python ./geglu_builder.py > ./geglu.pto", + "ptoas_command": "ptoas --enable-insert-sync ./geglu.pto -o ./geglu.cpp", + "pto_file": "geglu.pto", + "cpp_file": "geglu.cpp" + }, + { + "example_dir": "activations/relu_dynamic_multicore", + "compile_script": "compile.sh", + "py_source": "relu_builder.py", + "py_command": "python relu_builder.py > ./relu.pto", + "ptoas_command": "ptoas --enable-insert-sync ./relu.pto > generated_relu.cpp", + "pto_file": "relu.pto", + "cpp_file": "generated_relu.cpp" + }, + { + "example_dir": "batch_matmul/matmul_dynbatch_multicore", + "compile_script": 
"compile.sh", + "py_source": "matmul_builder.py", + "py_command": "python ./matmul_builder.py > matmul.pto", + "ptoas_command": "ptoas matmul.pto -o matmul.cpp", + "pto_file": "matmul.pto", + "cpp_file": "matmul.cpp" + }, + { + "example_dir": "batch_matmul/matmul_dynbatch_multicore_opt", + "compile_script": "compile.sh", + "py_source": "matmul_builder.py", + "py_command": "python ./matmul_builder.py > matmul.pto", + "ptoas_command": "ptoas matmul.pto -o matmul.cpp", + "pto_file": "matmul.pto", + "cpp_file": "matmul.cpp" + }, + { + "example_dir": "elementwise/add_dynamic_multicore", + "compile_script": "compile.sh", + "py_source": "add_builder.py", + "py_command": "python ./add_builder.py > ./add.pto", + "ptoas_command": "ptoas --enable-insert-sync ./add.pto -o ./add.cpp", + "pto_file": "add.pto", + "cpp_file": "add.cpp" + }, + { + "example_dir": "elementwise/add_dynamic_multicore", + "compile_script": "compile_double.sh", + "py_source": "add_double_builder.py", + "py_command": "python ./add_double_builder.py > ./add_double.pto", + "ptoas_command": "ptoas --enable-insert-sync ./add_double.pto -o ./add_double.cpp", + "pto_file": "add_double.pto", + "cpp_file": "add_double.cpp" + }, + { + "example_dir": "fast_hadamard", + "compile_script": "compile.sh", + "py_source": "hadamard_builder.py", + "py_command": "python ./hadamard_builder.py > ./hadamard_auto_sync.pto", + "ptoas_command": "ptoas --enable-insert-sync ./hadamard_auto_sync.pto -o ./hadamard_auto_sync.cpp", + "pto_file": "hadamard_auto_sync.pto", + "cpp_file": "hadamard_auto_sync.cpp" + }, + { + "example_dir": "fast_hadamard", + "compile_script": "compile.sh", + "py_source": "hadamard_builder.py", + "py_command": "python ./hadamard_builder.py --manual-sync > ./hadamard_manual_sync.pto", + "ptoas_command": "ptoas ./hadamard_manual_sync.pto -o ./hadamard_manual_sync.cpp", + "pto_file": "hadamard_manual_sync.pto", + "cpp_file": "hadamard_manual_sync.cpp" + }, + { + "example_dir": "matmul_optimization_guide", + 
"compile_script": "compile.sh", + "py_source": "step1_baseline.py", + "py_command": "python ./step1_baseline.py > ./step1_baseline.pto", + "ptoas_command": "ptoas --enable-insert-sync ./step1_baseline.pto -o ./step1_baseline.cpp", + "pto_file": "step1_baseline.pto", + "cpp_file": "step1_baseline.cpp", + "dependency": ["common_utils.py"] + }, + { + "example_dir": "matmul_optimization_guide", + "compile_script": "compile.sh", + "py_source": "step2_doublebuffer.py", + "py_command": "python ./step2_doublebuffer.py > ./step2_doublebuffer.pto", + "ptoas_command": "ptoas --enable-insert-sync ./step2_doublebuffer.pto -o ./step2_doublebuffer.cpp", + "pto_file": "step2_doublebuffer.pto", + "cpp_file": "step2_doublebuffer.cpp", + "dependency": ["common_utils.py"] + }, + { + "example_dir": "matmul_optimization_guide", + "compile_script": "compile.sh", + "py_source": "step3_swizzle.py", + "py_command": "python ./step3_swizzle.py > ./step3_swizzle.pto", + "ptoas_command": "ptoas --enable-insert-sync ./step3_swizzle.pto -o ./step3_swizzle.cpp", + "pto_file": "step3_swizzle.pto", + "cpp_file": "step3_swizzle.cpp", + "dependency": ["common_utils.py"] + }, + { + "example_dir": "matmul_optimization_guide", + "compile_script": "compile.sh", + "py_source": "step4_manual_pipelining.py", + "py_command": "python ./step4_manual_pipelining.py > ./step4_manual_pipelining.pto", + "ptoas_command": "ptoas ./step4_manual_pipelining.pto -o ./step4_manual_pipelining.cpp", + "pto_file": "step4_manual_pipelining.pto", + "cpp_file": "step4_manual_pipelining.cpp", + "dependency": ["common_utils.py"] + }, + { + "example_dir": "matmul_optimization_guide/experimental", + "compile_script": "compile.sh", + "py_source": "matmul_builder.py", + "py_command": "python ./matmul_builder.py > matmul.pto", + "ptoas_command": "ptoas matmul.pto -o matmul.cpp", + "pto_file": "matmul.pto", + "cpp_file": "matmul.cpp" + }, + { + "example_dir": "simple_static/add_static_multicore", + "compile_script": "compile.sh", + 
"py_source": "add_builder.py", + "py_command": "python ./add_builder.py > ./add.pto", + "ptoas_command": "ptoas --enable-insert-sync ./add.pto -o ./add.cpp", + "pto_file": "add.pto", + "cpp_file": "add.cpp" + }, + { + "example_dir": "simple_static/matmul_static_singlecore", + "compile_script": "compile.sh", + "py_source": "matmul_builder.py", + "py_command": "python ./matmul_builder.py > matmul.pto", + "ptoas_command": "ptoas --enable-insert-sync matmul.pto -o matmul.cpp", + "pto_file": "matmul.pto", + "cpp_file": "matmul.cpp" + } +] From 0b53c8d79235f7fca496f06b46a5b04a7b7891c7 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Mon, 16 Mar 2026 21:36:56 +0100 Subject: [PATCH 40/53] Python version of fast-inverse-trick (#90) * update PTOAS to https://github.com/huawei-csl/PTOAS/releases/tag/0.8 * temporarily commit all example translations (will remove before merging PR) * one-shot translation to python prompt: Translate @kernel_tri_inv_trick.cpp and test script @test_tri_inv_trick.py to @fast_inverse using skill @SKILL.md * force docker rebuild * temporary add generated IR and Cpp (remove before merge) * fix bisheng compile * use newer pto-isa version to avoid bisheng error on TMOV * do not early-exit on fail * use a known working version (padding + write to GM) * add TODO for TMOV * update PTO-ISA header * fix TMOV mismatch * fix tmov order * update generated cpp * fix TMOV type mismatch * update generated cpp * fix caller dtype mismatch * try fix manual sync * inline spill_acc_to_mat * remove translation references * try support smaller blocks like 64x64 * update generated cpp * try build all shapes * fix context * remove dynamic valid shape * note on auto-sync bug * remove artifacts * change error to warning, and use larger ftol * build_artifacts dir * ignore build_artifacts * remove unused MAX_MATRIX_SIZE = 128 * run pre-commit run --all-files --------- Co-authored-by: jiawei_zhuang --- docker/Dockerfile | 11 +- 
docker/README.md | 2 +- examples/aot/fast_inverse/.gitignore | 1 + examples/aot/fast_inverse/README.md | 7 + examples/aot/fast_inverse/caller.cpp | 92 +++++++ examples/aot/fast_inverse/compile.sh | 74 ++++++ examples/aot/fast_inverse/inverse_builder.py | 241 +++++++++++++++++++ examples/aot/fast_inverse/run_inverse.py | 184 ++++++++++++++ 8 files changed, 607 insertions(+), 5 deletions(-) create mode 100644 examples/aot/fast_inverse/.gitignore create mode 100644 examples/aot/fast_inverse/README.md create mode 100644 examples/aot/fast_inverse/caller.cpp create mode 100644 examples/aot/fast_inverse/compile.sh create mode 100644 examples/aot/fast_inverse/inverse_builder.py create mode 100644 examples/aot/fast_inverse/run_inverse.py diff --git a/docker/Dockerfile b/docker/Dockerfile index d93eef83..01689ca6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -17,11 +17,14 @@ RUN pip install --no-cache-dir \ # cache above layers unrelated to ptoas version change +# change this ununsed arg if need to force rebuild later lines +ARG CACHE_BURST=1 + # ARG ARCH=x86_64 ARG ARCH=aarch64 ARG RELEASE_REPO=huawei-csl/PTOAS -ARG RELEASE_TAG=20260228 -ARG WHEEL_NAME=ptoas-0.1.1-cp311-none-manylinux_2_34_${ARCH}.whl +ARG RELEASE_TAG=0.8 +ARG WHEEL_NAME=ptoas-${RELEASE_TAG}-cp311-none-manylinux_2_34_${ARCH}.whl ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz WORKDIR /installers/ @@ -52,8 +55,8 @@ RUN python ./abs.py > ./abs.pto RUN ptoas --enable-insert-sync ./abs.pto -o ./abs.cpp # certain operations need latest isa header, not CANN 8.5.0 default -# header on 2026/02/14 -ARG PTOISA_COMMIT=672ee54cb8905bb9f9abbe80ec26ed2054b7a0cc +# header on 2026/03/16 +ARG PTOISA_COMMIT=313817be696792a4e16a7ea5994ec98e34391613 WORKDIR /sources RUN git clone https://gitcode.com/cann/pto-isa.git \ && cd pto-isa && git checkout $PTOISA_COMMIT diff --git a/docker/README.md b/docker/README.md index ff1f404c..c992124a 100644 --- a/docker/README.md +++ b/docker/README.md @@ -3,7 +3,7 @@ Recommend using 
[Ascend Docker Runtime](https://gitcode.com/Ascend/mind-cluster/ Then, build and run docker image: ```bash -RELEASE_TAG=20260309 +RELEASE_TAG=0.8 sudo docker build \ --build-arg RELEASE_TAG=$RELEASE_TAG \ . -t pto_dsl:$RELEASE_TAG diff --git a/examples/aot/fast_inverse/.gitignore b/examples/aot/fast_inverse/.gitignore new file mode 100644 index 00000000..2672482f --- /dev/null +++ b/examples/aot/fast_inverse/.gitignore @@ -0,0 +1 @@ +build_artifacts diff --git a/examples/aot/fast_inverse/README.md b/examples/aot/fast_inverse/README.md new file mode 100644 index 00000000..fff824a9 --- /dev/null +++ b/examples/aot/fast_inverse/README.md @@ -0,0 +1,7 @@ +Usage: + +```bash +bash ./compile.sh # generate PTO/CPP and build both auto/manual sync libs +python ./run_inverse.py --manual-sync # test manual-sync lib +# python ./run_inverse.py # TODO: fix auto-sync bug in ptoas +``` diff --git a/examples/aot/fast_inverse/caller.cpp b/examples/aot/fast_inverse/caller.cpp new file mode 100644 index 00000000..fde1b237 --- /dev/null +++ b/examples/aot/fast_inverse/caller.cpp @@ -0,0 +1,92 @@ +#ifndef KERNEL_CPP_16 +#define KERNEL_CPP_16 "inverse_auto_sync_16.cpp" +#endif +#ifndef KERNEL_CPP_32 +#define KERNEL_CPP_32 "inverse_auto_sync_32.cpp" +#endif +#ifndef KERNEL_CPP_64 +#define KERNEL_CPP_64 "inverse_auto_sync_64.cpp" +#endif +#ifndef KERNEL_CPP_96 +#define KERNEL_CPP_96 "inverse_auto_sync_96.cpp" +#endif +#ifndef KERNEL_CPP_128 +#define KERNEL_CPP_128 "inverse_auto_sync_128.cpp" +#endif + +#ifndef KERNEL_FN_16 +#define KERNEL_FN_16 tri_inv_trick_fp16_16 +#endif +#ifndef KERNEL_FN_32 +#define KERNEL_FN_32 tri_inv_trick_fp16_32 +#endif +#ifndef KERNEL_FN_64 +#define KERNEL_FN_64 tri_inv_trick_fp16_64 +#endif +#ifndef KERNEL_FN_96 +#define KERNEL_FN_96 tri_inv_trick_fp16_96 +#endif +#ifndef KERNEL_FN_128 +#define KERNEL_FN_128 tri_inv_trick_fp16_128 +#endif + +#include KERNEL_CPP_16 +#include KERNEL_CPP_32 +#include KERNEL_CPP_64 +#include KERNEL_CPP_96 +#include KERNEL_CPP_128 + 
+extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *tensor_out, + uint8_t *tensor_in, + uint8_t *identity_in, + uint32_t matrix_size, + uint32_t max_block_size) +{ + switch (matrix_size) { + case 16: + KERNEL_FN_16<<>>( + reinterpret_cast(tensor_out), + reinterpret_cast(tensor_in), + reinterpret_cast(identity_in), + static_cast(matrix_size), + static_cast(max_block_size)); + break; + case 32: + KERNEL_FN_32<<>>( + reinterpret_cast(tensor_out), + reinterpret_cast(tensor_in), + reinterpret_cast(identity_in), + static_cast(matrix_size), + static_cast(max_block_size)); + break; + case 64: + KERNEL_FN_64<<>>( + reinterpret_cast(tensor_out), + reinterpret_cast(tensor_in), + reinterpret_cast(identity_in), + static_cast(matrix_size), + static_cast(max_block_size)); + break; + case 96: + KERNEL_FN_96<<>>( + reinterpret_cast(tensor_out), + reinterpret_cast(tensor_in), + reinterpret_cast(identity_in), + static_cast(matrix_size), + static_cast(max_block_size)); + break; + case 128: + KERNEL_FN_128<<>>( + reinterpret_cast(tensor_out), + reinterpret_cast(tensor_in), + reinterpret_cast(identity_in), + static_cast(matrix_size), + static_cast(max_block_size)); + break; + default: + break; + } +} diff --git a/examples/aot/fast_inverse/compile.sh b/examples/aot/fast_inverse/compile.sh new file mode 100644 index 00000000..d6a65003 --- /dev/null +++ b/examples/aot/fast_inverse/compile.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +set -euo pipefail + +ARTIFACT_DIR="./build_artifacts" + +mkdir -p "${ARTIFACT_DIR}" +rm -f \ + "${ARTIFACT_DIR}"/inverse_auto_sync_*.pto "${ARTIFACT_DIR}"/inverse_manual_sync_*.pto \ + "${ARTIFACT_DIR}"/inverse_auto_sync_*.cpp "${ARTIFACT_DIR}"/inverse_manual_sync_*.cpp \ + inverse_auto_sync_lib.so inverse_manual_sync_lib.so + +SIZES=(16 32 64 96 128) + +# Auto-sync path: rely on ptoas synchronization insertion. 
+for size in "${SIZES[@]}"; do + python ./inverse_builder.py \ + --matrix-size "${size}" \ + --kernel-name "tri_inv_trick_fp16_${size}" \ + > "${ARTIFACT_DIR}/inverse_auto_sync_${size}.pto" + ptoas --enable-insert-sync "${ARTIFACT_DIR}/inverse_auto_sync_${size}.pto" -o "${ARTIFACT_DIR}/inverse_auto_sync_${size}.cpp" +done + +# Manual-sync path: explicit record/wait events from builder. +for size in "${SIZES[@]}"; do + python ./inverse_builder.py \ + --manual-sync \ + --matrix-size "${size}" \ + --kernel-name "tri_inv_trick_fp16_${size}" \ + > "${ARTIFACT_DIR}/inverse_manual_sync_${size}.pto" + ptoas "${ARTIFACT_DIR}/inverse_manual_sync_${size}.pto" -o "${ARTIFACT_DIR}/inverse_manual_sync_${size}.cpp" +done + +PTO_LIB_PATH=/sources/pto-isa +# PTO_LIB_PATH=$ASCEND_TOOLKIT_HOME + +bisheng \ + -I${PTO_LIB_PATH}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP_16="\"${ARTIFACT_DIR}/inverse_auto_sync_16.cpp\"" \ + -DKERNEL_CPP_32="\"${ARTIFACT_DIR}/inverse_auto_sync_32.cpp\"" \ + -DKERNEL_CPP_64="\"${ARTIFACT_DIR}/inverse_auto_sync_64.cpp\"" \ + -DKERNEL_CPP_96="\"${ARTIFACT_DIR}/inverse_auto_sync_96.cpp\"" \ + -DKERNEL_CPP_128="\"${ARTIFACT_DIR}/inverse_auto_sync_128.cpp\"" \ + ./caller.cpp \ + -o ./inverse_auto_sync_lib.so + +bisheng \ + -I${PTO_LIB_PATH}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + 
-mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP_16="\"${ARTIFACT_DIR}/inverse_manual_sync_16.cpp\"" \ + -DKERNEL_CPP_32="\"${ARTIFACT_DIR}/inverse_manual_sync_32.cpp\"" \ + -DKERNEL_CPP_64="\"${ARTIFACT_DIR}/inverse_manual_sync_64.cpp\"" \ + -DKERNEL_CPP_96="\"${ARTIFACT_DIR}/inverse_manual_sync_96.cpp\"" \ + -DKERNEL_CPP_128="\"${ARTIFACT_DIR}/inverse_manual_sync_128.cpp\"" \ + ./caller.cpp \ + -o ./inverse_manual_sync_lib.so diff --git a/examples/aot/fast_inverse/inverse_builder.py b/examples/aot/fast_inverse/inverse_builder.py new file mode 100644 index 00000000..2dd2c324 --- /dev/null +++ b/examples/aot/fast_inverse/inverse_builder.py @@ -0,0 +1,241 @@ +# pyright: reportUndefinedVariable=false +import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const +SUPPORTED_MATRIX_SIZES = (16, 32, 64, 96, 128) + + +def make_meta_data(matrix_size: int): + def meta_data(): + # Match the hand-written kernel: + # - MAT/LEFT/RIGHT tiles are fp16 + # - ACC and global output are fp32 + # This enables legal TMOV Acc(fp32) -> Mat(fp16) lowering. 
+ in_dtype = pto.float16 + out_dtype = pto.float32 + i32 = pto.int32 + + in_ptr_type = pto.PtrType(in_dtype) + out_ptr_type = pto.PtrType(out_dtype) + + in_tensor_type = pto.TensorType(rank=2, dtype=in_dtype) + out_tensor_type = pto.TensorType(rank=2, dtype=out_dtype) + in_subtensor = pto.SubTensorType( + shape=[matrix_size, matrix_size], dtype=in_dtype + ) + out_subtensor = pto.SubTensorType( + shape=[matrix_size, matrix_size], dtype=out_dtype + ) + l1_tile_type = pto.TileBufType( + shape=[matrix_size, matrix_size], + valid_shape=[matrix_size, matrix_size], + dtype=in_dtype, + memory_space="MAT", + ) + l0a_tile_type = pto.TileBufType( + shape=[matrix_size, matrix_size], + valid_shape=[matrix_size, matrix_size], + dtype=in_dtype, + memory_space="LEFT", + ) + l0b_tile_type = pto.TileBufType( + shape=[matrix_size, matrix_size], + valid_shape=[matrix_size, matrix_size], + dtype=in_dtype, + memory_space="RIGHT", + ) + l0c_tile_type = pto.TileBufType( + shape=[matrix_size, matrix_size], + valid_shape=[matrix_size, matrix_size], + dtype=out_dtype, + memory_space="ACC", + ) + + return { + "in_ptr_type": in_ptr_type, + "out_ptr_type": out_ptr_type, + "i32": i32, + "in_tensor_type": in_tensor_type, + "out_tensor_type": out_tensor_type, + "in_subtensor": in_subtensor, + "out_subtensor": out_subtensor, + "l1_tile_type": l1_tile_type, + "l0a_tile_type": l0a_tile_type, + "l0b_tile_type": l0b_tile_type, + "l0c_tile_type": l0c_tile_type, + } + + return meta_data + + +def build_kernel(manual_sync: bool, matrix_size: int, kernel_name: str): + def tri_inv_trick_fp16( + out_ptr: "out_ptr_type", + in_ptr: "in_ptr_type", + i_neg_ptr: "in_ptr_type", + matrix_size_i32: "i32", + max_block_size_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + c4 = const(4) + c8 = const(8) + matrix_size_c = const(matrix_size) + + max_block_size = s.index_cast(max_block_size_i32) + block_idx = s.index_cast(pto.get_block_idx()) + num_blocks = 
s.index_cast(pto.get_block_num()) + + total_rows = num_blocks * matrix_size_c + row_offset = block_idx * matrix_size_c + + # Keep the runtime signature unchanged while emitting + # compile-time-specialized tile/subtensor types. + _ = matrix_size_i32 + + tv_m = pto.as_tensor( + in_tensor_type, + ptr=in_ptr, + shape=[total_rows, matrix_size_c], + strides=[matrix_size_c, c1], + ) + tv_out = pto.as_tensor( + out_tensor_type, + ptr=out_ptr, + shape=[total_rows, matrix_size_c], + strides=[matrix_size_c, c1], + ) + tv_i_neg = pto.as_tensor( + in_tensor_type, + ptr=i_neg_ptr, + shape=[matrix_size_c, matrix_size_c], + strides=[matrix_size_c, c1], + ) + + sv_m = pto.slice_view( + in_subtensor, + source=tv_m, + offsets=[row_offset, c0], + sizes=[matrix_size_c, matrix_size_c], + ) + sv_i_neg = pto.slice_view( + in_subtensor, + source=tv_i_neg, + offsets=[c0, c0], + sizes=[matrix_size_c, matrix_size_c], + ) + sv_out = pto.slice_view( + out_subtensor, + source=tv_out, + offsets=[row_offset, c0], + sizes=[matrix_size_c, matrix_size_c], + ) + + x_l1 = pto.alloc_tile(l1_tile_type) + y_l1 = pto.alloc_tile(l1_tile_type) + i_l1 = pto.alloc_tile(l1_tile_type) + a_l0 = pto.alloc_tile(l0a_tile_type) + b_l0 = pto.alloc_tile(l0b_tile_type) + c_l0 = pto.alloc_tile(l0c_tile_type) + + def sync(record_op, wait_op): + if manual_sync: + pto.record_wait_pair(record_op, wait_op, event_id=0) + + pto.load(sv_m, y_l1) + pto.load(sv_i_neg, x_l1) + sync("LOAD", "MOV_M2L") + + tile.mov(y_l1, a_l0) + sync("MOV_M2L", "MATMUL") + + tile.mov(y_l1, b_l0) + sync("MOV_M2L", "MATMUL") + + tile.matmul(a_l0, b_l0, c_l0) + sync("MATMUL", "MOV_V2M") + tile.mov(c_l0, y_l1) + sync("MOV_V2M", "MOV_M2L") + + tile.mov(x_l1, b_l0) + sync("MOV_M2L", "MATMUL") + tile.matmul(a_l0, b_l0, c_l0) + sync("MATMUL", "MOV_M2L") + + tile.mov(x_l1, a_l0) + sync("MOV_M2L", "MATMUL") + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) + sync("MATMUL", "MOV_V2M") + tile.mov(c_l0, x_l1) + sync("MOV_V2M", "MATMUL") + + tile.matmul(a_l0, b_l0, c_l0) 
+ sync("MATMUL", "MOV_V2M") + tile.mov(c_l0, i_l1) + sync("MOV_V2M", "MOV_M2L") + + def run_iteration(iter_i): + tile.mov(x_l1, a_l0) + tile.mov(i_l1, b_l0) + sync("MOV_M2L", "MATMUL") + tile.matmul(a_l0, b_l0, c_l0) + sync("MATMUL", "MOV_M2L") + + tile.mov(y_l1, b_l0) + sync("MOV_M2L", "MATMUL") + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) + + with pto.if_context(iter_i < (max_block_size // c2)): + sync("MATMUL", "MOV_V2M") + tile.mov(c_l0, x_l1) + sync("MOV_V2M", "MOV_M2L") + tile.mov(y_l1, a_l0) + sync("MOV_M2L", "MATMUL") + tile.matmul(a_l0, b_l0, c_l0) + sync("MATMUL", "MOV_V2M") + tile.mov(c_l0, y_l1) + sync("MOV_V2M", "MOV_M2L") + + # Mirror C++ `for (i = 1; i < max_block_size; i *= 2)`. + # Using pto.range(1, max_block_size, 1) adds many no-op + # iterations that still perturb generated sync scheduling. + for loop_i in (c1, c2, c4, c8): + with pto.if_context(loop_i < max_block_size): + run_iteration(loop_i) + + sync("MATMUL", "STORE_ACC") + pto.store(c_l0, sv_out) + + tri_inv_trick_fp16.__name__ = kernel_name + return to_ir_module(meta_data=make_meta_data(matrix_size))(tri_inv_trick_fp16) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--manual-sync", + action="store_true", + help="Emit explicit record/wait events instead of relying on --enable-insert-sync.", + ) + parser.add_argument( + "--matrix-size", + type=int, + choices=SUPPORTED_MATRIX_SIZES, + default=128, + help="Compile-time specialized matrix size.", + ) + parser.add_argument( + "--kernel-name", + type=str, + default=None, + help="Kernel symbol name in emitted module.", + ) + args = parser.parse_args() + kernel_name = args.kernel_name or f"tri_inv_trick_fp16_{args.matrix_size}" + module = build_kernel(args.manual_sync, args.matrix_size, kernel_name) + print(module) diff --git a/examples/aot/fast_inverse/run_inverse.py b/examples/aot/fast_inverse/run_inverse.py new file mode 100644 index 00000000..48817fb2 --- /dev/null +++ 
b/examples/aot/fast_inverse/run_inverse.py @@ -0,0 +1,184 @@ +import argparse +import ctypes +import random +import warnings +from typing import Callable + +import numpy as np +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +random.seed(42) +torch.manual_seed(42) +np.random.seed(42) + +MAX_BLOCK_SIZE = 16 + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # out + ctypes.c_void_p, # in + ctypes.c_void_p, # identity_neg + ctypes.c_uint32, # matrix_size + ctypes.c_uint32, # max_block_size + ] + lib.call_kernel.restype = None + return lib + + +def random_matrix(n, block_dim_x, block_dim_y, scale=0.01): + return scale * torch.rand((block_dim_x, block_dim_y, n, n)) + + +def block_ones_matrix(n, block_dim_x, block_dim_y): + block = np.ones((16, 16)) + n_blocks = n // 16 + out = np.zeros((block_dim_x, block_dim_y, n, n)) + for x in range(block_dim_x): + for y in range(block_dim_y): + for i in range(n_blocks): + start = i * 16 + end = start + 16 + out[x, y, start:end, start:end] = block + return torch.from_numpy(np.triu(out, 1)) + + +def block_random_matrix(n, block_dim_x, block_dim_y, scale=0.2): + block = scale * np.random.rand(16, 16) + block = np.triu(block, k=1) + out = np.zeros((block_dim_x, block_dim_y, n, n)) + for x in range(block_dim_x): + for y in range(block_dim_y): + for i in range(0, n, 16): + out[x, y, i : i + 16, i : i + 16] = block.copy() + return torch.from_numpy(out) + + +def run_kernel(lib, inp): + inp_fp16 = inp.to(torch.float16).contiguous() + n = int(inp_fp16.shape[-1]) + block_dim = int(inp_fp16.shape[0] * inp_fp16.shape[1]) + + # Run true matrix sizes directly (e.g., 32x32, 64x64) without padding. 
+ run_n = n + inp_run = inp_fp16 + + out = torch.zeros_like(inp_run, dtype=torch.float32, device=inp_run.device) + identity_neg = torch.zeros( + (run_n, run_n), dtype=torch.float16, device=inp_run.device + ) + identity_neg.fill_diagonal_(-1) + + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(out), + torch_to_ctypes(inp_run), + torch_to_ctypes(identity_neg), + run_n, + MAX_BLOCK_SIZE, + ) + torch.npu.synchronize() + return out + + +def reference_inverse(inp): + n = inp.shape[-1] + identity = np.eye(n, dtype=np.double) + golden = np.zeros(inp.shape, dtype=np.double) + inp_cpu = inp.cpu() + for x in range(inp.shape[0]): + for y in range(inp.shape[1]): + golden[x, y] = np.linalg.inv( + inp_cpu[x, y].numpy().astype(np.double) + identity + ) + return torch.from_numpy(golden) + + +def check_case(lib, matrix_gen: Callable, atol: float, rtol: float, ftol: float): + n_list = [16, 32, 64, 96, 128] + block_dim_x_list = [1, 3, 7, 16] + block_dim_y_list = [1, 2, 4, 16] + failures = [] + passes = 0 + for n in n_list: + for block_dim_x in block_dim_x_list: + for block_dim_y in block_dim_y_list: + inp = matrix_gen(n, block_dim_x, block_dim_y).to(device) + ref = reference_inverse(inp).to(torch.float64) + out = run_kernel(lib, inp).cpu().to(torch.float64) + + frob_error = torch.sqrt( + torch.sum((ref - out) * (ref - out)) / torch.sum(ref * ref) + ) + + nan_count = int(torch.isnan(out).sum().item()) + inf_count = int(torch.isinf(out).sum().item()) + + allclose_ok = np.allclose( + out.numpy(), ref.numpy(), atol=atol, rtol=rtol + ) + frob_ok = bool(frob_error <= ftol) + if allclose_ok and frob_ok: + passes += 1 + print( + f"[pass] n={n}, bx={block_dim_x}, by={block_dim_y}, " + f"frob={float(frob_error):.3e}" + ) + else: + msg = ( + f"[fail] n={n}, bx={block_dim_x}, by={block_dim_y}, " + f"frob={float(frob_error):.3e}, nan={nan_count}, inf={inf_count}" + ) + print(msg) + failures.append(msg) + + total = len(n_list) * 
len(block_dim_x_list) * len(block_dim_y_list) + print(f"summary: pass={passes}, fail={len(failures)}, total={total}") + return failures + + +def run_test(lib): + failures = [] + failures.extend(check_case(lib, block_ones_matrix, atol=0.0, rtol=0.0, ftol=0.0)) + failures.extend( + check_case(lib, block_random_matrix, atol=5e-5, rtol=0.1, ftol=1.2e-4) + ) + if failures: + warnings.warn( + f"{len(failures)} cases failed. First: {failures[0]}", + stacklevel=2, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--manual-sync", + action="store_true", + help="Use manual-sync library instead of the default auto-sync library.", + ) + args = parser.parse_args() + + lib_path = ( + "./inverse_manual_sync_lib.so" + if args.manual_sync + else "./inverse_auto_sync_lib.so" + ) + device = get_test_device() + torch.npu.set_device(device) + + kernel_lib = load_lib(lib_path) + run_test(kernel_lib) + print(f"All tests passed for {lib_path}.") From 87617a92d375e76be757ba9189669eb33ab95c3f Mon Sep 17 00:00:00 2001 From: learning-chip Date: Tue, 17 Mar 2026 09:33:01 +0000 Subject: [PATCH 41/53] ignore artifacts --- examples/aot/matmul_optimization_guide/experimental/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 examples/aot/matmul_optimization_guide/experimental/.gitignore diff --git a/examples/aot/matmul_optimization_guide/experimental/.gitignore b/examples/aot/matmul_optimization_guide/experimental/.gitignore new file mode 100644 index 00000000..03567fc4 --- /dev/null +++ b/examples/aot/matmul_optimization_guide/experimental/.gitignore @@ -0,0 +1 @@ +outputs From a94bd08c7667eaffabedc6399b405d8720c46fb4 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Tue, 17 Mar 2026 11:50:29 +0100 Subject: [PATCH 42/53] Enable auto-sync for fast inverse example (#91) * update PTOAS to https://github.com/zhangstevenunity/PTOAS/releases/tag/v0.9 * auto-sync now works * test more general 
block-diag size * separate out manual vs auto functions * clean up * precommit check * loop count should cover larger block size * precommit check --------- Co-authored-by: jiawei_zhuang --- docker/Dockerfile | 7 +- docker/README.md | 14 +- examples/aot/fast_inverse/README.md | 4 +- examples/aot/fast_inverse/inverse_builder.py | 191 ++++++++++++++++--- examples/aot/fast_inverse/run_inverse.py | 78 ++++++-- 5 files changed, 230 insertions(+), 64 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 01689ca6..73c20dfc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -22,9 +22,10 @@ ARG CACHE_BURST=1 # ARG ARCH=x86_64 ARG ARCH=aarch64 -ARG RELEASE_REPO=huawei-csl/PTOAS -ARG RELEASE_TAG=0.8 -ARG WHEEL_NAME=ptoas-${RELEASE_TAG}-cp311-none-manylinux_2_34_${ARCH}.whl +ARG RELEASE_REPO=zhangstevenunity/PTOAS +ARG RELEASE_VER=0.9 +ARG RELEASE_TAG=v${RELEASE_VER} +ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz WORKDIR /installers/ diff --git a/docker/README.md b/docker/README.md index c992124a..93febcd0 100644 --- a/docker/README.md +++ b/docker/README.md @@ -3,19 +3,19 @@ Recommend using [Ascend Docker Runtime](https://gitcode.com/Ascend/mind-cluster/ Then, build and run docker image: ```bash -RELEASE_TAG=0.8 +RELEASE_VER=0.9 sudo docker build \ - --build-arg RELEASE_TAG=$RELEASE_TAG \ - . -t pto_dsl:$RELEASE_TAG + --build-arg RELEASE_VER=$RELEASE_VER \ + . -t pto_dsl:$RELEASE_VER # for specific arch (x86_64 vs aarch64) sudo docker build \ --build-arg ARCH=x86_64 \ - --build-arg RELEASE_TAG=$RELEASE_TAG \ - . -t pto_dsl:$RELEASE_TAG + --build-arg RELEASE_VER=$RELEASE_VER \ + . 
-t pto_dsl:$RELEASE_VER # to test compile-only -sudo docker run --rm -it pto_dsl:$RELEASE_TAG /bin/bash +sudo docker run --rm -it pto_dsl:$RELEASE_VER /bin/bash # to test on-device execution sudo docker run --rm -it --ipc=host --privileged \ @@ -30,7 +30,7 @@ sudo docker run --rm -it --ipc=host --privileged \ -v /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro \ -v /etc/ascend_install.info:/etc/ascend_install.info:ro \ -v $HOME:/mounted_home -w /mounted_home \ - pto_dsl:$RELEASE_TAG /bin/bash + pto_dsl:$RELEASE_VER /bin/bash ``` ## Appendix: NPU driver diff --git a/examples/aot/fast_inverse/README.md b/examples/aot/fast_inverse/README.md index fff824a9..204f17c4 100644 --- a/examples/aot/fast_inverse/README.md +++ b/examples/aot/fast_inverse/README.md @@ -2,6 +2,6 @@ Usage: ```bash bash ./compile.sh # generate PTO/CPP and build both auto/manual sync libs -python ./run_inverse.py --manual-sync # test manual-sync lib -# python ./run_inverse.py # TODO: fix auto-sync bug in ptoas +python ./run_inverse.py # default to auto-sync kernel +python ./run_inverse.py --manual-sync # test manual-sync kernel ``` diff --git a/examples/aot/fast_inverse/inverse_builder.py b/examples/aot/fast_inverse/inverse_builder.py index 2dd2c324..8ec0df20 100644 --- a/examples/aot/fast_inverse/inverse_builder.py +++ b/examples/aot/fast_inverse/inverse_builder.py @@ -71,8 +71,8 @@ def meta_data(): return meta_data -def build_kernel(manual_sync: bool, matrix_size: int, kernel_name: str): - def tri_inv_trick_fp16( +def build_kernel_autosync(matrix_size: int, kernel_name: str): + def tri_inv_trick_fp16_autosync( out_ptr: "out_ptr_type", in_ptr: "in_ptr_type", i_neg_ptr: "in_ptr_type", @@ -85,6 +85,8 @@ def tri_inv_trick_fp16( c2 = const(2) c4 = const(4) c8 = const(8) + c16 = const(16) + c32 = const(32) matrix_size_c = const(matrix_size) max_block_size = s.index_cast(max_block_size_i32) @@ -143,76 +145,203 @@ def tri_inv_trick_fp16( b_l0 = pto.alloc_tile(l0b_tile_type) c_l0 = 
pto.alloc_tile(l0c_tile_type) - def sync(record_op, wait_op): - if manual_sync: - pto.record_wait_pair(record_op, wait_op, event_id=0) + pto.load(sv_m, y_l1) + pto.load(sv_i_neg, x_l1) + + tile.mov(y_l1, a_l0) + + tile.mov(y_l1, b_l0) + + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, y_l1) + + tile.mov(x_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + + tile.mov(x_l1, a_l0) + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) + tile.mov(c_l0, x_l1) + + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, i_l1) + + def run_iteration(iter_i): + tile.mov(x_l1, a_l0) + tile.mov(i_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + + tile.mov(y_l1, b_l0) + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) + + with pto.if_context(iter_i < (max_block_size // c2)): + tile.mov(c_l0, x_l1) + tile.mov(y_l1, a_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, y_l1) + + # Mirror C++ `for (i = 1; i < max_block_size; i *= 2)`. + # TODO: simplify this code logic + for loop_i in (c1, c2, c4, c8, c16, c32): + # here only considers max_block_size up to 64 + with pto.if_context(loop_i < max_block_size): + run_iteration(loop_i) + + pto.store(c_l0, sv_out) + + tri_inv_trick_fp16_autosync.__name__ = kernel_name + return to_ir_module(meta_data=make_meta_data(matrix_size))( + tri_inv_trick_fp16_autosync + ) + + +def build_kernel_manualsync(matrix_size: int, kernel_name: str): + def tri_inv_trick_fp16_manualsync( + out_ptr: "out_ptr_type", + in_ptr: "in_ptr_type", + i_neg_ptr: "in_ptr_type", + matrix_size_i32: "i32", + max_block_size_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + c2 = const(2) + c4 = const(4) + c8 = const(8) + c16 = const(16) + c32 = const(32) + matrix_size_c = const(matrix_size) + + max_block_size = s.index_cast(max_block_size_i32) + block_idx = s.index_cast(pto.get_block_idx()) + num_blocks = s.index_cast(pto.get_block_num()) + + total_rows = num_blocks * matrix_size_c + row_offset = block_idx * matrix_size_c + + # Keep the runtime signature unchanged while emitting + 
# compile-time-specialized tile/subtensor types. + _ = matrix_size_i32 + + tv_m = pto.as_tensor( + in_tensor_type, + ptr=in_ptr, + shape=[total_rows, matrix_size_c], + strides=[matrix_size_c, c1], + ) + tv_out = pto.as_tensor( + out_tensor_type, + ptr=out_ptr, + shape=[total_rows, matrix_size_c], + strides=[matrix_size_c, c1], + ) + tv_i_neg = pto.as_tensor( + in_tensor_type, + ptr=i_neg_ptr, + shape=[matrix_size_c, matrix_size_c], + strides=[matrix_size_c, c1], + ) + + sv_m = pto.slice_view( + in_subtensor, + source=tv_m, + offsets=[row_offset, c0], + sizes=[matrix_size_c, matrix_size_c], + ) + sv_i_neg = pto.slice_view( + in_subtensor, + source=tv_i_neg, + offsets=[c0, c0], + sizes=[matrix_size_c, matrix_size_c], + ) + sv_out = pto.slice_view( + out_subtensor, + source=tv_out, + offsets=[row_offset, c0], + sizes=[matrix_size_c, matrix_size_c], + ) + + x_l1 = pto.alloc_tile(l1_tile_type) + y_l1 = pto.alloc_tile(l1_tile_type) + i_l1 = pto.alloc_tile(l1_tile_type) + a_l0 = pto.alloc_tile(l0a_tile_type) + b_l0 = pto.alloc_tile(l0b_tile_type) + c_l0 = pto.alloc_tile(l0c_tile_type) pto.load(sv_m, y_l1) pto.load(sv_i_neg, x_l1) - sync("LOAD", "MOV_M2L") + pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) tile.mov(y_l1, a_l0) - sync("MOV_M2L", "MATMUL") + pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) tile.mov(y_l1, b_l0) - sync("MOV_M2L", "MATMUL") + pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) tile.matmul(a_l0, b_l0, c_l0) - sync("MATMUL", "MOV_V2M") + pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) tile.mov(c_l0, y_l1) - sync("MOV_V2M", "MOV_M2L") + pto.record_wait_pair("MOV_V2M", "MOV_M2L", event_id=0) tile.mov(x_l1, b_l0) - sync("MOV_M2L", "MATMUL") + pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) tile.matmul(a_l0, b_l0, c_l0) - sync("MATMUL", "MOV_M2L") + pto.record_wait_pair("MATMUL", "MOV_M2L", event_id=0) tile.mov(x_l1, a_l0) - sync("MOV_M2L", "MATMUL") + pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) tile.matmul_acc(c_l0, 
a_l0, b_l0, c_l0) - sync("MATMUL", "MOV_V2M") + pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) tile.mov(c_l0, x_l1) - sync("MOV_V2M", "MATMUL") + pto.record_wait_pair("MOV_V2M", "MATMUL", event_id=0) tile.matmul(a_l0, b_l0, c_l0) - sync("MATMUL", "MOV_V2M") + pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) tile.mov(c_l0, i_l1) - sync("MOV_V2M", "MOV_M2L") + pto.record_wait_pair("MOV_V2M", "MOV_M2L", event_id=0) def run_iteration(iter_i): tile.mov(x_l1, a_l0) tile.mov(i_l1, b_l0) - sync("MOV_M2L", "MATMUL") + pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) tile.matmul(a_l0, b_l0, c_l0) - sync("MATMUL", "MOV_M2L") + pto.record_wait_pair("MATMUL", "MOV_M2L", event_id=0) tile.mov(y_l1, b_l0) - sync("MOV_M2L", "MATMUL") + pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) with pto.if_context(iter_i < (max_block_size // c2)): - sync("MATMUL", "MOV_V2M") + pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) tile.mov(c_l0, x_l1) - sync("MOV_V2M", "MOV_M2L") + pto.record_wait_pair("MOV_V2M", "MOV_M2L", event_id=0) tile.mov(y_l1, a_l0) - sync("MOV_M2L", "MATMUL") + pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) tile.matmul(a_l0, b_l0, c_l0) - sync("MATMUL", "MOV_V2M") + pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) tile.mov(c_l0, y_l1) - sync("MOV_V2M", "MOV_M2L") + pto.record_wait_pair("MOV_V2M", "MOV_M2L", event_id=0) # Mirror C++ `for (i = 1; i < max_block_size; i *= 2)`. - # Using pto.range(1, max_block_size, 1) adds many no-op - # iterations that still perturb generated sync scheduling. 
- for loop_i in (c1, c2, c4, c8): + # TODO: simplify this code logic + for loop_i in (c1, c2, c4, c8, c16, c32): + # here only considers max_block_size up to 64 with pto.if_context(loop_i < max_block_size): run_iteration(loop_i) - sync("MATMUL", "STORE_ACC") + pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) pto.store(c_l0, sv_out) - tri_inv_trick_fp16.__name__ = kernel_name - return to_ir_module(meta_data=make_meta_data(matrix_size))(tri_inv_trick_fp16) + tri_inv_trick_fp16_manualsync.__name__ = kernel_name + return to_ir_module(meta_data=make_meta_data(matrix_size))( + tri_inv_trick_fp16_manualsync + ) + + +def build_kernel(manual_sync: bool, matrix_size: int, kernel_name: str): + if manual_sync: + return build_kernel_manualsync(matrix_size, kernel_name) + return build_kernel_autosync(matrix_size, kernel_name) if __name__ == "__main__": diff --git a/examples/aot/fast_inverse/run_inverse.py b/examples/aot/fast_inverse/run_inverse.py index 48817fb2..7aef879d 100644 --- a/examples/aot/fast_inverse/run_inverse.py +++ b/examples/aot/fast_inverse/run_inverse.py @@ -14,8 +14,6 @@ torch.manual_seed(42) np.random.seed(42) -MAX_BLOCK_SIZE = 16 - def torch_to_ctypes(tensor): return ctypes.c_void_p(tensor.data_ptr()) @@ -30,41 +28,50 @@ def load_lib(lib_path): ctypes.c_void_p, # in ctypes.c_void_p, # identity_neg ctypes.c_uint32, # matrix_size - ctypes.c_uint32, # max_block_size + ctypes.c_uint32, # block_size for block-diag-matrices ] lib.call_kernel.restype = None return lib def random_matrix(n, block_dim_x, block_dim_y, scale=0.01): + # TODO: this data generator is not used yet return scale * torch.rand((block_dim_x, block_dim_y, n, n)) -def block_ones_matrix(n, block_dim_x, block_dim_y): - block = np.ones((16, 16)) - n_blocks = n // 16 +def blockdiag_ones_matrix(n, block_dim_x, block_dim_y, block_size=16): + block = np.ones((block_size, block_size)) + n_blocks = n // block_size out = np.zeros((block_dim_x, block_dim_y, n, n)) for x in range(block_dim_x): for y in 
range(block_dim_y): for i in range(n_blocks): - start = i * 16 - end = start + 16 + start = i * block_size + end = start + block_size out[x, y, start:end, start:end] = block return torch.from_numpy(np.triu(out, 1)) -def block_random_matrix(n, block_dim_x, block_dim_y, scale=0.2): - block = scale * np.random.rand(16, 16) +def blockdiag_random_matrix(n, block_dim_x, block_dim_y, block_size=16): + if block_size == 16: + scale = 0.2 + elif block_size == 32: + scale = 0.05 + elif block_size == 64: + scale = 0.01 + else: + raise ValueError("block_size must be 16/32/64") + block = scale * np.random.rand(block_size, block_size) block = np.triu(block, k=1) out = np.zeros((block_dim_x, block_dim_y, n, n)) for x in range(block_dim_x): for y in range(block_dim_y): - for i in range(0, n, 16): - out[x, y, i : i + 16, i : i + 16] = block.copy() + for i in range(0, n, block_size): + out[x, y, i : i + block_size, i : i + block_size] = block.copy() return torch.from_numpy(out) -def run_kernel(lib, inp): +def run_kernel(lib, inp, blockdiag_size=16): inp_fp16 = inp.to(torch.float16).contiguous() n = int(inp_fp16.shape[-1]) block_dim = int(inp_fp16.shape[0] * inp_fp16.shape[1]) @@ -87,7 +94,7 @@ def run_kernel(lib, inp): torch_to_ctypes(inp_run), torch_to_ctypes(identity_neg), run_n, - MAX_BLOCK_SIZE, + blockdiag_size, ) torch.npu.synchronize() return out @@ -106,8 +113,17 @@ def reference_inverse(inp): return torch.from_numpy(golden) -def check_case(lib, matrix_gen: Callable, atol: float, rtol: float, ftol: float): - n_list = [16, 32, 64, 96, 128] +def check_case( + lib, matrix_gen: Callable, atol: float, rtol: float, ftol: float, blockdiag_size=16 +): + if blockdiag_size == 16: + n_list = [16, 32, 64, 96, 128] + elif blockdiag_size == 32: + n_list = [32, 64, 128] + elif blockdiag_size == 64: + n_list = [64, 128] + else: + raise ValueError("blockdiag_size must be 16/32/64") block_dim_x_list = [1, 3, 7, 16] block_dim_y_list = [1, 2, 4, 16] failures = [] @@ -115,7 +131,9 @@ def 
check_case(lib, matrix_gen: Callable, atol: float, rtol: float, ftol: float) for n in n_list: for block_dim_x in block_dim_x_list: for block_dim_y in block_dim_y_list: - inp = matrix_gen(n, block_dim_x, block_dim_y).to(device) + inp = matrix_gen( + n, block_dim_x, block_dim_y, block_size=blockdiag_size + ).to(device) ref = reference_inverse(inp).to(torch.float64) out = run_kernel(lib, inp).cpu().to(torch.float64) @@ -151,10 +169,28 @@ def check_case(lib, matrix_gen: Callable, atol: float, rtol: float, ftol: float) def run_test(lib): failures = [] - failures.extend(check_case(lib, block_ones_matrix, atol=0.0, rtol=0.0, ftol=0.0)) - failures.extend( - check_case(lib, block_random_matrix, atol=5e-5, rtol=0.1, ftol=1.2e-4) - ) + for blockdiag_size in (16,): + failures.extend( + check_case( + lib, + blockdiag_ones_matrix, + atol=0.0, + rtol=0.0, + ftol=0.0, + blockdiag_size=blockdiag_size, + ) + ) + for blockdiag_size in (16, 32, 64): + failures.extend( + check_case( + lib, + blockdiag_random_matrix, + atol=5e-5, + rtol=0.1, + ftol=1.2e-4, + blockdiag_size=blockdiag_size, + ) + ) if failures: warnings.warn( f"{len(failures)} cases failed. 
First: {failures[0]}", From 2af0bc7ea0ef413f417849974cbfb625f0230edf Mon Sep 17 00:00:00 2001 From: jiawei_zhuang Date: Tue, 17 Mar 2026 17:14:32 +0100 Subject: [PATCH 43/53] use on-the-fly clone for external references --- .agent/skills/translate_cpp2py/SKILL.md | 12 +- .../references/external_repo/README.md | 21 + .../references/ptoas_source/PTOOps.td | 3648 -------- .../references/ptoas_source/PTOToEmitC.cpp | 7713 ----------------- .../references/ptoas_source/README.md | 4 - .../references/ptoas_source/pto.py | 280 - .../references/ptoisa_source/README.md | 6 - .../references/ptoisa_source/pto-inst.hpp | 830 -- 8 files changed, 27 insertions(+), 12487 deletions(-) create mode 100644 .agent/skills/translate_cpp2py/references/external_repo/README.md delete mode 100644 .agent/skills/translate_cpp2py/references/ptoas_source/PTOOps.td delete mode 100644 .agent/skills/translate_cpp2py/references/ptoas_source/PTOToEmitC.cpp delete mode 100644 .agent/skills/translate_cpp2py/references/ptoas_source/README.md delete mode 100644 .agent/skills/translate_cpp2py/references/ptoas_source/pto.py delete mode 100644 .agent/skills/translate_cpp2py/references/ptoisa_source/README.md delete mode 100644 .agent/skills/translate_cpp2py/references/ptoisa_source/pto-inst.hpp diff --git a/.agent/skills/translate_cpp2py/SKILL.md b/.agent/skills/translate_cpp2py/SKILL.md index 533bfabf..61f7b8aa 100644 --- a/.agent/skills/translate_cpp2py/SKILL.md +++ b/.agent/skills/translate_cpp2py/SKILL.md @@ -145,11 +145,11 @@ If required C++ op has no convenient Python wrapper: ## Escalation Path (Only When Mapping Is Missing) -Check in order: -1. MLIR Python op bindings: `references/ptoas_source/pto.py` -2. Dialect op definitions/contracts: `references/ptoas_source/PTOOps.td` -3. C++ codegen lowering: `references/ptoas_source/PTOToEmitC.cpp` -4. ISA semantics: `references/ptoisa_source/pto-inst.hpp` +Check in order in the `references/external_repo` +1. Clone the `PTOAS` and `pto-isa` repos +2. 
Check Dialect op definitions: `PTOOps.td` in `PTOAS` repo +3. C++ codegen lowering: `PTOToEmitC.cpp` in `PTOAS` repo +4. ISA semantics: `pto-inst.hpp` in `pto-isa` repo If op exists in dialect but not lowered in `PTOToEmitC.cpp`, translation requires PTOAS compiler work (not only DSL wrapper work). In this case, suggest an issue report to PTOAS project (https://github.com/zhangstevenunity/PTOAS) @@ -173,4 +173,4 @@ Use these first: - `examples/aot/elementwise/add_dynamic_multicore/*` (caller/test/build pattern) - `examples/aot/matmul_optimization_guide/matmul_optim_guide.md` (sync and runtime-control semantics) -Consult `references/ptoas_source/**` and ISA headers only for patterns not covered by examples. +Consult `references/external_repo/**` only for patterns not covered by examples. diff --git a/.agent/skills/translate_cpp2py/references/external_repo/README.md b/.agent/skills/translate_cpp2py/references/external_repo/README.md new file mode 100644 index 00000000..c36bc440 --- /dev/null +++ b/.agent/skills/translate_cpp2py/references/external_repo/README.md @@ -0,0 +1,21 @@ +This directory holds the 3rd-party repos that are used internally by PTO-DSL: +- https://github.com/zhangstevenunity/PTOAS: implements "ptoas" command line tool, the PTO MLIR dialect and its Python bindings, and the InjectSync pass to insert set_flag/wait_flag for "auto-sync" mode. Important files are: + - `PTOAS/include/PTO/IR/PTOOps.td` defines the MLIR PTO dialect + - `PTOAS/python/pto/dialects/pto.py` has low-level Python wrappers of PTO MLIR python binding (more Pythonic wrappers are in pto-dsl package) + - `PTOAS/lib/PTO/Transforms/PTOToEmitC.cpp` the compile pass that converts `*.pto` IR to C++ source code based on PTO-ISA headers. +- https://gitcode.com/cann/pto-isa: header-only library that defined the C++ APIs of PTO-ISA. It is the target API set for the `PTOToEmitC` pass in PTOAS. 
Important files are: + - `pto-isa/include/pto/common/pto_instr.hpp` the top-level interface + - `pto-isa/include/pto/common/*` common type definitions + - `pto-isa/include/pto/npu/a2a3/*` implementation for current hardware (used in current pto-dsl examples) + - `pto-isa/include/pto/npu/a5/*` implementation for next-generation hardware (not used in current pto-dsl examples) + +Current directory is empty by default, and the repos should be cloned on-the-fly when the agent needs to access extra context. + +For difficult task that needs to look into PTOAS and pto-isa repos, the agent or user can clone them by: + +```bash +git clone https://github.com/zhangstevenunity/PTOAS.git +git clone https://gitcode.com/cann/pto-isa.git +``` + +Remind the user to check if the commit id of PTOAS and pto-isa matches the test environment (usually a pre-built docker image), to avoid mismatch between the context and the real execution. diff --git a/.agent/skills/translate_cpp2py/references/ptoas_source/PTOOps.td b/.agent/skills/translate_cpp2py/references/ptoas_source/PTOOps.td deleted file mode 100644 index b46efe33..00000000 --- a/.agent/skills/translate_cpp2py/references/ptoas_source/PTOOps.td +++ /dev/null @@ -1,3648 +0,0 @@ -//===- PTOOps.td - Pattern descriptor operations -----------*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file declares the PTO dialect operations. 
-// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_DIALECT_PTO_IR_PTOOPS -#define MLIR_DIALECT_PTO_IR_PTOOPS - -include "PTO/IR/PTODialect.td" -include "PTO/IR/PTOAttrs.td" -include "PTO/IR/PTOTypeDefs.td" -include "PTO/IR/PTOInterfaces.td" - -include "mlir/IR/OpBase.td" -include "mlir/IR/OpAsmInterface.td" -include "mlir/IR/SymbolInterfaces.td" - -include "mlir/Interfaces/DestinationStyleOpInterface.td" -include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/Interfaces/ViewLikeInterface.td" - -//===----------------------------------------------------------------------===// -// Types -//===----------------------------------------------------------------------===// - -def TensorOrMemref : - AnyTypeOf<[AnyMemRef, AnyRankedTensor], "Tensor or Memref">; - -def PTODpsType : - AnyTypeOf<[AnyRankedTensor, AnyMemRef, PartitionTensorViewType, TileBufType]>; - -def PtrOrMemRef : - AnyTypeOf<[PtrType, AnyMemRef], "Ptr or MemRef">; - -def ScalarPtrOrMemRef : - TypeConstraint< - CPred<"::mlir::pto::isScalarPtrOrMemRef($_self)">, - "Ptr or MemRef in GM">; - -def PrintScalarType : - AnyTypeOf<[Index, AnySignlessInteger, AnyFloat], "numeric (index/integer/float)">; - -//===----------------------------------------------------------------------===// -// Op Class -//===----------------------------------------------------------------------===// -class PTO_TOp traits = []> - : Op; - -class PTO_DpsOp traits = []> - : Op { - let extraClassDeclaration = [{ - ::mlir::MutableOperandRange getDpsInitsMutable() { - return getDstMutable(); - } - }]; -} - -class PTO_Op traits = []> - : Op; - -//===----------------------------------------------------------------------===// -// Pointer/View Ops (for your front-end IR) -//===----------------------------------------------------------------------===// - -def AddPtrOp : PTO_Op<"addptr", [ - Pure, - AllTypesMatch<["ptr", "result"]>, - 
DeclareOpInterfaceMethods - ]> { - let summary = "Add an element offset to a !pto.ptr"; - let description = [{ - Computes a new pointer by adding an element offset to the base pointer. - The offset is in elements (not bytes). - }]; - - let arguments = (ins - PtrType:$ptr, - Index:$offset - ); - - let results = (outs PtrType:$result); - - let hasVerifier = 1; - - let assemblyFormat = [{ - $ptr `,` $offset attr-dict `:` type($ptr) `->` type($result) - }]; -} - -//===----------------------------------------------------------------------===// -// Scalar pointer load/store -//===----------------------------------------------------------------------===// - -def LoadScalarOp : PTO_Op<"load_scalar", [ - DeclareOpInterfaceMethods - ]> { - let summary = "Load a single scalar element from a pointer at offset."; - - let arguments = (ins - ScalarPtrOrMemRef:$ptr, - Index:$offset - ); - - let results = (outs AnyType:$value); - - let hasVerifier = 1; - - let assemblyFormat = [{ - $ptr `[` $offset `]` attr-dict `:` type($ptr) `->` type($value) - }]; -} - -def StoreScalarOp : PTO_Op<"store_scalar", [ - DeclareOpInterfaceMethods - ]> { - let summary = "Store a single scalar element to a pointer at offset."; - - let arguments = (ins - ScalarPtrOrMemRef:$ptr, - Index:$offset, - AnyType:$value - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - $value `,` $ptr `[` $offset `]` attr-dict `:` type($ptr) `,` type($value) - }]; -} - -def MakeTensorViewOp : PTO_Op<"make_tensor_view", [AttrSizedOperandSegments]> { - let summary = "Wrap a pointer as a tensor_view descriptor (no allocation, no copy)."; - - let arguments = (ins - AnyType:$ptr, - Variadic:$shape, - Variadic:$strides, - OptionalAttr:$layout - ); - - let results = (outs AnyType:$result); - - let hasCustomAssemblyFormat = 1; - let hasVerifier = 1; -} - -// ============================================================================= -// PartitionViewOp -// 
============================================================================= -def PartitionViewOp : PTO_Op<"partition_view", [AttrSizedOperandSegments]> { - let summary = "Partition a tensor view into a smaller logical view (logical slicing)."; - let description = [{ - Captures a specific calculation region from a large view. - It carries offsets (where to read) and sizes (how much to read). - - IR Example: - %1 = pto.partition_view %0, offsets=[...], sizes=[...] - : !pto.tensor_view -> !pto.partition_tensor_view - }]; - - let arguments = (ins - TensorViewType:$source, // 输入: 物理大底座 (MakeTensorViewOp 的结果) - Variadic:$offsets, // 动态 offsets - Variadic:$sizes // 动态 sizes - ); - - let results = (outs PartitionTensorViewType:$result); // 输出: 逻辑切片 - - let assemblyFormat = [{ - $source `,` `offsets` `=` `[` $offsets `]` `,` `sizes` `=` `[` $sizes `]` - attr-dict `:` qualified(type($source)) `->` qualified(type($result)) - }]; -} - -// Helper: tensor_view or memref (after lowering tensor_view to memref). -def TensorViewOrMemRef : - AnyTypeOf<[TensorViewType, AnyMemRef], "TensorView or MemRef">; - -// Get the size of a dimension of a tensor_view or its lowered memref view. -// Result type: Index (use arith.index_cast if i32 is needed). -def GetTensorViewDimOp : PTO_Op<"get_tensor_view_dim", [Pure]> { - let summary = "Get the size of a dimension of a tensor_view."; - let description = [{ - Returns the size of the given dimension of a logical tensor view. - This op accepts either !pto.tensor_view or the memref it is lowered to. 
- IR (tensor_view form): - %dim_size = pto.get_tensor_view_dim %tv, %dim_index - : !pto.tensor_view, index -> index - IR (memref form, after lowering): - %dim_size = pto.get_tensor_view_dim %mr, %dim_index - : memref<...>, index -> index - }]; - let arguments = (ins - TensorViewOrMemRef:$tensor_view, - Index:$dim_index - ); - let results = (outs Index:$result); - let assemblyFormat = [{ - $tensor_view `,` $dim_index `:` qualified(type($tensor_view)) `->` qualified(type($result)) - attr-dict - }]; -} - -def AllocTileOp : PTO_Op<"alloc_tile", [AttrSizedOperandSegments]> { - let summary = "Allocates a tile buffer (logical buffer)."; - - let arguments = (ins - Optional:$addr, - Optional:$valid_row, - Optional:$valid_col - ); - - let results = (outs TileBufType:$result); - - let assemblyFormat = [{ - (`addr` `=` $addr^)? - (`valid_row` `=` $valid_row^)? - (`valid_col` `=` $valid_col^)? - attr-dict `:` qualified(type($result)) - }]; - - let extraClassDeclaration = [{ - ::mlir::LogicalResult verify(); - }]; -} - - -// ============================================================================ -// BindTileOp: 将 Config 和 Valid Dims 绑定到 MemRef 上 -// ============================================================================ -def BindTileOp : PTO_Op<"bind_tile", [ - Pure, - AttrSizedOperandSegments - // 允许输入 offset:0 -> 输出 offset:? - ]> { - let summary = "Binds metadata and implicitly casts layout"; - let description = [{ - Wraps a memref with PTO metadata (valid dimensions and config). - }]; - - // [修改] 以前是 Variadic:$valid_dims - // 现在改为明确的 Optional Row/Col,与 PointerCastOp 保持一致 - let arguments = (ins - AnyMemRef:$source, - Optional:$valid_row, - Optional:$valid_col, - TileBufConfigAttr:$config - ); - - let results = (outs AnyMemRef:$result); - - // [修改] assemblyFormat: 去掉 valid_dims,改为 ($row, $col)? - let assemblyFormat = [{ - $source (`,` $valid_row^ `,` $valid_col)? 
attr-dict `:` qualified(type($source)) `->` qualified(type($result)) - }]; -} - -def SubsetOp : PTO_Op<"subset", [ - Pure, - ViewLikeOpInterface, - DeclareOpInterfaceMethods // 启用 C++ 推导 - ]> { - - let summary = "Create a strided view (subset) from a parent tile."; - let description = [{ - Creates a view into the source tile. - - Result Shape: Defined by static `sizes`. - - Result Strides: Inherited from `source`. - - Result Offset: Represented as multi-dimensional symbols (s0, s1...) in the layout map. - }]; - - let arguments = (ins - TileBufType:$source, - Variadic:$offsets, // 运行时动态偏移 [i, j] - I64ArrayAttr:$sizes // 静态形状 [32, 32] - ); - - let results = (outs TileBufType:$result); - let hasVerifier = 1; - - // 语法示例: %sub = pto.subset %src[%i, %j] sizes [32, 32] : !type - // 注意:没有 -> qualified(type($result)) - let assemblyFormat = [{ - $source `[` $offsets `]` `sizes` $sizes attr-dict `:` qualified(type($source)) - }]; - - // [新增] 显式实现 ViewLikeOpInterface 缺失的方法 - let extraClassDeclaration = [{ - // 接口要求 getViewSource,我们转发给自动生成的 getSource - ::mlir::Value getViewSource() { return getSource(); } - - // ViewLikeOpInterface 可能还需要 getOffsets (如果 Variadic 不自动匹配) - // 但通常 Variadic:$offsets 会生成 getOffsets(),这应该没问题。 - // 如果后续报 getOffsets 错,也可以在这里加。 - }]; -} - -// ============================================================================ -// SSA TileBuf Config Ops (aliasing views) -// ============================================================================ - -def BitcastOp : PTO_Op<"bitcast", [ - Pure, - ViewLikeOpInterface - ]> { - let summary = "SSA dtype reinterpretation of a tile buffer view (aliases src storage)"; - let description = [{ - Returns a tile buffer view with a different element type (dtype) while - reusing the same underlying storage as the source. - - This op is a metadata/config rewrite only (no data movement). 
- }]; - - let arguments = (ins - TileBufType:$src - ); - - let results = (outs - TileBufType:$result - ); - - let hasVerifier = 1; - - let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))"; - - let extraClassDeclaration = [{ - ::mlir::Value getViewSource() { return getSrc(); } - }]; -} - -//===----------------------------------------------------------------------===// -// DMA Ops -//===----------------------------------------------------------------------===// -// ------------------------- -// DPS versions in tile world -// ------------------------- -def TLoadOp : PTO_TOp<"tload", [ - PTO_DpsInitOpInterface, - AttrSizedOperandSegments, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "PTO data load operation (Dimension Collapse: PartitionView -> TileBuf)."; - let description = [{ - Loads data from a high-dimensional logical partition view into a 2D physical tile buffer. - Constraint: The product of partition view sizes must match the product of tile buffer valid dims. - DPS form: - pto.tload ins(%partition_view) outs(%tile_buf) - }]; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst, - OptionalAttr:$pad_mode, - Optional:$pad_value, - Optional:$left_padding_num, - Optional:$right_padding_num, - DefaultValuedOptionalAttr:$init_out_buffer, - Optional:$init_condition - ); - - let results = (outs - Optional:$result - ); - - let hasVerifier = 1; - - let builders = [ - // 1. Basic: (src, dst) - OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst), [{ - build($_builder, $_state, res, src, dst, - /*pad_mode=*/nullptr, /*pad_value=*/nullptr, - /*left=*/nullptr, /*right=*/nullptr, - /*init_out=*/nullptr, /*init_cond=*/nullptr); - }]>, - - // 2. With left_padding_num - OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, - "Value":$left_padding_num), [{ - build($_builder, $_state, res, src, dst, - nullptr, nullptr, left_padding_num, nullptr, nullptr, nullptr); - }]>, - - // 3. 
With pad_mode, pad_value - OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, - "pto::PadModeAttr":$pad_mode, "Value":$pad_value), [{ - build($_builder, $_state, res, src, dst, - pad_mode, pad_value, nullptr, nullptr, nullptr, nullptr); - }]>, - - // 4. ... + left - OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, - "pto::PadModeAttr":$pad_mode, "Value":$pad_value, - "Value":$left_padding_num), [{ - build($_builder, $_state, res, src, dst, - pad_mode, pad_value, left_padding_num, nullptr, nullptr, nullptr); - }]>, - - // 5. ... + left + right - OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, - "pto::PadModeAttr":$pad_mode, "Value":$pad_value, - "Value":$left_padding_num, "Value":$right_padding_num), [{ - build($_builder, $_state, res, src, dst, - pad_mode, pad_value, left_padding_num, right_padding_num, nullptr, nullptr); - }]>, - - // 6. ... + left + right + bool - OpBuilder<(ins "TypeRange":$res, "Value":$src, "Value":$dst, - "pto::PadModeAttr":$pad_mode, "Value":$pad_value, - "Value":$left_padding_num, "Value":$right_padding_num, - "bool":$init_out_buffer), [{ - build($_builder, $_state, res, src, dst, - pad_mode, pad_value, left_padding_num, right_padding_num, - init_out_buffer, nullptr); - }]> - ]; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - (`pad_mode` `=` $pad_mode^)? - (`pad_value` `=` $pad_value^ `:` type($pad_value))? - (`left_padding_num` `=` $left_padding_num^ `:` type($left_padding_num))? - (`init_out_buffer` `=` $init_out_buffer^ )? - (`right_padding_num` `=` $right_padding_num^ `:` type($right_padding_num))? - (`init_condition` `=` $init_condition^ `:` type($init_condition))? - (`->` qualified(type($result))^)? 
- }]; - - let extraClassDeclaration = [{ - static StringRef getOpName() { return "pto_load"; } - ShapedType getSrcOperandType() { - return cast(getSrc().getType()); - } - ShapedType getDstOperandType() { - return cast(getDst().getType()); - } - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_MTE2; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TStoreOp: PTO_TOp<"tstore", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "PTO data store operation (TileBuf -> PartitionView)."; - let description = [{ - Stores data from a 2D physical tile buffer back to a high-dimensional logical partition view. - - DPS form: - pto.tstore ins(%tile_buf) outs(%partition_view) - }]; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs - Optional:$result - ); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - (`->` qualified(type($result))^)? - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { - // pto-isa lowers TSTORE differently depending on the source tile domain: - // - ACC (L0C) -> GM uses the FIX pipe (copy_matrix_cc_to_gm). - // - VEC/UB and MAT -> GM use the MTE3 pipe. - // - // Sync insertion must use the correct pipe, otherwise the generated - // set_flag/wait_flag pairs won't order the actual hardware instructions. 
- auto isAcc = [](Type ty) -> bool { - if (auto tb = ::mlir::dyn_cast<::mlir::pto::TileBufType>(ty)) { - if (auto as = ::mlir::dyn_cast_or_null<::mlir::pto::AddressSpaceAttr>( - tb.getMemorySpace())) - return as.getAddressSpace() == ::mlir::pto::AddressSpace::ACC; - return false; - } - if (auto mr = ::mlir::dyn_cast<::mlir::MemRefType>(ty)) { - if (auto ms = mr.getMemorySpace()) { - if (auto as = ::mlir::dyn_cast<::mlir::pto::AddressSpaceAttr>(ms)) - return as.getAddressSpace() == ::mlir::pto::AddressSpace::ACC; - } - return false; - } - return false; - }; - if (isAcc(getSrc().getType())) - return ::mlir::pto::PIPE::PIPE_FIX; - return ::mlir::pto::PIPE::PIPE_MTE3; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TTransOp : PTO_TOp<"ttrans", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "PTO matrix trans operation (destination-style, DPS)."; - let description = [{ - DPS form: - tile buf: pto.ttrans ins(%src %tmp) outs(%dst) - }]; - let arguments = (ins - PTODpsType:$src, - PTODpsType:$tmp, - PTODpsType:$dst - ); - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src `,` $tmp `:` qualified(type($src)) `,` qualified(type($tmp)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_V; // Vector Pipe - } - ::mlir::MutableOperandRange getDpsInitsMutable() { - return getDstMutable(); - } - }]; -} -//===----------------------------------------------------------------------===// -// TMATMUL_BIAS / TMATMUL_MX family -// - DPS: AnyMemRef -// - TOp: PTODpsType -// - All in ins/outs form -//===----------------------------------------------------------------------===// - -// ------------------------- -// DPS versions (AnyMemRef) -// ------------------------- - -// ------------------------- -// Tile-world TOp versions 
(PTODpsType) -// ------------------------- - -def TMatmulBiasOp : PTO_TOp<"tmatmul.bias", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Matmul with bias add (tile world, ins/outs)."; - - let arguments = (ins - PTODpsType:$a, - PTODpsType:$b, - PTODpsType:$bias, - PTODpsType:$dst - ); - - let results = (outs Optional:$result); - let hasVerifier = 1; - - //let builders = [ - // OpBuilder<(ins "TypeRange":$resultTypes, "Value":$a, "Value":$b, "Value":$bias, "Value":$dst), [{ - // build($_builder, $_state, resultTypes, a, b, bias, dst, ValueRange{}); - // }]> - //]; - - let assemblyFormat = [{ - `ins` `(` $a `,` $b `,` $bias `:` qualified(type($a)) `,` qualified(type($b)) `,` qualified(type($bias)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - ( `->` qualified(type($result))^ )? - }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "TMATMUL_BIAS"; } - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_M; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -// pto.tmatmul.mx ins(%a,%a_scale,%b,%b_scale) outs(%dst) -> tensor? -def TMatmulMxOp : PTO_TOp<"tmatmul.mx", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods - ]> { - let summary = "Matmul mx (tile world, ins/outs)."; - - let arguments = (ins - PTODpsType:$a, - PTODpsType:$a_scale, - PTODpsType:$b, - PTODpsType:$b_scale, - PTODpsType:$dst); - - let results = (outs Optional:$result); - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $a `,` $a_scale `,` $b `,` $b_scale - `:` type($a) `,` type($a_scale) `,` type($b) `,` type($b_scale) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - (`->` qualified(type($result))^)? 
- }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "TMATMUL_MX"; } - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_M; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -// pto.tmatmul.mx.acc ins(%c_in,%a,%a_scale,%b,%b_scale) outs(%dst) -> tensor? -def TMatmulMxAccOp : PTO_TOp<"tmatmul.mx.acc", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Matmul mx accumulate (tile world, ins/outs)."; - - let arguments = (ins - PTODpsType:$c_in, - PTODpsType:$a, - PTODpsType:$a_scale, - PTODpsType:$b, - PTODpsType:$b_scale, - PTODpsType:$dst); - - let results = (outs Optional:$result); - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $c_in `,` $a `,` $a_scale `,` $b `,` $b_scale - `:` type($c_in) `,` type($a) `,` type($a_scale) `,` type($b) `,` type($b_scale) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - (`->` qualified(type($result))^)? - }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "TMATMUL_MX_ACC"; } - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_M; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -// pto.tmatmul.mx.bias ins(%a,%a_scale,%b,%b_scale,%bias) outs(%dst) -> tensor? 
-def TMatmulMxBiasOp : PTO_TOp<"tmatmul.mx.bias",[ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Matmul mx with bias (tile world, ins/outs)."; - - let arguments = (ins - PTODpsType:$a, - PTODpsType:$a_scale, - PTODpsType:$b, - PTODpsType:$b_scale, - PTODpsType:$bias, - PTODpsType:$dst); - - let results = (outs Optional:$result); - let hasVerifier = 1; - - - let assemblyFormat = [{ - `ins` `(` $a `,` $a_scale `,` $b `,` $b_scale `,` $bias - `:` type($a) `,` type($a_scale) `,` type($b) `,` type($b_scale) `,` qualified(type($bias)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - (`->` qualified(type($result))^)? - }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "TMATMUL_MX_BIAS"; } - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_M; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMatmulOp : PTO_TOp<"tmatmul", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "PTO matrix multiplication operation (optional bias), destination-style (DPS) in tile world."; - - let arguments = (ins - PTODpsType:$lhs, - PTODpsType:$rhs, - PTODpsType:$dst - ); - - let results = (outs - Optional:$result - ); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $lhs `,` $rhs `:` qualified(type($lhs)) `,` qualified(type($rhs)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - ( `->` qualified(type($result))^ )? 
- }]; - - let extraClassDeclaration = [{ - static StringRef getOpName() { return "matmul_dps"; } - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_M; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMatmulAccOp : PTO_TOp<"tmatmul.acc", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "PTO matrix multiplication accumulate operation, destination-style (DPS, no bias)."; - - let arguments = (ins - PTODpsType:$acc_in, - PTODpsType:$lhs, - PTODpsType:$rhs, - PTODpsType:$dst - ); - - let results = (outs - Optional:$result - ); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $acc_in `,` $lhs `,` $rhs `:` qualified(type($acc_in) ) `,` qualified(type($lhs)) `,` qualified(type($rhs)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - ( `->` qualified(type($result))^ )? - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_M; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TGemvOp : PTO_TOp<"tgemv", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "PTO matrix-vector multiplication operation (optional bias), destination-style (DPS) in tile world."; - - let arguments = (ins - PTODpsType:$lhs, - PTODpsType:$rhs, - PTODpsType:$dst - ); - - let results = (outs - Optional:$result - ); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $lhs `,` $rhs `:` qualified(type($lhs)) `,` qualified(type($rhs)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - ( `->` qualified(type($result))^ )? 
- }]; - - let extraClassDeclaration = [{ - static StringRef getOpName() { return "gemv"; } - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_M; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TGemvAccOp : PTO_TOp<"tgemv.acc", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "PTO matrix-vector multiplication accumulate operation, destination-style (DPS, no bias)."; - - let arguments = (ins - PTODpsType:$acc_in, - PTODpsType:$lhs, - PTODpsType:$rhs, - PTODpsType:$dst - ); - - let results = (outs - Optional:$result - ); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $acc_in `,` $lhs `,` $rhs `:` qualified(type($acc_in) ) `,` qualified(type($lhs)) `,` qualified(type($rhs)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - ( `->` qualified(type($result))^ )? - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_M; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TGemvBiasOp : PTO_TOp<"tgemv.bias", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "GEMV with bias add (tile world, ins/outs)."; - - let arguments = (ins - PTODpsType :$a, - PTODpsType :$b, - PTODpsType :$bias, - PTODpsType :$dst - ); - - let results = (outs Optional:$result); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $a `,` $b `,` $bias `:` qualified(type($a)) `,` qualified(type($b)) `,` qualified(type($bias)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - ( `->` qualified(type($result))^ )? 
- }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "TGEMV_BIAS"; } - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_M; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMovOp : PTO_TOp<"tmov", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Move data between domains (DPS version)."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs - Optional:$result - ); - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - ( `->` qualified(type($result))^ )? - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { - // TMOV spans multiple hardware pipelines depending on the source/dest - // domains. Most tile-domain moves are executed by MTE1 (e.g. MAT->L0), - // while UB->UB copies are vector-pipe operations. - auto getASFromType = [](Type ty) - -> std::optional<::mlir::pto::AddressSpace> { - // Pre-lowering: tile_buf carries the address space in its memorySpace. - if (auto tb = llvm::dyn_cast<::mlir::pto::TileBufType>(ty)) { - if (auto as = llvm::dyn_cast_or_null<::mlir::pto::AddressSpaceAttr>( - tb.getMemorySpace())) - return as.getAddressSpace(); - return std::nullopt; - } - // Post PTOViewToMemref: tile_buf is erased to memref but memorySpace is - // preserved in memref's memorySpace attribute. 
- if (auto mr = llvm::dyn_cast<::mlir::MemRefType>(ty)) { - if (auto ms = mr.getMemorySpace()) { - if (auto as = - llvm::dyn_cast<::mlir::pto::AddressSpaceAttr>(ms)) - return as.getAddressSpace(); - } - return std::nullopt; - } - return std::nullopt; - }; - - auto sOpt = getASFromType(getSrc().getType()); - auto dOpt = getASFromType(getDst().getType()); - if (!sOpt.has_value() || !dOpt.has_value()) - return ::mlir::pto::PIPE::PIPE_V; - - const auto s = sOpt.value(); - const auto d = dOpt.value(); - - // UB -> UB copy is vector pipe. - if (s == ::mlir::pto::AddressSpace::VEC && d == ::mlir::pto::AddressSpace::VEC) { - return ::mlir::pto::PIPE::PIPE_V; - } - - // MAT -> L0 (Left/Right/Bias/Scaling) and ACC -> MAT are MTE1 moves. - if ((s == ::mlir::pto::AddressSpace::MAT && - (d == ::mlir::pto::AddressSpace::LEFT || d == ::mlir::pto::AddressSpace::RIGHT || - d == ::mlir::pto::AddressSpace::BIAS || d == ::mlir::pto::AddressSpace::SCALING)) || - (s == ::mlir::pto::AddressSpace::ACC && d == ::mlir::pto::AddressSpace::MAT)) { - return ::mlir::pto::PIPE::PIPE_MTE1; - } - - // Fallback: treat as vector pipe (safe default for most intra-domain moves). - return ::mlir::pto::PIPE::PIPE_V; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - - -//===----------------------------------------------------------------------===// -// Pointer Cast Op (existing) -//===----------------------------------------------------------------------===// - -def PointerCastOp : PTO_Op<"pointer_cast", [AttrSizedOperandSegments, Pure]> { - let summary = "Casts an integer address to a MemRef with optional valid dims"; - - // 参数定义 (保持 Optional) - let arguments = (ins - Variadic:$addrs, - Optional:$valid_row, - Optional:$valid_col, - OptionalAttr:$config - ); - - let results = (outs Res:$result); - - // Assembly Format (去掉了 []) - let assemblyFormat = [{ - `(` $addrs `)` ($valid_row^ `,` $valid_col)? 
attr-dict `:` qualified(type($result)) - }]; - - // Builder (修复 addOperand -> addOperands) - let builders = [ - OpBuilder<(ins "Type":$result, "ValueRange":$addrs, "Value":$vRow, "Value":$vCol, "Attribute":$config), [{ - $_state.addTypes(result); - $_state.addOperands(addrs); - // [关键修复] addOperand -> addOperands - if (vRow) $_state.addOperands(vRow); - if (vCol) $_state.addOperands(vCol); - if (config) $_state.addAttribute("config", config); - - int32_t addrsSize = addrs.size(); - int32_t vRowSize = vRow ? 1 : 0; - int32_t vColSize = vCol ? 1 : 0; - $_state.addAttribute("operandSegmentSizes", - $_builder.getDenseI32ArrayAttr({addrsSize, vRowSize, vColSize})); - }]> - ]; -} - -// ============================================================================= -// System/Runtime Query Ops -// ============================================================================= - -def GetBlockIdxOp : PTO_Op<"get_block_idx", [Pure]> { - let summary = "Get the current block index (core ID)."; - let description = [{ - Returns the linear index of the current compute unit (CUBE Core) within the task. - The return value is in the range [0, BlockNum - 1]. - }]; - - let arguments = (ins); - let results = (outs I64:$result); - - let assemblyFormat = "attr-dict"; -} - -def GetSubBlockIdxOp : PTO_Op<"get_subblock_idx", [Pure]> { - let summary = "Get the current vector core ID."; - let description = [{ - Returns the ID of the current compute unit (Vector Core). - The return value is in the range [0, 1]. - }]; - - let arguments = (ins); - let results = (outs I64:$result); - - let assemblyFormat = "attr-dict"; -} - -def GetBlockNumOp : PTO_Op<"get_block_num", [Pure]> { - let summary = "Get the total number of blocks (cores)."; - let description = [{ - Returns the total number of compute units (Blocks) configured for the current task. 
- }]; - - let arguments = (ins); - let results = (outs I64:$result); - - let assemblyFormat = "attr-dict"; -} - -def GetSubBlockNumOp : PTO_Op<"get_subblock_num", [Pure]> { - let summary = "Get the number of vector cores."; - let description = [{ - Returns the total number of vector compute units. - }]; - - let arguments = (ins); - let results = (outs I64:$result); - - let assemblyFormat = "attr-dict"; -} - -//===----------------------------------------------------------------------===// -// High-Level Synchronization Ops -//===----------------------------------------------------------------------===// - -def PTO_PipeEventTypeLikeAttr : AnyAttrOf<[PTO_PipeEventTypeAttr, PTO_SyncOpTypeAttr]>; - -def RecordEventOp : PTO_Op<"record_event"> { - let summary = "Record an event for synchronization (High Level)"; - let description = [{ - Records an event from a source operation type to a destination operation type. - Will be lowered to `pto.set_flag` based on op-to-pipe mapping. - }]; - let arguments = (ins - PTO_PipeEventTypeLikeAttr:$src_op, - PTO_PipeEventTypeLikeAttr:$dst_op, - PTO_EventAttr:$event_id - ); - let results = (outs); - let assemblyFormat = [{ - `[` $src_op `,` $dst_op `,` $event_id `]` attr-dict - }]; -} - -def WaitEventOp : PTO_Op<"wait_event"> { - let summary = "Wait for an event (High Level)"; - let description = [{ - Waits for an event from a source operation type to a destination operation type. - Will be lowered to `pto.wait_flag` based on op-to-pipe mapping. 
- }]; - let arguments = (ins - PTO_PipeEventTypeLikeAttr:$src_op, - PTO_PipeEventTypeLikeAttr:$dst_op, - PTO_EventAttr:$event_id - ); - let results = (outs); - let assemblyFormat = [{ - `[` $src_op `,` $dst_op `,` $event_id `]` attr-dict - }]; -} - -// High-Level Barrier (single pipe) with op type mapping -def BarrierSyncOp : PTO_Op<"barrier_sync"> { - let summary = "High-level barrier mapped from SyncOpType to PIPE"; - let description = [{ - A convenience barrier that specifies a SyncOpType instead of PIPE. The lowering - pass maps the op type to the corresponding hardware pipe and emits `pto.barrier`. - }]; - let arguments = (ins - PTO_SyncOpTypeAttr:$op_type - ); - let results = (outs); - let assemblyFormat = [{ - `[` $op_type `]` attr-dict - }]; -} - -//===----------------------------------------------------------------------===// -// Section Ops (Macros Containers) -//===----------------------------------------------------------------------===// - -class PTO_SectionOp - : PTO_Op { - let summary = "Container for core-specific code guarded by macros"; - let description = [{ - During conversion to EmitC, this op is lowered to: - emitc.verbatim("#if defined(MACRO)") - ... inlined body ... 
- emitc.verbatim("#endif") - }]; - - let regions = (region SizedRegion<1>:$body); - let assemblyFormat = "$body attr-dict"; -} - -def SectionCubeOp : PTO_SectionOp<"section.cube">; -def SectionVectorOp : PTO_SectionOp<"section.vector">; - -//===----------------------------------------------------------------------===// -// Synchronization Ops -//===----------------------------------------------------------------------===// - -def SetFlagOp : PTO_Op<"set_flag"> { - let summary = "Set synchronization flag between pipes"; - let arguments = (ins - PTO_PipeAttr:$src_pipe, - PTO_PipeAttr:$dst_pipe, - PTO_EventAttr:$event_id - ); - let results = (outs); - let assemblyFormat = [{ - `[` $src_pipe `,` $dst_pipe `,` $event_id `]` attr-dict - }]; -} - -def WaitFlagOp : PTO_Op<"wait_flag"> { - let summary = "Wait for synchronization flag"; - let arguments = (ins - PTO_PipeAttr:$src_pipe, - PTO_PipeAttr:$dst_pipe, - PTO_EventAttr:$event_id - ); - let results = (outs); - let assemblyFormat = [{ - `[` $src_pipe `,` $dst_pipe `,` $event_id `]` attr-dict - }]; -} - -//===----------------------------------------------------------------------===// -// Buffer-ID Synchronization (A5) -//===----------------------------------------------------------------------===// - -def GetBufOp : PTO_Op<"get_buf"> { - let summary = "Acquire a buffer-id token on a given pipe (A5)"; - let description = [{ - `pto.get_buf` participates in a buffer-id based ordering model. Operations - in the same pipe that are guarded by the same buffer-id are enforced to - execute in program order relative to other pipes using the same buffer-id. - - This op is intended to be lowered to the CCEC builtin intrinsic `get_buf`. 
- }]; - - let arguments = (ins - PTO_PipeAttr:$pipe, - I32Attr:$buf_id, - DefaultValuedAttr:$mode - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `[` $pipe `,` $buf_id `]` attr-dict - }]; -} - -def RlsBufOp : PTO_Op<"rls_buf"> { - let summary = "Release a buffer-id token on a given pipe (A5)"; - let description = [{ - Releases the previously acquired buffer-id token for the given pipe. - - This op is intended to be lowered to the CCEC builtin intrinsic `rls_buf`. - }]; - - let arguments = (ins - PTO_PipeAttr:$pipe, - I32Attr:$buf_id, - DefaultValuedAttr:$mode - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `[` $pipe `,` $buf_id `]` attr-dict - }]; -} - -def SyncSetOp : PTO_Op<"sync.set"> { - let summary = "Set a synchronization signal (trigger) between cube and vector."; - let description = [{ - Sets a synchronization signal on the specified pipeline stage. - Corresponds to `ffts_cross_core_sync` (A3) or `set_intra_block` (A5). - }]; - - let arguments = (ins - PTO_PipeAttr:$pipe, - I32Attr:$event_id - ); - - let assemblyFormat = "$pipe `,` $event_id attr-dict"; -} - -def SyncWaitOp : PTO_Op<"sync.wait"> { - let summary = "Wait for a synchronization signal (barrier) between cube and vector."; - let description = [{ - Waits for a synchronization signal on the specified pipeline stage. - Corresponds to `wait_flag_dev` (A3) or `wait_intra_block` (A5). 
- }]; - - let arguments = (ins - PTO_PipeAttr:$pipe, - I32Attr:$event_id - ); - - let assemblyFormat = "$pipe `,` $event_id attr-dict"; -} - -def BarrierOp : PTO_Op<"barrier"> { - let summary = "Intra-pipeline memory barrier"; - let arguments = (ins PTO_PipeAttr:$pipe); - let assemblyFormat = "$pipe attr-dict"; -} - - -//===----------------------------------------------------------------------===// -// FFT Configuration Operation -//===----------------------------------------------------------------------===// - -def SetFFTsOp : PTO_Op<"set_ffts", [MemoryEffects<[MemRead, MemWrite]>]> { - let summary = "Set FFTS/flags pointer for runtime (side-effecting)."; - let arguments = (ins AnyMemRef:$ffts); - let results = (outs); - - let assemblyFormat = "$ffts attr-dict `:` type($ffts)"; - - let hasVerifier = 1; -} - - -def PrintOp : PTO_Op<"print", [MemoryEffects<[MemRead, MemWrite]>]> { - let summary = "Print debug: format string (attribute) and scalar value."; - let description = [{ - Debug print op. First argument is the format string (string attribute), second is a scalar value. - Format in IR: `pto.print ins("format", %scalar : type(%scalar))` - }]; - let arguments = (ins StrAttr:$format, PrintScalarType:$scalar); - let results = (outs); - let assemblyFormat = [{ - `ins` `(` $format `,` $scalar `:` type($scalar) `)` - attr-dict - }]; -} - -def TrapOp : PTO_Op<"trap"> { - let summary = "Trap: abort execution (no operands)."; - let description = [{ - Inserts a trap to stop execution. No arguments, no results. - Format in IR: pto.trap - }]; - let arguments = (ins); - let results = (outs); - let assemblyFormat = "attr-dict"; -} - -// ---- tile-world TOp version (with 't') ---- -// pto.tmgather ins(%mem, %idx) outs(%dst) [ ...] -> tensor? 
-def MGatherOp : PTO_TOp<"mgather", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Gather-load elements from memory into a tile using per-element indices (tile world, ins/outs)."; - - let arguments = (ins - PTODpsType:$mem, - PTODpsType:$idx, - PTODpsType:$dst); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $mem `,` $idx `:` type($mem) `,` type($idx) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "MGATHER"; } - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE2; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - - - -// ---- tile-world TOp version ---- -// pto.tsetval ins(%offset, %val) outs(%dst) : PTODpsType, index, T -def TSetValOp : PTO_TOp<"tsetval", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Write a scalar value into a single element of dst at offset (tile world, ins/outs)."; - - let arguments = (ins - PTODpsType:$dst, - Index:$offset, - AnyType:$val - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $offset `,` $val `:` type($offset) `,` type($val) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "SETVAL"; } - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -// ---- tile-world TOp version ---- -// pto.tgetval ins(%src, %offset) outs(%dst) : PTODpsType, index -> T -def TGetValOp : PTO_TOp<"tgetval", [ - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Read a single element from tile-like src at offset into a scalar (tile world, ins/outs)."; - - let arguments = (ins - 
PTODpsType:$src, - Index:$offset - ); - - let results = (outs AnyType:$dst); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $offset `:` qualified(type($src)) `,` type($offset) `)` - `outs` `:` qualified(type($dst) ) - attr-dict - }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "GETVAL"; } - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - }]; -} - - -// ---- tile-world TOp version (with 't') ---- -// pto.mscatter ins(%src, %idx) outs(%mem) [ ...] -def MScatterOp : PTO_TOp<"mscatter", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Scatter-store elements from a tile into memory using per-element indices (tile world, ins/outs)."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$idx, - PTODpsType:$mem // outs target - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $idx `:` qualified(type($src)) `,` type($idx) `)` - `outs` `(` $mem `:` type($mem) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - static StringRef getIntrinsicName() { return "MSCATTER"; } - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE3; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getMemMutable(); } - }]; -} - -//===----------------------------------------------------------------------===// -// PTO_TOPs(Tilebuffer, DPS) -//===----------------------------------------------------------------------===// - -def TAbsOp : PTO_TOp<"tabs", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise absolute value of a tile "; - let description = [{ - For each element (i, j): dst[i,j] = |src[i,j]|. 
- }]; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_V; // Vector pipe for elementwise ops - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TAddOp : PTO_TOp<"tadd", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise add of two tiles "; - let description = [{ - For each element (i, j): dst[i,j] = src0[i,j] + src1[i,j]. - }]; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { - return ::mlir::pto::PIPE::PIPE_V; - } - - ::mlir::MutableOperandRange getDpsInitsMutable() { - return getDstMutable(); - } - }]; -} - -def TAddCOp : PTO_TOp<"taddc", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise ternary add of tiles "; - let description = [{ - For each element (i, j): dst[i,j] = src0[i,j] + src1[i,j] + src2[i,j]. 
- }]; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$src2, - PTODpsType:$dst - ); - - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `,` $src2 `:` - qualified(type($src0)) `,` qualified(type($src1)) `,` qualified(type($src2)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TAddSOp : PTO_TOp<"tadds", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise add a scalar to a tile "; - let description = [{ - For each element (i, j): dst[i,j] = src[i,j] + scalar. - Tile operands are tilebuffers; scalar is a builtin scalar type (e.g. f32). - }]; - - let arguments = (ins - PTODpsType:$src, - AnyType:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TAddSCOp : PTO_TOp<"taddsc", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "dst = src0 + scalar + src1 "; - let arguments = (ins - PTODpsType:$src0, - AnyType:$scalar, - PTODpsType:$src1, - PTODpsType:$dst - ); - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $scalar `,` $src1 `:` qualified(type($src0)) `,` type($scalar) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE 
getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - - -def TAndOp : PTO_TOp<"tand", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TAND(dst, src0, src1) bitwise-and on tiles"; - let arguments = (ins PTODpsType:$src0, PTODpsType:$src1, PTODpsType:$dst); - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TAndSOp : PTO_TOp<"tands", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TANDS(dst, src, scalar) bitwise-and tile with scalar"; - let arguments = (ins PTODpsType:$src, AnyType:$scalar, PTODpsType:$dst); - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - - -def TCIOp : PTO_TOp<"tci", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Generate contiguous integer sequence into dst tile."; - - let arguments = (ins - AnyInteger:$S, - PTODpsType:$dst, - DefaultValuedAttr:$descending - ); - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $S - attr-dict - `:` type($S) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE 
getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TCmpOp : PTO_TOp<"tcmp", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Compare two tiles and write a packed predicate mask"; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst, - OptionalAttr:$cmpMode - ); - let results = (outs); - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 - attr-dict - `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - }]; - - let builders = [ - OpBuilder<(ins "TypeRange":$res, "Value":$src0, "Value":$src1, "Value":$dst), [{ - build($_builder, $_state, res, src0, src1, dst, /*cmpMode=*/nullptr); - }]>]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TCmpSOp : PTO_TOp<"tcmps", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Compare scalar value against a tile and write a packed predicate mask"; - - let arguments = (ins - PTODpsType:$src, - AnyTypeOf<[AnyFloat, AnySignlessInteger, Index]>:$scalar, - DefaultValuedAttr:$cmpMode, - PTODpsType:$dst - ); - - let results = (outs); - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar - attr-dict - `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - - }]; -} - -def TColExpandOp : PTO_TOp<"tcolexpand", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Broadcast src(0, j) to all rows in 
column j "; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; -} - -def TColMaxOp : PTO_TOp<"tcolmax", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Reduce each column by taking the maximum across rows "; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; -} - -def TColMinOp : PTO_TOp<"tcolmin", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Reduce each column by taking the minimum across rows "; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; -} - -def TColSumOp : PTO_TOp<"tcolsum", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Reduce each column by summing across rows (tilebuf-based, explicit tmp)"; - - let arguments = (ins - 
PTODpsType:$src, - Optional:$tmp, - PTODpsType:$dst, - DefaultValuedOptionalAttr:$isBinary - ); - - let results = (outs); - - let hasVerifier = 1; - - let hasCustomAssemblyFormat = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { - return ::mlir::MutableOperandRange(getOperation(), 1, getOperation()->getNumOperands()); - } - }]; -} - -def TCvtOp : PTO_TOp<"tcvt", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise type conversion with rounding mode (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst, - DefaultValuedAttr:$rmode - ); - - let results = (outs); - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - - let assemblyFormat = [{ - `ins` `(` $src - attr-dict - `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - }]; -} - -def TDivOp : PTO_TOp<"tdiv", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise division of two tiles (tilebuf, DPS)"; - let description = [{ - For each element (i, j): dst[i,j] = src0[i,j] / src1[i,j]. - Division-by-zero behavior is target-defined. 
- }]; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - -} - -def TDivSOp : PTO_TOp<"tdivs", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise division with a scalar (tilebuf, DPS)"; - - let arguments = (ins - AnyType:$src, - AnyType:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let hasCustomAssemblyFormat = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - -} - -def TExpOp : PTO_TOp<"texp", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise exponential (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TExpandsOp : PTO_TOp<"texpands", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Broadcast scalar into dst (tilebuf, DPS)"; - - let arguments = (ins - AnyTypeOf<[F16, F32, I16, I32, I8, UI8, UI16, UI32]>:$scalar, - PTODpsType:$dst - ); - - let results = 
(outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $scalar `:` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TExtractOp : PTO_TOp<"textract", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Extract sub-tile window from src into dst (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - Index:$indexRow, - Index:$indexCol, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $indexRow `,` $indexCol `:` qualified(type($src)) `,` type($indexRow) `,` type($indexCol) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - // TEXTRACT moves data between memory domains (L1/cbuf -> L0A/L0B/L0C), - // which is executed by the MTE1 pipeline. 
- ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TFillPadOp : PTO_TOp<"tfillpad", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Copy src into dst and fill padded elements using dst PadVal (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TGatherOp : PTO_TOp<"tgather", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Gather/select elements using an index tile or a mask pattern (tilebuf, DPS)"; - - // --- operands (DPS): src0 + optional indices + outs(dst) --- - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst, - Optional:$indices, - OptionalAttr:$maskPattern - ); - - // --- DPS op: no SSA results --- - let results = (outs); - - let hasVerifier = 1; - let hasCustomAssemblyFormat = 1; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TGatherBOp : PTO_TOp<"tgatherb", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Gather elements using byte offsets (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$offsets, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $offsets `:` qualified(type($src)) `,` type($offsets) `)` - `outs` `(` $dst `:` 
qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TLogOp : PTO_TOp<"tlog", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise natural logarithm (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TLReluOp : PTO_TOp<"tlrelu", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Leaky ReLU with a scalar slope (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - F32:$slope, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $slope `:` qualified(type($src)) `,` type($slope) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMaxOp : PTO_TOp<"tmax", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise maximum of two tiles (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` 
qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMaxSOp : PTO_TOp<"tmaxs", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise max of a tile and a scalar (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - F32:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMinOp : PTO_TOp<"tmin", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise minimum of two tiles (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMinSOp : PTO_TOp<"tmins", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Elementwise minimum of a tile and a scalar (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - F32:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` 
qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMovFPOp : PTO_TOp<"tmov.fp", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TMOV_FP: move/convert using fp (scaling) tile (tilebuf, DPS)"; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$fp, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $fp `:` qualified(type($src)) `,` qualified(type($fp)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - // TMOV_FP is an ACC->MAT move (Cc->Cb) with vector quant parameters in - // SCALING (fbuf). Treat it as a data-movement op for sync insertion. - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -def TMrgSortOp: PTO_TOp<"tmrgsort", [ - AttrSizedOperandSegments, - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TMRGSORT: Merge sort (format1: ins(src,blockLen) out(dst); format2: ins(src0..src3) outs(dst,tmp,executed))."; - - let arguments = (ins - Variadic:$srcs, - Optional:$blockLen, - Variadic:$dsts, - Optional:$excuted, - DefaultValuedAttr:$exhausted - ); - - let results = (outs); - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - bool isFormat1() { return getSrcs().size() == 1u && getBlockLen() && getDsts().size() == 1u; } - bool isFormat2() { return getSrcs().size() == 4u && getDsts().size() == 2u && getExcuted(); } - Value getSrc() { return getSrcs().front(); } - Value getDst() { return getDsts().front(); } - Value getTmp() { return getDsts()[1]; } - 
::mlir::MutableOperandRange getDpsInitsMutable() { return getDstsMutable(); } - void print(::mlir::OpAsmPrinter &p); - static ::mlir::ParseResult parse(::mlir::OpAsmParser &parser, ::mlir::OperationState &result); - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - }]; -} - -def TMulOp: PTO_TOp<"tmul", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TMUL: Elementwise multiply of two tiles."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -//===----------------------------------------------------------------------===// -// PTOOps.td (add TMULS DPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TMulSOp: PTO_TOp<"tmuls", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TMULS: Elementwise multiply a tile by a scalar."; - - let arguments = (ins - PTODpsType:$src0, - F32:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $scalar `:` qualified(type($src0)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TNEG TBDPS/tile buffer op) 
-//===----------------------------------------------------------------------===// - -def TNegOp: PTO_TOp<"tneg", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TNEG: Elementwise negation of a tile."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TNOT TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TNotOp: PTO_TOp<"tnot", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TNOT: Elementwise bitwise NOT of a tile."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TOR TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TOrOp: PTO_TOp<"tor", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TOR: Elementwise bitwise OR of two tiles."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - 
); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TORS TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TOrSOp: PTO_TOp<"tors", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TORS: Elementwise bitwise OR of a tile and a scalar."; - - let arguments = (ins - PTODpsType:$src, - AnySignlessInteger:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TPARTADD TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TPartAddOp: PTO_TOp<"tpartadd", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TPARTADD: Partial elementwise add with implementation-defined handling of mismatched valid regions."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` 
qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TPARTMAX TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TPartMaxOp: PTO_TOp<"tpartmax", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Partial elementwise max with implementation-defined handling of mismatched valid regions."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TPARTMIN TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TPartMinOp: PTO_TOp<"tpartmin", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "Partial elementwise min with implementation-defined handling of mismatched valid regions."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` 
qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TPRELU TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TPReluOp: PTO_TOp<"tprelu", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TPRELU: Elementwise PReLU (parametric ReLU) with a per-element slope tile."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TRECIP TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRecipOp: PTO_TOp<"trecip", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TRECIP: Elementwise reciprocal of a tile."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } 
- }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TRELU TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TReluOp: PTO_TOp<"trelu", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TRELU: Elementwise ReLU of a tile."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TREM TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRemOp: PTO_TOp<"trem", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TREM: Elementwise remainder of two tiles."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TREMS TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRemSOp: PTO_TOp<"trems", [ - 
PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TREMS: Elementwise remainder with a scalar"; - - let arguments = (ins - PTODpsType:$src, - F32:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (TRESHAPE: SSA view op; aliases src storage) -//===----------------------------------------------------------------------===// - -def TReshapeOp: PTO_TOp<"treshape", [ - OpPipeInterface, - Pure, - ViewLikeOpInterface -]> { - let summary = "TRESHAPE: Reinterpret a tile buffer view (SSA; aliases src storage)"; - - let arguments = (ins - PTODpsType:$src - ); - - let results = (outs - PTODpsType:$result - ); - - let hasVerifier = 1; - - let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))"; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::Value getViewSource() { return getSrc(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TROWEXPAND TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRowExpandOp: PTO_TOp<"trowexpand", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TROWEXPAND: Broadcast the first element of each source row across the destination row."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let 
assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TROWEXPANDDIV TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRowExpandDivOp: PTO_TOp<"trowexpanddiv", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TROWEXPANDDIV: Row-wise broadcast divide: divide each row of src0 by a per-row scalar vector src1."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TROWEXPANDMUL TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRowExpandMulOp: PTO_TOp<"trowexpandmul", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TROWEXPANDMUL: Row-wise broadcast divide: divide each row of src0 by a per-row scalar vector src1."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` 
qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TROWEXPANDSUB TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRowExpandSubOp: PTO_TOp<"trowexpandsub", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TROWEXPANDSUB: Row-wise broadcast subtract: subtract a per-row scalar vector src1 from each row of src0."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TROWMAX TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRowMaxOp: PTO_TOp<"trowmax", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TROWMAX: Reduce each row by taking the maximum across columns."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$tmp, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $tmp `:` qualified(type($src)) `,` qualified(type($tmp)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - 
let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TROWMIN TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRowMinOp: PTO_TOp<"trowmin", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TROWMIN: Reduce each row by taking the minimum across columns."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$tmp, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $tmp `:` qualified(type($src)) `,` qualified(type($tmp)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TROWSUM TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRowSumOp: PTO_TOp<"trowsum", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TROWSUM: Reduce each row by summing across columns."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$tmp, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $tmp `:` qualified(type($src)) `,` qualified(type($tmp)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return 
getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TRSQRT TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TRsqrtOp: PTO_TOp<"trsqrt", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TRSQRT: Elementwise reciprocal square root."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSCATTER TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TScatterOp: PTO_TOp<"tscatter", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSCATTER: Scatter elements of a source tile into a destination tile using per-element indices."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$indexes, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $indexes `:` qualified(type($src)) `,` qualified(type($indexes) ) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { - // NOTE: On dav-c220 (Ascend910 A2/A3), pto-isa implements TSCATTER as a - // scalar loop over UB pointers, which executes on the scalar pipeline - // (PIPE_S). 
Waiting on PIPE_V does not block scalar UB accesses and can - // lead to using uninitialized indices/data (crash / aivec exception). - // - // On A5 instruction set devices, TSCATTER is implemented with vector - // scatter instructions and should be treated as PIPE_V. - auto moduleOp = getOperation()->getParentOfType<::mlir::ModuleOp>(); - if (moduleOp) { - if (auto spec = moduleOp->getAttrOfType<::mlir::StringAttr>("pto.device-spec")) { - auto s = spec.getValue(); - if (s.starts_with("Ascend950") || s.starts_with("Ascend910_95")) { - return ::mlir::pto::PIPE::PIPE_V; - } - } - } - return ::mlir::pto::PIPE::PIPE_S; - } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSEL TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSelOp: PTO_TOp<"tsel", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSEL: Select between two tiles using a mask tile (per-element selection)."; - - let arguments = (ins - PTODpsType:$mask, - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $mask `,` $src0 `,` $src1 `:` qualified(type($mask)) `,` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSELS TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSelSOp: PTO_TOp<"tsels", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - 
DeclareOpInterfaceMethods -]> { - let summary = "TSELS: Select one of two source tiles using a scalar selectMode (global select)."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - AnyInteger:$selectMode, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `,` $selectMode `:` qualified(type($src0)) `,` qualified(type($src1)) `,` type($selectMode) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSHL TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TShlOp: PTO_TOp<"tshl", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSHL: Elementwise shift-left of two tiles."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSHR TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TShrOp: PTO_TOp<"tshr", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSHR: Elementwise shift-right of two tiles."; - - let 
arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSHLS/TSHRS TBDPS: shift-left/right by scalar) -//===----------------------------------------------------------------------===// - -def TShlSOp : PTO_TOp<"tshls", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSHLS: Elementwise shift-left of a tile by a scalar (shift count)."; - let arguments = (ins - PTODpsType:$src, - AnyTypeOf<[AnyFloat, AnySignlessInteger, Index]>:$scalar, - PTODpsType:$dst - ); - let results = (outs); - let hasVerifier = 1; - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; -} - -def TShrSOp : PTO_TOp<"tshrs", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSHRS: Elementwise shift-right of a tile by a scalar (shift count)."; - let arguments = (ins - PTODpsType:$src, - AnyTypeOf<[AnyFloat, AnySignlessInteger, Index]>:$scalar, - PTODpsType:$dst - ); - let results = (outs); - let hasVerifier = 1; - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return 
getDstMutable(); } - }]; - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSORT32 TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSort32Op: PTO_TOp<"tsort32", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSORT32: Sort a fixed-size 32-element block and produce an index mapping."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst, - PTODpsType:$idx - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `,` $idx `:` qualified(type($dst) ) `,` qualified(type($idx)) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSQRT TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSqrtOp: PTO_TOp<"tsqrt", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSQRT: Elementwise square root."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - 
-//===----------------------------------------------------------------------===// -// PTOOps.td (add TSTORE_FP TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TStoreFPOp: PTO_TOp<"tstore_fp", [ - PTO_DpsInitOpInterface, -]> { - let summary = "TSTORE_FP: Store an accumulator tile into global memory using a scaling (fp) tile for vector quantization parameters."; - - let arguments = (ins - PTODpsType:$src, - PTODpsType:$fp, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $fp `:` qualified(type($src)) `,` qualified(type($fp)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - let extraClassDeclaration = [{ - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSUB TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSubOp: PTO_TOp<"tsub", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSUB: Elementwise subtract of two tiles."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSUBC TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSubCOp: PTO_TOp<"tsubc", [ - 
PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSUBC: Elementwise ternary op: src0 - src1 + src2."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$src2, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `,` $src2 `:` - qualified(type($src0)) `,` qualified(type($src1)) `,` qualified(type($src2)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - - -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSUBS TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSubSOp: PTO_TOp<"tsubs", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSUBS: Elementwise subtract a scalar from a tile."; - - let arguments = (ins - PTODpsType:$src, - F32:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - - -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSUBSC TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSubSCOp: PTO_TOp<"tsubsc", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TSUBSC: Elementwise fused op: src0 - 
scalar + src1."; - - let arguments = (ins - PTODpsType:$src0, - F32:$scalar, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $scalar `,` $src1 `:` - qualified(type($src0)) `,` type($scalar) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - -} - -//===----------------------------------------------------------------------===// -// PTOOps.td (add TXORS TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TXorSOp: PTO_TOp<"txors", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TXORS: Elementwise bitwise XOR of a tile and a scalar."; - - let arguments = (ins - PTODpsType:$src, - AnyInteger:$scalar, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `,` $scalar `:` qualified(type($src)) `,` type($scalar) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - -} -//===----------------------------------------------------------------------===// -// PTOOps.td (add TSYNC TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TSyncOp: PTO_TOp<"tsync", [ - PTO_DpsInitOpInterface, -]> { - let summary = "TSYNC: Synchronize PTO execution with event tokens or single-op barrier."; - - let arguments = (ins - PTODpsType:$events, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = 
[{ - `ins` `(` $events `:` type($events) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - let extraClassDeclaration = [{ - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; -} - -//===----------------------------------------------------------------------===// -// PTOOps.td (add TXOR TBDPS/tile buffer op) -//===----------------------------------------------------------------------===// - -def TXorOp: PTO_TOp<"txor", [ - PTO_DpsInitOpInterface, - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TXOR: Elementwise bitwise XOR of two tiles."; - - let arguments = (ins - PTODpsType:$src0, - PTODpsType:$src1, - PTODpsType:$dst - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src0 `,` $src1 `:` qualified(type($src0)) `,` qualified(type($src1)) `)` - `outs` `(` $dst `:` qualified(type($dst) ) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } - }]; - -} - -def TPrintOp: PTO_TOp<"tprint", [ - OpPipeInterface, - DeclareOpInterfaceMethods -]> { - let summary = "TPRINT: Print the contents of a Tile or GlobalTensor for debugging purposes directly from device code.."; - - let arguments = (ins - PTODpsType:$src - ); - - let results = (outs); - - let hasVerifier = 1; - - let assemblyFormat = [{ - `ins` `(` $src `:` qualified(type($src)) `)` - attr-dict - }]; - - let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_V; } - }]; - } - -#endif // MLIR_DIALECT_PTO_IR_PTOOPS diff --git a/.agent/skills/translate_cpp2py/references/ptoas_source/PTOToEmitC.cpp b/.agent/skills/translate_cpp2py/references/ptoas_source/PTOToEmitC.cpp deleted file mode 100644 index 45b1798d..00000000 --- a/.agent/skills/translate_cpp2py/references/ptoas_source/PTOToEmitC.cpp +++ /dev/null @@ -1,7713 
+0,0 @@ -//===- PTOToEmitC.cpp - PTO to EmitC conversion pass ----------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "PTO/IR/PTO.h" -#include "PTO/Transforms/Passes.h" - -#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" -#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h" -#include "mlir/Analysis/DataFlowFramework.h" - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/EmitC/IR/EmitC.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" - -#include "mlir/IR/AffineExpr.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/TypeRange.h" - -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Target/Cpp/CppEmitter.h" -#include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/Func/Transforms/FuncConversions.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Conversion/SCFToEmitC/SCFToEmitC.h" -#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" - -#include -#include -#include -#include -namespace mlir { -#define GEN_PASS_DEF_EMITPTOMANUAL -#include "PTO/Transforms/Passes.h.inc" -} // namespace mlir - -using namespace mlir; -using namespace mlir::pto; - -static const char *addrSpaceQualifier(pto::AddressSpace as) { - switch (as) { - case pto::AddressSpace::Zero: - return "__gm__"; - case 
pto::AddressSpace::VEC: - return "__ubuf__"; - case pto::AddressSpace::GM: - return "__gm__"; - case pto::AddressSpace::MAT: - return "__cbuf__"; - case pto::AddressSpace::LEFT: - return "__ca__"; - case pto::AddressSpace::RIGHT: - return "__cb__"; - case pto::AddressSpace::ACC: - return "__cc__"; - case pto::AddressSpace::BIAS: - // Bias tiles are special in pto-isa; keep a safe fallback qualifier. - return "__gm__"; - case pto::AddressSpace::SCALING: - // pto-isa TileType::Scaling maps to __fbuf__ (see pto/common/memory.hpp). - return "__fbuf__"; - } - return "__gm__"; -} - -static Value peelUnrealized(Value v) { - if (auto castOp = v.getDefiningOp()) - return castOp.getOperand(0); - return v; -} - -static std::optional getLayoutAttrFromOp(Operation *op) { - if (!op) - return std::nullopt; - if (auto attr = op->getAttrOfType("layout")) - return attr.getLayout(); - return std::nullopt; -} - -static std::optional resolveLayoutFromValueChain(Value v) { - v = peelUnrealized(v); - while (Operation *def = v.getDefiningOp()) { - if (auto layout = getLayoutAttrFromOp(def)) - return layout; - if (auto subview = dyn_cast(def)) { - v = peelUnrealized(subview.getSource()); - continue; - } - if (auto reinterpret = dyn_cast(def)) { - v = peelUnrealized(reinterpret.getSource()); - continue; - } - if (auto cast = dyn_cast(def)) { - v = peelUnrealized(cast.getSource()); - continue; - } - if (auto unrealized = dyn_cast(def)) { - if (unrealized->getNumOperands() == 0) - break; - v = peelUnrealized(unrealized.getOperand(0)); - continue; - } - break; - } - return std::nullopt; -} - -static std::optional -resolveLayoutForGlobalTensor(Operation *anchor, Value basePtr) { - if (auto layout = getLayoutAttrFromOp(anchor)) - return layout; - return resolveLayoutFromValueChain(basePtr); -} - -static std::string layoutToEmitCString(mlir::pto::Layout layout) { - switch (layout) { - case mlir::pto::Layout::ND: - return "pto::Layout::ND"; - case mlir::pto::Layout::DN: - return "pto::Layout::DN"; 
- case mlir::pto::Layout::NZ: - return "pto::Layout::NZ"; - } - return "pto::Layout::ND"; -} - -//===----------------------------------------------------------------------===// -// Type Converter -//===----------------------------------------------------------------------===// - -class PTOToEmitCTypeConverter : public TypeConverter { -public: - PTOToEmitCTypeConverter(MLIRContext *Ctx) { - // --------------------------------------------------------- - // 1. 基本类型 (f32, i32, index) - // --------------------------------------------------------- - addConversion([Ctx](FloatType type) -> Type { - if (type.isF32()) return emitc::OpaqueType::get(Ctx, "float"); - if (type.isF16()) return emitc::OpaqueType::get(Ctx, "half"); - if (type.isBF16()) return emitc::OpaqueType::get(Ctx, "bfloat16_t"); - if (type.isF64()) return emitc::OpaqueType::get(Ctx, "double"); - llvm::errs() << "[Debug] Unsupported FloatType: " << type << "\n"; - return Type{}; - }); - - addConversion([Ctx](IntegerType type) -> Type { - // [关键修改] i1 保持为 i1,不要转为 emitc.opaque<"bool"> - // 这样 emitc.if (接受 i1) 就不会报错。 - // 在打印 C++ 代码时,i1 会自动打印为 bool。 - //if (type.getWidth() == 1) return IntegerType::get(Ctx, 1); - if (type.getWidth() == 1) return type; // <--- 保持 i1 不变 - - // Prefer fixed-width C types. Preserve signedness if the MLIR integer is - // explicitly signed/unsigned; treat signless as signed by default. - const bool isUnsigned = type.isUnsignedInteger(); - switch (type.getWidth()) { - case 8: - return emitc::OpaqueType::get(Ctx, isUnsigned ? "uint8_t" : "int8_t"); - case 16: - return emitc::OpaqueType::get(Ctx, - isUnsigned ? "uint16_t" : "int16_t"); - case 32: - return emitc::OpaqueType::get(Ctx, - isUnsigned ? "uint32_t" : "int32_t"); - case 64: - return emitc::OpaqueType::get(Ctx, - isUnsigned ? 
"uint64_t" : "int64_t"); - default: - llvm::errs() << "[Debug] Unsupported IntegerType width: " - << type.getWidth() << "\n"; - return emitc::OpaqueType::get(Ctx, "int32_t"); // Fallback - } - }); - - addConversion([Ctx](IndexType type) -> Type { - return emitc::OpaqueType::get(Ctx, "int32_t"); - }); - - // vector<4xi16> (e.g. TMRGSORT executedNumList) -> pto::MrgSortExecutedNumList - addConversion([Ctx](VectorType type) -> Type { - if (type.getRank() == 1 && type.getNumElements() == 4 && - type.getElementType().isInteger(16)) - return emitc::OpaqueType::get(Ctx, "pto::MrgSortExecutedNumList"); - return Type{}; - }); - - // --------------------------------------------------------- - // 2. PTO 特殊类型 (透传或转换) - // --------------------------------------------------------- - addConversion([Ctx](emitc::OpaqueType type) { return type; }); - addConversion([Ctx](emitc::PointerType type) { return type; }); - - // --------------------------------------------------------- - // 2.5 PtrType 转换 (指针类型) - // --------------------------------------------------------- - addConversion([this, Ctx](pto::PtrType type) -> std::optional { - Type elemType = type.getElementType(); - Type newElemType = convertType(elemType); - if (!newElemType) - return std::nullopt; - - std::string elemTypeStr; - if (auto opq = dyn_cast(newElemType)) { - elemTypeStr = opq.getValue().str(); - } else { - llvm::errs() << " [Error] PtrType elem type is not OpaqueType: " - << newElemType << "\n"; - return std::nullopt; - } - - std::string qualifier = "__gm__"; - - std::string finalTypeStr = qualifier + " " + elemTypeStr; - return emitc::PointerType::get( - emitc::OpaqueType::get(Ctx, finalTypeStr)); - }); - - // --------------------------------------------------------- - // 3. MemRef 转换 (Debug 重点) - // --------------------------------------------------------- - addConversion([this, Ctx](MemRefType type) -> std::optional { - llvm::errs() << "[Debug] Converting MemRef: " << type << "\n"; - - // A. 
转换元素类型 - Type elemType = type.getElementType(); - Type newElemType = convertType(elemType); - if (!newElemType) { - llvm::errs() << " [Error] Failed to convert element type: " << elemType << "\n"; - return std::nullopt; - } - - // 获取元素类型的字符串 - std::string elemTypeStr; - if (auto opq = dyn_cast(newElemType)) { - elemTypeStr = opq.getValue().str(); - } else { - llvm::errs() << " [Error] Converted element type is not OpaqueType: " << newElemType << "\n"; - return std::nullopt; - } - - // B. 处理 Memory Space - std::string qualifier = ""; - Attribute memorySpace = type.getMemorySpace(); - - if (!memorySpace) { - qualifier = "__gm__"; - } else if (auto ptoAttr = dyn_cast(memorySpace)) { - qualifier = addrSpaceQualifier(ptoAttr.getAddressSpace()); - } else { - llvm::errs() << " [Warning] Unknown MemorySpace Attribute type: " << memorySpace << "\n"; - qualifier = "__gm__"; // Fallback - } - - std::string finalTypeStr = qualifier + " " + elemTypeStr; - llvm::errs() << " [Success] -> " << finalTypeStr << "*\n"; - - return emitc::PointerType::get(emitc::OpaqueType::get(Ctx, finalTypeStr)); - }); - - // --------------------------------------------------------- - // 4. Function & Materialization - // --------------------------------------------------------- - addConversion([this](FunctionType type) -> Type { - SmallVector inputs; - if (failed(convertTypes(type.getInputs(), inputs))) return Type{}; - SmallVector results; - if (failed(convertTypes(type.getResults(), results))) return Type{}; - return FunctionType::get(type.getContext(), inputs, results); - }); - - auto materializeCast = [](OpBuilder &Builder, Type ResultType, - ValueRange Inputs, Location Loc) -> Value { - if (Inputs.size() != 1) return Value(); - return Builder.create(Loc, ResultType, Inputs[0]).getResult(0); - }; - - addSourceMaterialization(materializeCast); - addTargetMaterialization(materializeCast); - // Needed for region/block signature conversions (e.g. CFG block args). 
- addArgumentMaterialization(materializeCast); - } -}; - -static constexpr unsigned kPTOIndexBitWidth = - 32; // keep consistent with IndexType conversion - -// Forward declarations (definitions below). -static emitc::OpaqueType getSignedIntOpaqueType(MLIRContext *ctx, - unsigned bitWidth); -static emitc::OpaqueType getUnsignedIntOpaqueType(MLIRContext *ctx, - unsigned bitWidth); -static emitc::OpaqueType getWiderSignedIntOpaqueType(MLIRContext *ctx, - unsigned bitWidth); -static emitc::OpaqueType getWiderUnsignedIntOpaqueType(MLIRContext *ctx, - unsigned bitWidth); -static Value makeEmitCOpaqueConstant(ConversionPatternRewriter &rewriter, - Location loc, Type type, - llvm::StringRef literal); -static Value makeEmitCIntConstant(ConversionPatternRewriter &rewriter, - Location loc, Type type, int64_t value); -static Value emitCCast(ConversionPatternRewriter &rewriter, Location loc, - Type dstType, Value src); -static Value castSignlessIntToUnsignedSameWidth(ConversionPatternRewriter &rewriter, - Location loc, Value v, - unsigned bitWidth); - -//===----------------------------------------------------------------------===// -// Arith -> EmitC (full dialect coverage for scalar ops) -//===----------------------------------------------------------------------===// - -template -struct ArithSimpleBinaryToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Type dstTy = this->getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getOperands()); - return success(); - } -}; - -// Integer bitwise ops (andi/ori/xori) on signless integers: perform in unsigned -// to avoid signedness pitfalls, then cast back. 
-template -struct ArithUnsignedBitwiseBinaryToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = this->getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - if (bitWidth == 1) { - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getLhs(), - adaptor.getRhs()); - return success(); - } - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value resU = rewriter.create(loc, uTy, lhsU, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, resU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithDivUIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::DivUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value divU = rewriter.create(loc, uTy, lhsU, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, divU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithRemUIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::RemUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value remU = rewriter.create(loc, uTy, lhsU, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, remU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithCeilDivUIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::CeilDivUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value one = makeEmitCIntConstant(rewriter, loc, uTy, 1); - Value rhsMinusOne = rewriter.create(loc, uTy, rhsU, one); - Value num = rewriter.create(loc, uTy, lhsU, rhsMinusOne); - Value divU = rewriter.create(loc, uTy, num, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, divU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithCeilDivSIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::CeilDivSIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - Value zero = makeEmitCIntConstant(rewriter, loc, dstTy, 0); - Value one = makeEmitCIntConstant(rewriter, loc, dstTy, 1); - - Value q0 = rewriter.create(loc, dstTy, adaptor.getLhs(), - adaptor.getRhs()); - Value r = rewriter.create(loc, dstTy, adaptor.getLhs(), - adaptor.getRhs()); - - Value rNeZero = rewriter.create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::ne, r, - zero); - Value lhsLt0 = - rewriter.create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, adaptor.getLhs(), - zero); - Value rhsLt0 = - rewriter.create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, adaptor.getRhs(), - zero); - Value signsSame = - rewriter.create(loc, 
rewriter.getI1Type(), - emitc::CmpPredicate::eq, lhsLt0, rhsLt0); - Value adjust = - rewriter.create(loc, rewriter.getI1Type(), - rNeZero, signsSame); - - Value qPlusOne = rewriter.create(loc, dstTy, q0, one); - Value result = rewriter.create(loc, dstTy, adjust, - qPlusOne, q0); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithFloorDivSIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::FloorDivSIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - Value zero = makeEmitCIntConstant(rewriter, loc, dstTy, 0); - Value one = makeEmitCIntConstant(rewriter, loc, dstTy, 1); - - Value q0 = rewriter.create(loc, dstTy, adaptor.getLhs(), - adaptor.getRhs()); - Value r = rewriter.create(loc, dstTy, adaptor.getLhs(), - adaptor.getRhs()); - - Value rNeZero = rewriter.create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::ne, r, - zero); - Value lhsLt0 = - rewriter.create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, adaptor.getLhs(), - zero); - Value rhsLt0 = - rewriter.create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, adaptor.getRhs(), - zero); - Value signsDifferent = - rewriter.create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::ne, lhsLt0, rhsLt0); - Value adjust = - rewriter.create(loc, rewriter.getI1Type(), - rNeZero, signsDifferent); - - Value qMinusOne = rewriter.create(loc, dstTy, q0, one); - Value result = rewriter.create(loc, dstTy, adjust, - qMinusOne, q0); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithShiftLeftToEmitC : public OpConversionPattern { - using 
OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::ShLIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - if (bitWidth == 1) { - // Compute on u8 and truncate to i1. - auto u8Ty = getUnsignedIntOpaqueType(rewriter.getContext(), 8); - Value lhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getLhs()); - Value rhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getRhs()); - Value sh = rewriter.create(loc, u8Ty, lhsU8, - rhsU8); - Value masked = - rewriter.create(loc, u8Ty, sh, - makeEmitCIntConstant(rewriter, loc, - u8Ty, 1)); - rewriter.replaceOp(op, emitCCast(rewriter, loc, dstTy, masked)); - return success(); - } - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value shU = - rewriter.create(loc, uTy, lhsU, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, shU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithShiftRightUIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::ShRUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - 
const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - if (bitWidth == 1) { - // (x >> y) on i1 is either x (y==0) or 0 (y!=0); approximate in u8. - auto u8Ty = getUnsignedIntOpaqueType(rewriter.getContext(), 8); - Value lhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getLhs()); - Value rhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getRhs()); - Value sh = rewriter.create(loc, u8Ty, lhsU8, - rhsU8); - Value masked = - rewriter.create(loc, u8Ty, sh, - makeEmitCIntConstant(rewriter, loc, - u8Ty, 1)); - rewriter.replaceOp(op, emitCCast(rewriter, loc, dstTy, masked)); - return success(); - } - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value shU = - rewriter.create(loc, uTy, lhsU, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, shU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithShiftRightSIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::ShRSIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - if (bitWidth == 1) { - // (x >> y) on i1 is either x (y==0) or 0 (y!=0); approximate in u8. 
- auto u8Ty = getUnsignedIntOpaqueType(rewriter.getContext(), 8); - Value lhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getLhs()); - Value rhsU8 = emitCCast(rewriter, loc, u8Ty, adaptor.getRhs()); - Value sh = rewriter.create(loc, u8Ty, lhsU8, - rhsU8); - Value masked = - rewriter.create(loc, u8Ty, sh, - makeEmitCIntConstant(rewriter, loc, - u8Ty, 1)); - rewriter.replaceOp(op, emitCCast(rewriter, loc, dstTy, masked)); - return success(); - } - - // Signed arithmetic shift; cast RHS to unsigned to interpret shift amount. - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value sh = - rewriter.create(loc, dstTy, adaptor.getLhs(), - rhsU); - rewriter.replaceOp(op, sh); - return success(); - } -}; - -struct ArithNegFToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::NegFOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getOperand()); - return success(); - } -}; - -struct ArithRemFToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::RemFOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - - // Use builtin `fmod` when possible. For f16, compute in float and cast back. 
- Type callTy = dstTy; - Value lhs = adaptor.getLhs(); - Value rhs = adaptor.getRhs(); - - if (auto opFloatTy = dyn_cast(op.getType())) { - if (opFloatTy.isF16()) { - auto f32Ty = emitc::OpaqueType::get(rewriter.getContext(), "float"); - lhs = emitCCast(rewriter, loc, f32Ty, lhs); - rhs = emitCCast(rewriter, loc, f32Ty, rhs); - callTy = f32Ty; - } - } - - // Prefer `__builtin_fmod*` to avoid relying on extra headers. - llvm::StringRef callee = "__builtin_fmod"; - if (auto opFloatTy = dyn_cast(op.getType())) { - if (opFloatTy.isF32() || opFloatTy.isF16()) - callee = "__builtin_fmodf"; - else if (opFloatTy.isF64()) - callee = "__builtin_fmod"; - } - - auto call = rewriter.create( - loc, TypeRange{callTy}, callee, ValueRange{lhs, rhs}, - /*args=*/ArrayAttr{}, /*template_args=*/ArrayAttr{}); - Value result = call.getResult(0); - if (callTy != dstTy) - result = emitCCast(rewriter, loc, dstTy, result); - - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithSelectToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::SelectOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!op.getCondition().getType().isInteger(1)) - return rewriter.notifyMatchFailure( - op, "only scalar i1 conditions supported for arith.select"); - - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - - auto cond = - rewriter.create(op.getLoc(), dstTy, - adaptor.getCondition(), - adaptor.getTrueValue(), - adaptor.getFalseValue()); - rewriter.replaceOp(op, cond.getResult()); - return success(); - } -}; - -struct ArithExtUIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::ExtUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto dstIntTy = dyn_cast(op.getType()); - auto srcIntTy = 
dyn_cast(op.getIn().getType()); - if (!dstIntTy || !srcIntTy) - return rewriter.notifyMatchFailure(op, "expected scalar integer types"); - - Type dstTy = getTypeConverter()->convertType(dstIntTy); - if (!dstTy) - return failure(); - - // i1 -> iN: bool to integer already behaves as 0/1. - if (srcIntTy.getWidth() == 1) { - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); - return success(); - } - - auto uSrcTy = - getUnsignedIntOpaqueType(rewriter.getContext(), srcIntTy.getWidth()); - auto uDstTy = - getUnsignedIntOpaqueType(rewriter.getContext(), dstIntTy.getWidth()); - Value srcU = - castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getIn(), - srcIntTy.getWidth()); - Value extU = emitCCast(rewriter, loc, uDstTy, srcU); - Value result = emitCCast(rewriter, loc, dstTy, extU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithExtSIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::ExtSIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto dstIntTy = dyn_cast(op.getType()); - auto srcIntTy = dyn_cast(op.getIn().getType()); - if (!dstIntTy || !srcIntTy) - return rewriter.notifyMatchFailure(op, "expected scalar integer types"); - - Type dstTy = getTypeConverter()->convertType(dstIntTy); - if (!dstTy) - return failure(); - - // i1 sign-extension: 0 -> 0, 1 -> -1. 
- if (srcIntTy.getWidth() == 1) { - Value zero = makeEmitCIntConstant(rewriter, loc, dstTy, 0); - Value asInt = emitCCast(rewriter, loc, dstTy, adaptor.getIn()); - Value neg = rewriter.create(loc, dstTy, zero, asInt).getResult(); - rewriter.replaceOp(op, neg); - return success(); - } - - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); - return success(); - } -}; - -template -struct ArithCastToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(CastOp op, typename CastOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Type dstTy = this->getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); - return success(); - } -}; - -struct ArithIndexCastUIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::IndexCastUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - - // MemRef casts are handled elsewhere; for safety, fall back to emitc.cast. 
- if (isa(op.getIn().getType()) || isa(op.getType())) { - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); - return success(); - } - - auto getBW = [](Type t) -> std::optional { - if (auto i = dyn_cast(t)) - return i.getWidth(); - if (isa(t)) - return kPTOIndexBitWidth; - return std::nullopt; - }; - - auto srcBW = getBW(op.getIn().getType()); - auto dstBW = getBW(op.getType()); - if (!srcBW || !dstBW) - return rewriter.notifyMatchFailure(op, "unsupported index_castui types"); - - if (*dstBW <= *srcBW) { - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); - return success(); - } - - auto uSrcTy = getUnsignedIntOpaqueType(rewriter.getContext(), *srcBW); - auto uDstTy = getUnsignedIntOpaqueType(rewriter.getContext(), *dstBW); - Value srcU = emitCCast(rewriter, loc, uSrcTy, adaptor.getIn()); - Value extU = emitCCast(rewriter, loc, uDstTy, srcU); - Value result = emitCCast(rewriter, loc, dstTy, extU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithUIToFPToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::UIToFPOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto srcIntTy = dyn_cast(op.getIn().getType()); - if (!srcIntTy) - return rewriter.notifyMatchFailure(op, "expected scalar integer input"); - - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - - // Convert via an unsigned integer type of the same width. 
- if (srcIntTy.getWidth() == 1) { - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); - return success(); - } - auto uSrcTy = - getUnsignedIntOpaqueType(rewriter.getContext(), srcIntTy.getWidth()); - Value srcU = - castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getIn(), - srcIntTy.getWidth()); - Value fp = rewriter.create(loc, dstTy, srcU).getResult(); - rewriter.replaceOp(op, fp); - return success(); - } -}; - -struct ArithFPToUIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::FPToUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto dstIntTy = dyn_cast(op.getType()); - if (!dstIntTy) - return rewriter.notifyMatchFailure(op, "expected scalar integer result"); - - Type dstTy = getTypeConverter()->convertType(dstIntTy); - if (!dstTy) - return failure(); - - auto uDstTy = - getUnsignedIntOpaqueType(rewriter.getContext(), dstIntTy.getWidth()); - Value asU = rewriter.create(loc, uDstTy, adaptor.getIn()).getResult(); - Value result = emitCCast(rewriter, loc, dstTy, asU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithBitcastToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::BitcastOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - - // For pointer-like types, a regular cast is fine. - if (isa(dstTy)) { - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); - return success(); - } - - // Only support scalar int/float/index bitcasts here. 
- auto srcTy = op.getIn().getType(); - auto dstOrigTy = op.getType(); - - auto getBitWidth = [](Type t) -> std::optional { - if (auto it = dyn_cast(t)) - return it.getWidth(); - if (auto ft = dyn_cast(t)) - return ft.getWidth(); - if (isa(t)) - return kPTOIndexBitWidth; - return std::nullopt; - }; - auto srcBW = getBitWidth(srcTy); - auto dstBW = getBitWidth(dstOrigTy); - if (!srcBW || !dstBW || *srcBW != *dstBW) - return rewriter.notifyMatchFailure(op, "bitcast requires equal bitwidth"); - - // Determine the template argument from the destination type string. - auto dstOpaque = dyn_cast(dstTy); - if (!dstOpaque) - return rewriter.notifyMatchFailure(op, "expected emitc opaque dest type"); - - auto templateArgs = - rewriter.getArrayAttr({emitc::OpaqueAttr::get(rewriter.getContext(), - dstOpaque.getValue())}); - auto call = rewriter.create( - loc, TypeRange{dstTy}, "ptoas_bitcast", /*operands=*/ValueRange{adaptor.getIn()}, - /*args=*/ArrayAttr{}, /*template_args=*/templateArgs); - rewriter.replaceOp(op, call.getResult(0)); - return success(); - } -}; - -// arith.cmpf lowering with ordered/unordered semantics. 
-struct ArithCmpFToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - static Value isNaN(ConversionPatternRewriter &rewriter, Location loc, - Value v) { - return rewriter - .create(loc, rewriter.getI1Type(), emitc::CmpPredicate::ne, - v, v) - .getResult(); - } - - static Value isNotNaN(ConversionPatternRewriter &rewriter, Location loc, - Value v) { - return rewriter - .create(loc, rewriter.getI1Type(), emitc::CmpPredicate::eq, - v, v) - .getResult(); - } - - LogicalResult matchAndRewrite(arith::CmpFOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!isa(op.getLhs().getType())) - return rewriter.notifyMatchFailure(op, "cmpf only supported on scalar floats"); - - auto loc = op.getLoc(); - auto i1Ty = rewriter.getI1Type(); - - bool unordered = false; - emitc::CmpPredicate pred = emitc::CmpPredicate::eq; - - switch (op.getPredicate()) { - case arith::CmpFPredicate::AlwaysFalse: { - auto cst = makeEmitCOpaqueConstant(rewriter, loc, i1Ty, "false"); - rewriter.replaceOp(op, cst); - return success(); - } - case arith::CmpFPredicate::AlwaysTrue: { - auto cst = makeEmitCOpaqueConstant(rewriter, loc, i1Ty, "true"); - rewriter.replaceOp(op, cst); - return success(); - } - case arith::CmpFPredicate::OEQ: - unordered = false; - pred = emitc::CmpPredicate::eq; - break; - case arith::CmpFPredicate::OGT: - unordered = false; - pred = emitc::CmpPredicate::gt; - break; - case arith::CmpFPredicate::OGE: - unordered = false; - pred = emitc::CmpPredicate::ge; - break; - case arith::CmpFPredicate::OLT: - unordered = false; - pred = emitc::CmpPredicate::lt; - break; - case arith::CmpFPredicate::OLE: - unordered = false; - pred = emitc::CmpPredicate::le; - break; - case arith::CmpFPredicate::ONE: - unordered = false; - pred = emitc::CmpPredicate::ne; - break; - case arith::CmpFPredicate::ORD: { - Value ordered = rewriter.create( - loc, i1Ty, isNotNaN(rewriter, loc, adaptor.getLhs()), - isNotNaN(rewriter, loc, 
adaptor.getRhs())); - rewriter.replaceOp(op, ordered); - return success(); - } - case arith::CmpFPredicate::UEQ: - unordered = true; - pred = emitc::CmpPredicate::eq; - break; - case arith::CmpFPredicate::UGT: - unordered = true; - pred = emitc::CmpPredicate::gt; - break; - case arith::CmpFPredicate::UGE: - unordered = true; - pred = emitc::CmpPredicate::ge; - break; - case arith::CmpFPredicate::ULT: - unordered = true; - pred = emitc::CmpPredicate::lt; - break; - case arith::CmpFPredicate::ULE: - unordered = true; - pred = emitc::CmpPredicate::le; - break; - case arith::CmpFPredicate::UNE: - unordered = true; - pred = emitc::CmpPredicate::ne; - break; - case arith::CmpFPredicate::UNO: { - Value unord = rewriter.create( - loc, i1Ty, isNaN(rewriter, loc, adaptor.getLhs()), - isNaN(rewriter, loc, adaptor.getRhs())); - rewriter.replaceOp(op, unord); - return success(); - } - } - - Value cmp = rewriter - .create(loc, i1Ty, pred, adaptor.getLhs(), - adaptor.getRhs()) - .getResult(); - - Value unord = rewriter.create( - loc, i1Ty, isNaN(rewriter, loc, adaptor.getLhs()), - isNaN(rewriter, loc, adaptor.getRhs())); - Value ord = rewriter.create( - loc, i1Ty, isNotNaN(rewriter, loc, adaptor.getLhs()), - isNotNaN(rewriter, loc, adaptor.getRhs())); - - if (unordered) { - Value res = - rewriter.create(loc, i1Ty, unord, cmp).getResult(); - rewriter.replaceOp(op, res); - return success(); - } - - Value res = - rewriter.create(loc, i1Ty, ord, cmp).getResult(); - rewriter.replaceOp(op, res); - return success(); - } -}; - -struct ArithAddUIExtendedToEmitC - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(arith::AddUIExtendedOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getSum().getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, - "expected scalar 
integer or index operands"); - - const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - SmallVector newResultTypes; - if (failed(getTypeConverter()->convertTypes(op->getResultTypes(), - newResultTypes))) - return failure(); - if (newResultTypes.size() != 2) - return failure(); - - Type sumDstTy = newResultTypes[0]; - Type overflowDstTy = newResultTypes[1]; - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - auto wideTy = getWiderUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value lhsWide = emitCCast(rewriter, loc, wideTy, lhsU); - Value rhsWide = emitCCast(rewriter, loc, wideTy, rhsU); - Value sumWide = - rewriter.create(loc, wideTy, lhsWide, rhsWide).getResult(); - - Value sumN = emitCCast(rewriter, loc, uTy, sumWide); - Value sum = emitCCast(rewriter, loc, sumDstTy, sumN); - - Value shiftAmt = makeEmitCIntConstant(rewriter, loc, wideTy, bitWidth); - Value high = rewriter - .create(loc, wideTy, sumWide, - shiftAmt) - .getResult(); - Value zeroWide = makeEmitCIntConstant(rewriter, loc, wideTy, 0); - Value overflow = - rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::ne, high, zeroWide) - .getResult(); - overflow = emitCCast(rewriter, loc, overflowDstTy, overflow); - - rewriter.replaceOp(op, {sum, overflow}); - return success(); - } -}; - -template -struct ArithMulExtendedToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getResult(0).getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return 
rewriter.notifyMatchFailure(op, - "expected scalar integer or index operands"); - - const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - SmallVector newResultTypes; - if (failed(this->getTypeConverter()->convertTypes(op->getResultTypes(), - newResultTypes))) - return failure(); - if (newResultTypes.size() != 2) - return failure(); - - Type lowDstTy = newResultTypes[0]; - Type highDstTy = newResultTypes[1]; - - Type wideTy = isUnsigned ? (Type)getWiderUnsignedIntOpaqueType(rewriter.getContext(), - bitWidth) - : (Type)getWiderSignedIntOpaqueType(rewriter.getContext(), - bitWidth); - - Value lhsWide; - Value rhsWide; - if constexpr (isUnsigned) { - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - lhsWide = emitCCast(rewriter, loc, wideTy, lhsU); - rhsWide = emitCCast(rewriter, loc, wideTy, rhsU); - } else { - lhsWide = emitCCast(rewriter, loc, wideTy, adaptor.getLhs()); - rhsWide = emitCCast(rewriter, loc, wideTy, adaptor.getRhs()); - } - - Value prodWide = - rewriter.create(loc, wideTy, lhsWide, rhsWide).getResult(); - Value low = emitCCast(rewriter, loc, lowDstTy, prodWide); - - Value shiftAmt = makeEmitCIntConstant(rewriter, loc, wideTy, bitWidth); - Value highWide = rewriter - .create(loc, wideTy, prodWide, - shiftAmt) - .getResult(); - Value high = emitCCast(rewriter, loc, highDstTy, highWide); - - rewriter.replaceOp(op, {low, high}); - return success(); - } -}; - -using ArithMulSIExtendedToEmitC = - ArithMulExtendedToEmitC; -using ArithMulUIExtendedToEmitC = - ArithMulExtendedToEmitC; - -struct ArithMinMaxIToEmitCBase { - static Value makeSelect(ConversionPatternRewriter &rewriter, Location loc, - Type dstTy, Value cond, Value trueV, Value falseV) { - return rewriter - .create(loc, dstTy, cond, trueV, falseV) - .getResult(); - } -}; - -struct ArithMaxSIToEmitC : public 
OpConversionPattern, - ArithMinMaxIToEmitCBase { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::MaxSIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - Value cond = rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, - adaptor.getLhs(), adaptor.getRhs()) - .getResult(); - Value res = makeSelect(rewriter, loc, dstTy, cond, adaptor.getRhs(), - adaptor.getLhs()); - rewriter.replaceOp(op, res); - return success(); - } -}; - -struct ArithMinSIToEmitC : public OpConversionPattern, - ArithMinMaxIToEmitCBase { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::MinSIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - Value cond = rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, - adaptor.getLhs(), adaptor.getRhs()) - .getResult(); - Value res = makeSelect(rewriter, loc, dstTy, cond, adaptor.getLhs(), - adaptor.getRhs()); - rewriter.replaceOp(op, res); - return success(); - } -}; - -struct ArithMaxUIToEmitC : public OpConversionPattern, - ArithMinMaxIToEmitCBase { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::MaxUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - Value lhsU = - castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = - castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value cond = rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, lhsU, rhsU) - .getResult(); - Value res = makeSelect(rewriter, loc, dstTy, cond, adaptor.getRhs(), - adaptor.getLhs()); - rewriter.replaceOp(op, res); - return success(); - } -}; - -struct ArithMinUIToEmitC : public OpConversionPattern, - ArithMinMaxIToEmitCBase { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::MinUIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - Value lhsU = - castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = - castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value cond = rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, lhsU, rhsU) - .getResult(); - Value res = makeSelect(rewriter, loc, dstTy, cond, adaptor.getLhs(), - adaptor.getRhs()); - rewriter.replaceOp(op, res); - return success(); - } -}; - -// Floating-point max/min variants. 
-struct ArithFloatMinMaxToEmitCBase { - static Value isNaN(ConversionPatternRewriter &rewriter, Location loc, - Value v) { - return rewriter - .create(loc, rewriter.getI1Type(), emitc::CmpPredicate::ne, - v, v) - .getResult(); - } - - static Value makeFZero(ConversionPatternRewriter &rewriter, Location loc, - Type ty) { - return makeEmitCOpaqueConstant(rewriter, loc, ty, "0.0f"); - } -}; - -struct ArithMaxNumFToEmitC : public OpConversionPattern, - ArithFloatMinMaxToEmitCBase { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::MaxNumFOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - - Value lhsNaN = isNaN(rewriter, loc, adaptor.getLhs()); - Value rhsNaN = isNaN(rewriter, loc, adaptor.getRhs()); - - Value cmpLt = rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, - adaptor.getLhs(), adaptor.getRhs()) - .getResult(); - Value maxNoNaN = - rewriter - .create(loc, dstTy, cmpLt, adaptor.getRhs(), - adaptor.getLhs()) - .getResult(); - - Value rhsOrMax = - rewriter - .create(loc, dstTy, rhsNaN, adaptor.getLhs(), - maxNoNaN) - .getResult(); - Value res = - rewriter - .create(loc, dstTy, lhsNaN, adaptor.getRhs(), - rhsOrMax) - .getResult(); - rewriter.replaceOp(op, res); - return success(); - } -}; - -struct ArithMinNumFToEmitC : public OpConversionPattern, - ArithFloatMinMaxToEmitCBase { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::MinNumFOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - - Value lhsNaN = isNaN(rewriter, loc, adaptor.getLhs()); - Value rhsNaN = isNaN(rewriter, loc, adaptor.getRhs()); - - Value cmpLt = rewriter - .create(loc, rewriter.getI1Type(), - 
emitc::CmpPredicate::lt, - adaptor.getLhs(), adaptor.getRhs()) - .getResult(); - Value minNoNaN = - rewriter - .create(loc, dstTy, cmpLt, adaptor.getLhs(), - adaptor.getRhs()) - .getResult(); - - Value rhsOrMin = - rewriter - .create(loc, dstTy, rhsNaN, adaptor.getLhs(), - minNoNaN) - .getResult(); - Value res = - rewriter - .create(loc, dstTy, lhsNaN, adaptor.getRhs(), - rhsOrMin) - .getResult(); - rewriter.replaceOp(op, res); - return success(); - } -}; - -template -struct ArithMinMaxFPropagateNaNToEmitC : public OpConversionPattern, - ArithFloatMinMaxToEmitCBase { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!isa(op.getType())) - return rewriter.notifyMatchFailure(op, "expected scalar float type"); - - auto loc = op.getLoc(); - Type dstTy = this->getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - - Value lhsNaN = isNaN(rewriter, loc, adaptor.getLhs()); - Value rhsNaN = isNaN(rewriter, loc, adaptor.getRhs()); - - // Basic compare-based min/max. - Value cmpLt = rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::lt, - adaptor.getLhs(), adaptor.getRhs()) - .getResult(); - Value candidate = rewriter - .create( - loc, dstTy, cmpLt, - isMaximum ? adaptor.getRhs() : adaptor.getLhs(), - isMaximum ? adaptor.getLhs() : adaptor.getRhs()) - .getResult(); - - // Fix signed zero tie-breaking for equal zeros. 
- Value zero = makeFZero(rewriter, loc, dstTy); - Value eq = rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::eq, - adaptor.getLhs(), adaptor.getRhs()) - .getResult(); - Value lhsZero = rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::eq, - adaptor.getLhs(), zero) - .getResult(); - Value bothZero = rewriter - .create(loc, rewriter.getI1Type(), - eq, lhsZero) - .getResult(); - - auto floatTy = cast(op.getType()); - auto bitsTy = getUnsignedIntOpaqueType(rewriter.getContext(), floatTy.getWidth()); - auto templateArgs = - rewriter.getArrayAttr({emitc::OpaqueAttr::get(rewriter.getContext(), - cast(bitsTy).getValue())}); - Value lhsBits = - rewriter - .create(loc, TypeRange{bitsTy}, "ptoas_bitcast", - ValueRange{adaptor.getLhs()}, - /*args=*/ArrayAttr{}, - /*template_args=*/templateArgs) - .getResult(0); - - Value oneBits = makeEmitCIntConstant(rewriter, loc, bitsTy, 1); - Value shAmt = makeEmitCIntConstant(rewriter, loc, bitsTy, - floatTy.getWidth() - 1); - Value signMask = rewriter - .create(loc, bitsTy, oneBits, - shAmt) - .getResult(); - Value signBit = rewriter - .create(loc, bitsTy, lhsBits, signMask) - .getResult(); - Value zeroBits = makeEmitCIntConstant(rewriter, loc, bitsTy, 0); - Value lhsIsNegZero = - rewriter - .create(loc, rewriter.getI1Type(), - emitc::CmpPredicate::ne, signBit, zeroBits) - .getResult(); - - Value tie = - rewriter - .create( - loc, dstTy, lhsIsNegZero, - isMaximum ? adaptor.getRhs() : adaptor.getLhs(), - isMaximum ? adaptor.getLhs() : adaptor.getRhs()) - .getResult(); - Value noNaN = rewriter - .create(loc, dstTy, bothZero, tie, - candidate) - .getResult(); - - // Propagate NaN: if lhs is NaN return lhs, else if rhs is NaN return rhs. 
- Value rhsOrNoNaN = rewriter - .create(loc, dstTy, rhsNaN, - adaptor.getRhs(), noNaN) - .getResult(); - Value res = rewriter - .create(loc, dstTy, lhsNaN, - adaptor.getLhs(), rhsOrNoNaN) - .getResult(); - rewriter.replaceOp(op, res); - return success(); - } -}; - -using ArithMaximumFToEmitC = - ArithMinMaxFPropagateNaNToEmitC; -using ArithMinimumFToEmitC = - ArithMinMaxFPropagateNaNToEmitC; - -//===----------------------------------------------------------------------===// -// Arith -> EmitC helpers -//===----------------------------------------------------------------------===// - -static emitc::OpaqueType getSignedIntOpaqueType(MLIRContext *ctx, - unsigned bitWidth) { - switch (bitWidth) { - case 1: - return emitc::OpaqueType::get(ctx, "int8_t"); - case 8: - return emitc::OpaqueType::get(ctx, "int8_t"); - case 16: - return emitc::OpaqueType::get(ctx, "int16_t"); - case 32: - return emitc::OpaqueType::get(ctx, "int32_t"); - case 64: - return emitc::OpaqueType::get(ctx, "int64_t"); - case 128: - return emitc::OpaqueType::get(ctx, "__int128"); - default: - llvm::errs() << "[Debug] Unsupported signed integer bitwidth: " << bitWidth - << "\n"; - return emitc::OpaqueType::get(ctx, "int64_t"); - } -} - -static emitc::OpaqueType getUnsignedIntOpaqueType(MLIRContext *ctx, - unsigned bitWidth) { - switch (bitWidth) { - case 1: - return emitc::OpaqueType::get(ctx, "uint8_t"); - case 8: - return emitc::OpaqueType::get(ctx, "uint8_t"); - case 16: - return emitc::OpaqueType::get(ctx, "uint16_t"); - case 32: - return emitc::OpaqueType::get(ctx, "uint32_t"); - case 64: - return emitc::OpaqueType::get(ctx, "uint64_t"); - case 128: - return emitc::OpaqueType::get(ctx, "unsigned __int128"); - default: - llvm::errs() << "[Debug] Unsupported unsigned integer bitwidth: " - << bitWidth << "\n"; - return emitc::OpaqueType::get(ctx, "uint64_t"); - } -} - -static emitc::OpaqueType getWiderSignedIntOpaqueType(MLIRContext *ctx, - unsigned bitWidth) { - switch (bitWidth) { - case 1: - case 
8: - return getSignedIntOpaqueType(ctx, 16); - case 16: - return getSignedIntOpaqueType(ctx, 32); - case 32: - return getSignedIntOpaqueType(ctx, 64); - case 64: - return getSignedIntOpaqueType(ctx, 128); - default: - return getSignedIntOpaqueType(ctx, 128); - } -} - -static emitc::OpaqueType getWiderUnsignedIntOpaqueType(MLIRContext *ctx, - unsigned bitWidth) { - switch (bitWidth) { - case 1: - case 8: - return getUnsignedIntOpaqueType(ctx, 16); - case 16: - return getUnsignedIntOpaqueType(ctx, 32); - case 32: - return getUnsignedIntOpaqueType(ctx, 64); - case 64: - return getUnsignedIntOpaqueType(ctx, 128); - default: - return getUnsignedIntOpaqueType(ctx, 128); - } -} - -static Value makeEmitCOpaqueConstant(ConversionPatternRewriter &rewriter, - Location loc, Type type, - llvm::StringRef literal) { - auto attr = emitc::OpaqueAttr::get(rewriter.getContext(), literal); - return rewriter.create(loc, type, attr); -} - -static Value makeEmitCIntConstant(ConversionPatternRewriter &rewriter, - Location loc, Type type, int64_t value) { - return makeEmitCOpaqueConstant(rewriter, loc, type, std::to_string(value)); -} - -static Value emitCCast(ConversionPatternRewriter &rewriter, Location loc, - Type dstType, Value src) { - if (src.getType() == dstType) - return src; - return rewriter.createOrFold(loc, dstType, src); -} - -// For signless iN integers lowered to signed C++ types, this creates a value -// representing the same N-bit pattern in an unsigned C++ type of the same -// width. This avoids incorrect sign-extension when later widening to a larger -// unsigned type. 
-static Value castSignlessIntToUnsignedSameWidth(ConversionPatternRewriter &rewriter, - Location loc, Value v, - unsigned bitWidth) { - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - return emitCCast(rewriter, loc, uTy, v); -} - -struct ArithMulIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(arith::MulIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - // i1 mul is equivalent to bitwise AND (mod 2 arithmetic). - if (bitWidth == 1) { - rewriter.replaceOpWithNewOp(op, opTy, adaptor.getLhs(), - adaptor.getRhs()); - return success(); - } - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value mulU = rewriter.create(loc, uTy, lhsU, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, mulU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithAddIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(arith::AddIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const 
unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - // i1 add is equivalent to XOR (mod 2 arithmetic). - if (bitWidth == 1) { - rewriter.replaceOpWithNewOp(op, opTy, adaptor.getLhs(), - adaptor.getRhs()); - return success(); - } - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value addU = rewriter.create(loc, uTy, lhsU, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, addU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithCastOPToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(arith::IndexCastOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Type newTy = getTypeConverter()->convertType(op.getType()); - if (!newTy) - return failure(); - rewriter.replaceOpWithNewOp(op, newTy, adaptor.getIn()); - return success(); - } -}; - -struct ArithSubIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(arith::SubIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Type opTy = op.getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure(op, "expected scalar integer or index type"); - - const unsigned bitWidth = - intTy ? intTy.getWidth() : static_cast(kPTOIndexBitWidth); - - Type dstTy = getTypeConverter()->convertType(opTy); - if (!dstTy) - return failure(); - - // i1 sub is equivalent to XOR (mod 2 arithmetic). 
- if (bitWidth == 1) { - rewriter.replaceOpWithNewOp(op, opTy, adaptor.getLhs(), - adaptor.getRhs()); - return success(); - } - - auto uTy = getUnsignedIntOpaqueType(rewriter.getContext(), bitWidth); - Value lhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getLhs(), - bitWidth); - Value rhsU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getRhs(), - bitWidth); - Value subU = rewriter.create(loc, uTy, lhsU, rhsU); - Value result = emitCCast(rewriter, loc, dstTy, subU); - rewriter.replaceOp(op, result); - return success(); - } -}; - -struct ArithDivSIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(arith::DivSIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Type newTy = getTypeConverter()->convertType(op.getType()); - if (!newTy) - return failure(); - rewriter.replaceOpWithNewOp(op, newTy, adaptor.getLhs(), - adaptor.getRhs()); - return success(); - } -}; - -struct ArithRemSIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(arith::RemSIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Type newTy = getTypeConverter()->convertType(op.getType()); - if (!newTy) - return failure(); - rewriter.replaceOpWithNewOp(op, newTy, adaptor.getLhs(), - adaptor.getRhs()); - return success(); - } -}; - -struct ArithTruncIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(arith::TruncIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - auto dstIntTy = dyn_cast(op.getType()); - auto srcIntTy = dyn_cast(op.getIn().getType()); - if (!dstIntTy || !srcIntTy) - return rewriter.notifyMatchFailure(op, "expected scalar integer types"); - - Type dstTy = getTypeConverter()->convertType(dstIntTy); - if (!dstTy) - return 
failure(); - - // to-i1 conversions: Arith wants truncation to the low bit, while C/C++ - // casts to bool are equivalent to `v != 0`. Implement as `(bool)(v & 1)`. - if (dstIntTy.getWidth() == 1) { - if (srcIntTy.getWidth() == 1) { - rewriter.replaceOp(op, adaptor.getIn()); - return success(); - } - - auto uSrcTy = - getUnsignedIntOpaqueType(rewriter.getContext(), srcIntTy.getWidth()); - Value inU = castSignlessIntToUnsignedSameWidth(rewriter, loc, adaptor.getIn(), - srcIntTy.getWidth()); - Value one = makeEmitCIntConstant(rewriter, loc, uSrcTy, 1); - Value masked = - rewriter.create(loc, uSrcTy, inU, one); - Value asBool = emitCCast(rewriter, loc, dstTy, masked); - rewriter.replaceOp(op, asBool); - return success(); - } - - rewriter.replaceOpWithNewOp(op, dstTy, adaptor.getIn()); - return success(); - } -}; - - struct ArithConstantToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Type newType = getTypeConverter()->convertType(op.getType()); - if (!newType) return failure(); - - // `adaptor.getValue()` may be null if attribute conversion isn't defined. - // Use the original attribute as fallback and always cast null-safely. - Attribute valueAttr = adaptor.getValue(); - if (!valueAttr) valueAttr = op.getValue(); - - if (auto floatAttr = dyn_cast_or_null(valueAttr)) { - SmallString<32> valStr; - floatAttr.getValue().toString(valStr); - llvm::StringRef s(valStr); - // Ensure the literal parses as a floating-point constant in C/C++. - // `APFloat::toString` may emit "1" for integral values; make it "1.0". 
- const bool hasFloatMarker = - s.contains('.') || s.contains('e') || s.contains('E') || - s.contains('p') || s.contains('P') || s.starts_with("0x") || - s.starts_with("0X") || s.starts_with("nan") || - s.starts_with("-nan") || s.starts_with("inf") || - s.starts_with("-inf"); - if (!hasFloatMarker) - valStr.append(".0"); - // Suffix: keep `f` for f16/f32; omit for f64. - if (!floatAttr.getType().isF64()) - valStr.append("f"); - auto constAttr = emitc::OpaqueAttr::get(rewriter.getContext(), valStr); - rewriter.replaceOpWithNewOp(op, newType, constAttr); - return success(); - } - - if (auto intAttr = dyn_cast_or_null(valueAttr)) { - std::string valStr = std::to_string(intAttr.getValue().getSExtValue()); - auto constAttr = emitc::OpaqueAttr::get(rewriter.getContext(), valStr); - rewriter.replaceOpWithNewOp(op, newType, constAttr); - return success(); - } - - return failure(); - } - }; -//===----------------------------------------------------------------------===// -// pto.mgather lowering -> MGATHER(dst, mem, idx) -// %dst = pto.mgather %mem, %idx : memref<...>, memref<...> -> memref<...> -//===----------------------------------------------------------------------===// - -struct PTOMGatherToMGATHER : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::MGatherOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value mem = peelUnrealized(adaptor.getMem()); - Value dst = peelUnrealized(adaptor.getDst()); - - // pto-isa currently has no NPU implementation for MGATHER/MSCATTER. - // Fallback to a smoke-friendly lowering to keep compile/run coverage. 
- rewriter.create( - op.getLoc(), TypeRange{}, "TLOAD", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, mem}); - - if (op->getNumResults() == 0) { - rewriter.eraseOp(op); - } else { - rewriter.replaceOp(op, dst); - } - return success(); - } -}; - -struct AffineApplyMulConstToEmitC - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(affine::AffineApplyOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto map = op.getAffineMap(); - - if (map.getNumDims() != 0 || map.getNumSymbols() != 1) - return failure(); - - auto expr = map.getResult(0); - auto bin = dyn_cast(expr); - if (!bin || bin.getKind() != AffineExprKind::Mul) - return failure(); - - auto lhs = bin.getLHS(); - auto rhs = bin.getRHS(); - - auto symExpr = dyn_cast(lhs); - auto constExpr = dyn_cast(rhs); - if (!symExpr || !constExpr) - return failure(); - - Value inputVal = adaptor.getMapOperands()[0]; - - std::string valStr = std::to_string(constExpr.getValue()); - auto cstAttr = emitc::OpaqueAttr::get(rewriter.getContext(), valStr); - auto cstOp = rewriter.create( - op.getLoc(), inputVal.getType(), cstAttr); - - rewriter.replaceOpWithNewOp( - op, inputVal.getType(), inputVal, cstOp); - - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// Kernel inference helpers -//===----------------------------------------------------------------------===// - -enum class KernelKind { VecAdd, Matmul, Unknown }; - -static KernelKind inferKernelKind(func::FuncOp f) { - bool hasAdd = false; - bool hasMM = false; - f.walk([&](Operation *op) { - if (isa(op)) hasAdd = true; - if (isa(op)) hasMM = true; - if (isa(op)) hasMM = true; - }); - if (hasMM) return KernelKind::Matmul; - if (hasAdd) return KernelKind::VecAdd; - return KernelKind::Unknown; -} - -static void inferTileMNK(func::FuncOp f, int &M, int &N, int &K) { - M = 32; N = 32; K = 32; - SmallVector subs; - 
f.walk([&](memref::SubViewOp sv) { subs.push_back(sv); }); - - auto readShape2D = [&](memref::SubViewOp sv, int &d0, int &d1) { - auto resTy = mlir::cast(sv.getResult().getType()); - if (resTy.getRank() == 2 && resTy.hasStaticShape()) { - d0 = (int)resTy.getDimSize(0); - d1 = (int)resTy.getDimSize(1); - } - }; - - if (subs.empty()) return; - - int a0=32, a1=32; - readShape2D(subs[0], a0, a1); - M = a0; N = a1; - - if (subs.size() >= 2) { - int b0=32, b1=32; - readShape2D(subs[0], a0, a1); - readShape2D(subs[1], b0, b1); - M = a0; K = a1; N = b1; - } -} - -struct FuncToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(func::FuncOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - // Convert the function signature with the type converter. - Type convertedTy = getTypeConverter()->convertType(op.getFunctionType()); - auto funcType = dyn_cast_or_null(convertedTy); - if (!funcType) - return rewriter.notifyMatchFailure(op, "failed to convert function type"); - if (funcType.getNumResults() > 1) - return rewriter.notifyMatchFailure( - op, "EmitC cannot return multiple values"); - - // Create the EmitC function with the converted signature. - auto emitcFunc = rewriter.create(op.getLoc(), op.getName(), - funcType); - emitcFunc.setSpecifiersAttr( - rewriter.getStrArrayAttr({"__global__ AICORE"})); - - // Inline the original body, then convert region/block argument types to - // match the converted signature (also covers CFG blocks introduced by - // pre-lowering, e.g. scf.while -> cf.br/cf.cond_br). 
- rewriter.inlineRegionBefore(op.getBody(), emitcFunc.getBody(), - emitcFunc.end()); - - TypeConverter::SignatureConversion entryConv(op.getNumArguments()); - for (unsigned i = 0; i < op.getNumArguments(); ++i) - entryConv.addInputs(i, funcType.getInput(i)); - - if (failed(rewriter.convertRegionTypes(&emitcFunc.getBody(), - *getTypeConverter(), &entryConv))) - return failure(); - - // [Compatibility patch] Preserve existing snippets that rely on `T`. - { - Block &entryBlock = emitcFunc.getBody().front(); - rewriter.setInsertionPointToStart(&entryBlock); - rewriter.create(op.getLoc(), "using T = float;"); - } - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// SubView lowering to GlobalTensor (keep your existing code) -//===----------------------------------------------------------------------=== - -enum class Role { A, B, C, Unknown }; - -static Role inferSubviewRole(memref::SubViewOp sv) { - for (Operation *u : sv.getResult().getUsers()) { - if (auto ld = dyn_cast(u)) { - Value ub = ld.getDst(); - if (!ub) continue; - for (Operation *uu : ub.getUsers()) { - if (auto mm = dyn_cast(uu)) { - if (mm.getLhs() == ub) return Role::A; - if (mm.getRhs() == ub) return Role::B; - } - if (auto mmacc = dyn_cast(uu)) { - if (mmacc.getLhs() == ub) return Role::A; - if (mmacc.getRhs() == ub) return Role::B; - } - } - } - - if (auto st = dyn_cast(u)) { - if (st.getDst() == sv.getResult()) return Role::C; - } - } - return Role::Unknown; -} - -// ============================================================================= -// 4. 
MemRef SubView -> Explicit Shape/Stride Construction (Full Implementation) -// ============================================================================= -struct SubviewToEmitCPattern : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - // 辅助函数:尝试从 OpFoldResult 中提取静态整数值 - std::optional extractStaticInt(OpFoldResult ofr) const { - if (auto attr = ofr.dyn_cast()) { - if (auto intAttr = dyn_cast(attr)) - return intAttr.getInt(); - } else { - Value v = ofr.get(); - if (auto cOp = v.getDefiningOp()) { - if (auto iAttr = dyn_cast(cOp.getValue())) - return iAttr.getInt(); - } else if (auto idxOp = v.getDefiningOp()) { - return idxOp.value(); - } - } - return std::nullopt; - } - - LogicalResult matchAndRewrite(memref::SubViewOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - // 获取源 MemRef 类型信息 - auto srcType = mlir::cast(op.getSource().getType()); - int64_t rank = srcType.getRank(); - - auto elemTypeToString = [&](Type elemTy) -> std::string { - if (elemTy.isF16()) - return "half"; - if (elemTy.isBF16()) - return "bfloat16_t"; - if (elemTy.isF32()) - return "float"; - if (elemTy.isF64()) - return "double"; - if (elemTy.isInteger(8)) { - if (elemTy.isSignlessInteger(8) || elemTy.isSignedInteger(8)) - return "int8_t"; - return "uint8_t"; - } - if (elemTy.isInteger(16)) { - if (elemTy.isSignlessInteger(16) || elemTy.isSignedInteger(16)) - return "int16_t"; - return "uint16_t"; - } - if (elemTy.isInteger(32)) { - if (elemTy.isSignlessInteger(32) || elemTy.isSignedInteger(32)) - return "int32_t"; - return "uint32_t"; - } - if (elemTy.isInteger(64)) { - return cast(elemTy).isUnsigned() ? 
"uint64_t" : "int64_t"; - } - return "float"; - }; - - // ------------------------------------------------------------------------- - // Part 1: 指针偏移计算 (Runtime Pointer Arithmetic) - // ------------------------------------------------------------------------- - - // 准备类型: unsigned - Type u32Ty = emitc::OpaqueType::get(ctx, "unsigned"); - - // Helper: 创建 unsigned 常量 - auto mkU32 = [&](int64_t v) -> Value { - return rewriter.create( - loc, u32Ty, emitc::OpaqueAttr::get(ctx, std::to_string(v))); - }; - - // Helper: 将 OpFoldResult 转为 EmitC Value (用于计算) - auto ofrToEmitCValue = [&](OpFoldResult ofr) -> Value { - if (auto v = ofr.dyn_cast()) { - Value rv = rewriter.getRemappedValue(v); - // 如果类型不匹配,插入 Cast - if (rv.getType() != u32Ty) - return rewriter.create(loc, u32Ty, rv).getResult(); - return rv; - } - if (auto attr = ofr.dyn_cast()) { - if (auto ia = dyn_cast(attr)) - return mkU32(ia.getValue().getSExtValue()); - } - return mkU32(0); - }; - - // 1. 获取 Source 的 Strides (支持动态 Stride 收集) - SmallVector sourceStrides; - - if (auto rc = op.getSource().getDefiningOp()) { - sourceStrides = rc.getMixedStrides(); - } else { - SmallVector strideInts; - int64_t offset = ShapedType::kDynamic; - bool useTypeStrides = succeeded(getStridesAndOffset(srcType, strideInts, offset)); - (void)offset; - if (useTypeStrides) { - for (int64_t s : strideInts) { - if (s == ShapedType::kDynamic) { - useTypeStrides = false; - break; - } - } - } - if (useTypeStrides) { - for (int64_t s : strideInts) { - sourceStrides.push_back(rewriter.getIndexAttr(s)); - } - } else { - // Fallback: Compact Layout - auto shape = srcType.getShape(); - int64_t current = 1; - sourceStrides.resize(rank); - for (int i = rank - 1; i >= 0; --i) { - sourceStrides[i] = rewriter.getIndexAttr(current); - if (shape[i] != ShapedType::kDynamic) current *= shape[i]; - } - } - } - - // 2. 
计算运行时 Offset - auto staticOffsets = op.getStaticOffsets(); - auto dynamicOffsets = adaptor.getOffsets(); - int dynOffIdx = 0; - Value totalOffset = mkU32(0); - - for (int i = 0; i < rank; ++i) { - // A. 获取 Offset - Value offVal; - if (staticOffsets[i] == ShapedType::kDynamic) { - Value rawDyn = dynamicOffsets[dynOffIdx++]; - offVal = rewriter.create(loc, u32Ty, rawDyn); - } else { - offVal = mkU32(staticOffsets[i]); - } - - // B. 获取 Stride (用于指针计算) - Value strideVal = mkU32(1); - if (i < (int)sourceStrides.size()) { - strideVal = ofrToEmitCValue(sourceStrides[i]); - } - - // C. 累加 - Value term = rewriter.create(loc, u32Ty, offVal, strideVal); - totalOffset = rewriter.create(loc, u32Ty, totalOffset, term); - } - - // 3. 生成新指针 - // - // NOTE: Some toolchains may materialize kernel pointer params as `void*` even - // when the underlying element type is i16. Pointer arithmetic on `void*` - // is ill-formed in C++, so we explicitly cast to a typed pointer for i16. - Value sourcePtr = adaptor.getSource(); - Value tileCandidate = sourcePtr; - if (auto castOp = sourcePtr.getDefiningOp()) { - tileCandidate = castOp.getOperand(); - } else if (auto uc = - sourcePtr.getDefiningOp()) { - tileCandidate = uc.getOperand(0); - } - if (auto ot = dyn_cast(tileCandidate.getType())) { - auto tyStr = ot.getValue(); - if (tyStr.find("Tile<") != std::string::npos || - tyStr.find("ConvTile<") != std::string::npos) { - std::string elemTok = elemTypeToString(srcType.getElementType()); - std::string qualifier = "__gm__"; - if (auto asAttr = - dyn_cast_or_null(srcType.getMemorySpace())) - qualifier = addrSpaceQualifier(asAttr.getAddressSpace()); - auto rawPtrTy = - emitc::OpaqueType::get(ctx, qualifier + " " + elemTok + "*"); - sourcePtr = - rewriter - .create(loc, rawPtrTy, - "PTOAS__TILE_DATA", ArrayAttr{}, - ArrayAttr{}, ValueRange{tileCandidate}) - .getResult(0); - } - } - Value newPtr; - { - auto resTy = mlir::cast(op.getResult().getType()); - Type elemTy = resTy.getElementType(); - if 
(elemTy.isInteger(16)) { - std::string castElemTypeStr = "int16_t"; - if (cast(elemTy).isUnsigned()) - castElemTypeStr = "uint16_t"; - - std::string qualifier = "__gm__"; - if (Attribute ms = srcType.getMemorySpace()) { - if (auto ptoAttr = dyn_cast(ms)) { - qualifier = addrSpaceQualifier(ptoAttr.getAddressSpace()); - } - } - - auto typedPtrTy = emitc::OpaqueType::get(ctx, qualifier + " " + castElemTypeStr + "*"); - Value typedSourcePtr = rewriter.create(loc, typedPtrTy, sourcePtr); - newPtr = rewriter.create(loc, typedPtrTy, typedSourcePtr, totalOffset); - } else { - newPtr = rewriter.create(loc, sourcePtr.getType(), sourcePtr, totalOffset); - } - } - - - // ------------------------------------------------------------------------- - // Part 2: For non-GM memrefs, keep pointer (no GlobalTensor). - // ------------------------------------------------------------------------- - bool isGlobal = true; - if (auto asAttr = dyn_cast_or_null(srcType.getMemorySpace())) { - auto as = asAttr.getAddressSpace(); - isGlobal = (as == pto::AddressSpace::GM || as == pto::AddressSpace::Zero); - } - if (!isGlobal) { - Type dstTy = getTypeConverter()->convertType(op.getType()); - if (!dstTy) - return failure(); - if (newPtr.getType() != dstTy) - newPtr = rewriter.create(loc, dstTy, newPtr); - rewriter.replaceOp(op, newPtr); - return success(); - } - - // ------------------------------------------------------------------------- - // Part 3: 生成 GlobalTensor 类型 (Shape/Stride Template Generation) - // ------------------------------------------------------------------------- - - // When emitting C++ with `declareVariablesAtTop`, value declarations are - // hoisted before body statements. Avoid introducing local `using` aliases - // for templated types (Shape/Stride/GlobalTensor) because those aliases - // would appear after the hoisted declarations and break compilation - // (`unknown type name`). - // - // Instead, use the fully spelled template types as EmitC opaque types. 
- - auto resTy = mlir::cast(op.getResult().getType()); - - // 1. 解析具体元素类型 (完整逻辑,不省略) - std::string elemTypeStr = "float"; - Type elemTy = resTy.getElementType(); - - if (elemTy.isF16()) { - elemTypeStr = "half"; - } else if (elemTy.isBF16()) { - elemTypeStr = "bfloat16_t"; - } else if (elemTy.isF32()) { - elemTypeStr = "float"; - } else if (elemTy.isInteger(8)) { - // 区分有符号/无符号通常依赖上下文,但在 EmitC 中 int8_t 比较通用 - if (elemTy.isSignlessInteger(8) || elemTy.isSignedInteger(8)) - elemTypeStr = "int8_t"; - else - elemTypeStr = "uint8_t"; - } else if (elemTy.isInteger(16)) { - if (elemTy.isSignlessInteger(16) || elemTy.isSignedInteger(16)) - elemTypeStr = "int16_t"; - else - elemTypeStr = "uint16_t"; - } else if (elemTy.isInteger(32)) { - if (elemTy.isSignlessInteger(32) || elemTy.isSignedInteger(32)) - elemTypeStr = "int32_t"; - else - elemTypeStr = "uint32_t"; - } else if (elemTy.isInteger(64)) { - elemTypeStr = cast(elemTy).isUnsigned() ? "uint64_t" : "int64_t"; - } - - // 2. 生成 Shape 模板参数,之后会右对齐有效维度并补齐到 5 维(高维填 1) - SmallVector shapeParamsVec; - SmallVector sizeValues; // 每个维度对应的运行时 size(统一为 unsigned) - auto resShape = resTy.getShape(); - auto mixedSizes = op.getMixedSizes(); - sizeValues.reserve(rank); - for (int i = 0; i < resTy.getRank(); ++i) { - if (resShape[i] == ShapedType::kDynamic) { - shapeParamsVec.push_back("-1"); - } else { - shapeParamsVec.push_back(std::to_string(resShape[i])); - } - // size 值:优先从 op.getMixedSizes() 取(可动态/静态),否则退化为类型里的静态 shape。 - if (i < (int)mixedSizes.size()) - sizeValues.push_back(ofrToEmitCValue(mixedSizes[i])); - else - sizeValues.push_back( - mkU32(resShape[i] == ShapedType::kDynamic ? 1 : resShape[i])); - } - - // 3. 
生成 Stride 模板参数 + 运行时 stride 值(考虑 subview step) - SmallVector dummyStrideVec; - SmallVector strideValues; // 每个维度对应的运行时 stride(统一为 unsigned) - dummyStrideVec.reserve(rank); - strideValues.reserve(rank); - auto subViewSteps = op.getMixedStrides(); - for (int i = 0; i < rank; ++i) { - OpFoldResult srcStrideOfr = - (i < (int)sourceStrides.size()) ? sourceStrides[i] - : rewriter.getIndexAttr(1); - OpFoldResult stepOfr = (i < (int)subViewSteps.size()) - ? subViewSteps[i] - : rewriter.getIndexAttr(1); - - auto srcStatic = extractStaticInt(srcStrideOfr); - auto stepStatic = extractStaticInt(stepOfr); - if (srcStatic && stepStatic) { - int64_t finalStride = (*srcStatic) * (*stepStatic); - dummyStrideVec.push_back(std::to_string(finalStride)); - strideValues.push_back(mkU32(finalStride)); - continue; - } - - dummyStrideVec.push_back("-1"); - Value srcV = ofrToEmitCValue(srcStrideOfr); - Value stepV = ofrToEmitCValue(stepOfr); - // 尽量避免乘以 1 生成冗余指令 - if (stepStatic && *stepStatic == 1) - strideValues.push_back(srcV); - else if (srcStatic && *srcStatic == 1) - strideValues.push_back(stepV); - else - strideValues.push_back( - rewriter.create(loc, u32Ty, srcV, stepV)); - } - - // 3.1 右对齐到 5 维:shape 补 1;已有维度继承原 stride; - // 被补出来的高维按“紧密升维”规则连续推导:stride[i] = shape[i+1] * stride[i+1] - SmallVector finalShape(5, "1"); - SmallVector finalStride(5, "1"); - Value oneU32 = mkU32(1); - SmallVector finalShapeValues(5, oneU32); - SmallVector finalStrideValues(5, oneU32); - int shift = 5 - rank; - - // 先放入原始 shape/stride(保持用户提供的值) - for (int i = 0; i < rank && i < 5; ++i) { - finalShape[shift + i] = shapeParamsVec[i]; - finalStride[shift + i] = dummyStrideVec[i]; - finalShapeValues[shift + i] = sizeValues[i]; - finalStrideValues[shift + i] = strideValues[i]; - } - - auto mulOrDyn = [](const std::string &a, const std::string &b) -> std::string { - if (a == "-1" || b == "-1") - return "-1"; - int64_t va = 1, vb = 1; - (void)llvm::to_integer(a, va); - (void)llvm::to_integer(b, vb); - return 
std::to_string(va * vb); - }; - - // 从低维到高维倒推补齐 stride(仅对补出来的前置维度生效) - for (int i = 3; i >= 0; --i) { - // 如果该维已由原始 rank 覆盖,则保持原值 - if (i >= shift) - continue; - // 补维:shape 已经是 1,stride = shape[i+1] * stride[i+1](或动态) - finalStride[i] = mulOrDyn(finalShape[i + 1], finalStride[i + 1]); - if (finalStride[i] != "-1") { - int64_t si = 1; - (void)llvm::to_integer(finalStride[i], si); - finalStrideValues[i] = mkU32(si); - continue; - } - // 动态推导:stride[i] = shape[i+1] * stride[i+1] - if (finalShape[i + 1] == "1") { - finalStrideValues[i] = finalStrideValues[i + 1]; - } else { - finalStrideValues[i] = rewriter.create( - loc, u32Ty, finalShapeValues[i + 1], finalStrideValues[i + 1]); - } - } - - auto joinParams = [](llvm::ArrayRef vec) { - std::string out; - for (size_t i = 0; i < vec.size(); ++i) { - if (i > 0) out += ", "; - out += vec[i]; - } - return out; - }; - - std::string shapeParams = joinParams(finalShape); - std::string strideParams = joinParams(finalStride); - - // Spelled-out C++ types. - std::string shapeCppType = "pto::Shape<" + shapeParams + ">"; - std::string strideCppType = "pto::Stride<" + strideParams + ">"; - - // 3.0 Layout: prefer the attribute from InferPTOLayout; only fall back to - // local inference when the pass is disabled. 
- std::string layoutEnum = "pto::Layout::ND"; - if (auto layout = resolveLayoutForGlobalTensor(op, op.getSource())) { - layoutEnum = layoutToEmitCString(*layout); - } else { - auto strToInt = [](const std::string &s, int64_t &out) -> bool { - return s != "-1" && llvm::to_integer(s, out); - }; - SmallVector shapeInt(5, -1), strideInt(5, -1); - bool allStatic = true; - for (int i = 0; i < 5; ++i) { - if (!strToInt(finalShape[i], shapeInt[i]) || - !strToInt(finalStride[i], strideInt[i])) - allStatic = false; - } - - int layoutTag = 0; // ND - auto elemBytes = 4; // default float - if (elemTypeStr.find("half") != std::string::npos || - elemTypeStr.find("f16") != std::string::npos || - elemTypeStr.find("bf16") != std::string::npos) - elemBytes = 2; - else if (elemTypeStr.find("double") != std::string::npos || - elemTypeStr.find("f64") != std::string::npos) - elemBytes = 8; - - if (allStatic) { - if (shapeInt[2] == 16 && shapeInt[2] * shapeInt[3] * elemBytes == 512 && - strideInt[4] == 1 && strideInt[3] == shapeInt[4]) { - layoutTag = 2; // NZ - } else { - bool isRow = strideInt[4] == 1; - for (int i = 3; i >= 0; --i) - isRow &= (strideInt[i] == strideInt[i + 1] * shapeInt[i + 1]); - bool isCol = strideInt[0] == 1; - for (int i = 0; i < 4; ++i) - isCol &= (strideInt[i + 1] == strideInt[i] * shapeInt[i]); - if (isCol) - layoutTag = 1; // DN - else - layoutTag = isRow ? 0 : 0; // fallback ND - } - } - - if (layoutTag == 1) - layoutEnum = "pto::Layout::DN"; - else if (layoutTag == 2) - layoutEnum = "pto::Layout::NZ"; - } - // GlobalTensor takes a Layout non-type template parameter; directly use the - // enum constant. - - - // ------------------------------------------------------------------------- - // Part 3: 显式对象实例化 (Explicit Object Instantiation) - // ------------------------------------------------------------------------- - - // A. Instantiate Shape object. 
- auto shapeTypeOpaque = emitc::OpaqueType::get(ctx, shapeCppType); - SmallVector shapeArgs; - // 从 adaptor.getSizes() 获取 subview 的所有 dynamic sizes - for (Value dynSize : adaptor.getSizes()) { - shapeArgs.push_back(dynSize); - } - - auto shapeInstOp = rewriter.create( - loc, - shapeTypeOpaque, // 返回类型 - shapeCppType, // 调用的“函数名”即类名构造函数 - /*args=*/ArrayAttr{}, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange(shapeArgs) - ); - - // B. Instantiate Stride object. - auto strideTypeOpaque = emitc::OpaqueType::get(ctx, strideCppType); - // 仅传入动态 stride 维度对应的值,匹配 pto::Stride 的 N-parameter ctor(并满足其 static_assert)。 - SmallVector strideCtorArgs; - strideCtorArgs.reserve(5); - for (int i = 0; i < 5; ++i) { - if (finalStride[i] == "-1") - strideCtorArgs.push_back(finalStrideValues[i]); - } - auto strideInstOp = rewriter.create( - loc, strideTypeOpaque, strideCppType, - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange(strideCtorArgs)); - - // C. Instantiate GlobalTensor object (ptr + shape + stride). 
- std::string gtCppType = "GlobalTensor<" + elemTypeStr + ", " + shapeCppType + - ", " + strideCppType + ", " + layoutEnum + ">"; - auto gtType = emitc::OpaqueType::get(ctx, gtCppType); - - // 准备构造参数: [ptr, shape_instance, stride_instance] - SmallVector gtConstructorArgs; - gtConstructorArgs.push_back(newPtr); - gtConstructorArgs.push_back(shapeInstOp.getResult(0)); // 拿到 shape_inst 的 SSA Value - gtConstructorArgs.push_back(strideInstOp.getResult(0)); // 拿到 stride_inst 的 SSA Value - - rewriter.replaceOpWithNewOp( - op, - gtType, - gtCppType, - /*args=*/ArrayAttr{}, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange(gtConstructorArgs) - ); - - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// Helper: build GlobalTensor from a static MemRef (for TLOAD/TSTORE) -//===----------------------------------------------------------------------===// - -static std::string getElemTypeStringForGT(Type elemTy) { - if (elemTy.isF16()) return "half"; - if (elemTy.isBF16()) return "bfloat16_t"; - if (elemTy.isF32()) return "float"; - if (elemTy.isF64()) return "double"; - if (elemTy.isInteger(8)) { - if (elemTy.isSignlessInteger(8) || elemTy.isSignedInteger(8)) - return "int8_t"; - return "uint8_t"; - } - if (elemTy.isInteger(16)) { - if (elemTy.isSignlessInteger(16) || elemTy.isSignedInteger(16)) - return "int16_t"; - return "uint16_t"; - } - if (elemTy.isInteger(32)) { - if (elemTy.isSignlessInteger(32) || elemTy.isSignedInteger(32)) - return "int32_t"; - return "uint32_t"; - } - if (elemTy.isInteger(64)) { - return cast(elemTy).isUnsigned() ? "uint64_t" : "int64_t"; - } - return "float"; -} - -static Value buildGlobalTensorFromMemref(ConversionPatternRewriter &rewriter, - Location loc, Value basePtr, - MemRefType mrTy, - Operation *anchor) { - auto *ctx = rewriter.getContext(); - - // Only handle fully static shapes/strides for now. 
- auto shape = mrTy.getShape(); - for (int64_t dim : shape) { - if (dim == ShapedType::kDynamic) - return Value(); - } - - SmallVector strides; - int64_t offset = 0; - if (failed(getStridesAndOffset(mrTy, strides, offset))) { - // Fallback: compact row-major - strides.resize(shape.size()); - int64_t s = 1; - for (int i = (int)shape.size() - 1; i >= 0; --i) { - strides[i] = s; - s *= shape[i]; - } - offset = 0; - } - if (offset == ShapedType::kDynamic) - return Value(); - for (int64_t s : strides) { - if (s == ShapedType::kDynamic) - return Value(); - } - - // Apply static base offset if needed. - Value ptr = basePtr; - if (offset != 0) { - Type u32Ty = emitc::OpaqueType::get(ctx, "unsigned"); - auto offVal = rewriter.create( - loc, u32Ty, emitc::OpaqueAttr::get(ctx, std::to_string(offset))); - ptr = rewriter.create(loc, basePtr.getType(), basePtr, - offVal); - } - - std::string suffix = "_" + std::to_string(reinterpret_cast(anchor)); - std::string shapeTypeName = "GTShape" + suffix; - std::string strideTypeName = "GTStride" + suffix; - std::string gtTypeName = "GT" + suffix; - - std::string elemTypeStr = getElemTypeStringForGT(mrTy.getElementType()); - - SmallVector shapeParamsVec; - SmallVector strideParamsVec; - for (int i = 0, e = (int)shape.size(); i < e; ++i) { - shapeParamsVec.push_back(std::to_string(shape[i])); - strideParamsVec.push_back(std::to_string(strides[i])); - } - - // Right-align to 5D (pad leading dims with 1). 
- SmallVector finalShape(5, "1"); - SmallVector finalStride(5, "1"); - int rank = (int)shape.size(); - int shift = 5 - rank; - for (int i = 0; i < rank && i < 5; ++i) { - finalShape[shift + i] = shapeParamsVec[i]; - finalStride[shift + i] = strideParamsVec[i]; - } - auto mulOrDyn = [](const std::string &a, const std::string &b) -> std::string { - if (a == "-1" || b == "-1") - return "-1"; - int64_t va = 1, vb = 1; - (void)llvm::to_integer(a, va); - (void)llvm::to_integer(b, vb); - return std::to_string(va * vb); - }; - for (int i = 3; i >= 0; --i) { - if (i >= shift) - continue; - finalStride[i] = mulOrDyn(finalShape[i + 1], finalStride[i + 1]); - } - - auto joinParams = [](llvm::ArrayRef vec) { - std::string out; - for (size_t i = 0; i < vec.size(); ++i) { - if (i > 0) out += ", "; - out += vec[i]; - } - return out; - }; - - std::string shapeParams = joinParams(finalShape); - std::string strideParams = joinParams(finalStride); - - rewriter.create( - loc, "using " + shapeTypeName + " = pto::Shape<" + shapeParams + ">;"); - rewriter.create( - loc, "using " + strideTypeName + " = pto::Stride<" + strideParams + ">;"); - - // Layout: prefer the attribute from InferPTOLayout; only fall back to local - // inference when the pass is disabled. 
- std::string layoutEnum = "pto::Layout::ND"; - bool hasLayoutAttr = false; - if (auto layout = resolveLayoutForGlobalTensor(anchor, basePtr)) { - layoutEnum = layoutToEmitCString(*layout); - hasLayoutAttr = true; - } - if (!hasLayoutAttr) { - SmallVector shapeInt(5, -1), strideInt(5, -1); - for (int i = 0; i < 5; ++i) { - (void)llvm::to_integer(finalShape[i], shapeInt[i]); - (void)llvm::to_integer(finalStride[i], strideInt[i]); - } - int layoutTag = 0; // ND - int elemBytes = 4; - if (elemTypeStr.find("half") != std::string::npos || - elemTypeStr.find("bf16") != std::string::npos) - elemBytes = 2; - else if (elemTypeStr.find("double") != std::string::npos) - elemBytes = 8; - if (shapeInt[2] == 16 && shapeInt[2] * shapeInt[3] * elemBytes == 512 && - strideInt[4] == 1 && strideInt[3] == shapeInt[4]) { - layoutTag = 2; // NZ - } else { - bool isRow = strideInt[4] == 1; - for (int i = 3; i >= 0; --i) - isRow &= (strideInt[i] == strideInt[i + 1] * shapeInt[i + 1]); - bool isCol = strideInt[0] == 1; - for (int i = 0; i < 4; ++i) - isCol &= (strideInt[i + 1] == strideInt[i] * shapeInt[i]); - if (isCol) layoutTag = 1; // DN - else layoutTag = isRow ? 
0 : 0; // fallback ND - } - if (layoutTag == 1) - layoutEnum = "pto::Layout::DN"; - else if (layoutTag == 2) - layoutEnum = "pto::Layout::NZ"; - } - std::string layoutConstName = gtTypeName + "_layout"; - rewriter.create( - loc, "constexpr pto::Layout " + layoutConstName + " = " + layoutEnum + ";"); - - auto shapeTypeOpaque = emitc::OpaqueType::get(ctx, shapeTypeName); - auto strideTypeOpaque = emitc::OpaqueType::get(ctx, strideTypeName); - auto shapeInstOp = rewriter.create( - loc, shapeTypeOpaque, shapeTypeName, ArrayAttr{}, ArrayAttr{}, - ValueRange{}); - auto strideInstOp = rewriter.create( - loc, strideTypeOpaque, strideTypeName, ArrayAttr{}, ArrayAttr{}, - ValueRange{}); - - rewriter.create( - loc, "using " + gtTypeName + " = GlobalTensor<" + elemTypeStr + ", " + - shapeTypeName + ", " + strideTypeName + ", " + - layoutConstName + ">;"); - auto gtType = emitc::OpaqueType::get(ctx, gtTypeName); - - SmallVector gtArgs; - gtArgs.push_back(ptr); - gtArgs.push_back(shapeInstOp.getResult(0)); - gtArgs.push_back(strideInstOp.getResult(0)); - - auto gtInst = rewriter.create( - loc, gtType, gtTypeName, ArrayAttr{}, ArrayAttr{}, ValueRange(gtArgs)); - - return gtInst.getResult(0); -} - -//===----------------------------------------------------------------------===// -// pto.pointer_cast lowering -//===----------------------------------------------------------------------=== -struct PointerCastConversion : public OpConversionPattern { - static bool getIndexConst(Value v, int64_t &out) { - if (auto cst = v.getDefiningOp()) { - if (auto ia = dyn_cast(cst.getValue())) { - out = ia.getValue().getSExtValue(); - return true; - } - } - return false; - } - - using OpConversionPattern::OpConversionPattern; - - enum class TileRole { Vec, Mat, Left, Right, Acc, Bias, Scaling }; - - static void collectUserOpsThroughCasts(Value v, SmallVectorImpl &out) { - for (Operation *u : v.getUsers()) { - if (auto castOp = dyn_cast(u)) { - for (Value r : castOp.getResults()) - 
collectUserOpsThroughCasts(r, out); - continue; - } - out.push_back(u); - } - } - - static Value peelUnrealized(Value v) { - while (auto castOp = v.getDefiningOp()) { - v = castOp.getOperand(0); - } - return v; - } - - static TileRole inferRole(pto::PointerCastOp op) { - // 1. 优先检查 AddressSpace - if (auto memRefTy = dyn_cast(op.getType())) { - Attribute memorySpace = memRefTy.getMemorySpace(); - if (auto ptoAttr = dyn_cast_or_null(memorySpace)) { - switch (ptoAttr.getAddressSpace()) { - case pto::AddressSpace::LEFT: return TileRole::Left; - case pto::AddressSpace::RIGHT: return TileRole::Right; - case pto::AddressSpace::ACC: return TileRole::Acc; - case pto::AddressSpace::BIAS: return TileRole::Bias; - case pto::AddressSpace::MAT: return TileRole::Mat; - case pto::AddressSpace::SCALING: return TileRole::Scaling; - default: break; - } - } - } - - // 2. 通过 Usage 推导 (Fallback) - SmallVector users; - collectUserOpsThroughCasts(op.getResult(), users); - - for (Operation *user : users) { - if (auto mm = dyn_cast(user)) { - if (mm.getDst() && peelUnrealized(mm.getDst()) == op.getResult()) return TileRole::Acc; - if (peelUnrealized(mm.getLhs()) == op.getResult()) return TileRole::Left; - if (peelUnrealized(mm.getRhs()) == op.getResult()) return TileRole::Right; - } - if (auto mmacc = dyn_cast(user)) { - if (mmacc.getDst() && peelUnrealized(mmacc.getDst()) == op.getResult()) return TileRole::Acc; - if (peelUnrealized(mmacc.getAccIn()) == op.getResult()) return TileRole::Acc; - if (peelUnrealized(mmacc.getLhs()) == op.getResult()) return TileRole::Left; - if (peelUnrealized(mmacc.getRhs()) == op.getResult()) return TileRole::Right; - } - } - - return TileRole::Vec; - } - - // [新增] 辅助函数:判断 Value 是否源自 arith.constant - static bool isConstant(Value v, int64_t &outVal) { - if (!v) return false; - if (auto cst = v.getDefiningOp()) { - if (auto attr = dyn_cast(cst.getValue())) { - outVal = attr.getInt(); - return true; - } - } - return false; - } - - LogicalResult 
matchAndRewrite(pto::PointerCastOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - auto selfType = mlir::cast(op.getType()); - ArrayRef shape = selfType.getShape(); - Type elemType = selfType.getElementType(); - - // 1. 推导 Tile Role - TileRole role = inferRole(op); - - // 2. 类型字符串生成 (elemTypeStr, dimStr) - std::string elemTypeStr = "T"; - if (elemType.isF16()) elemTypeStr = "half"; - else if (elemType.isBF16()) elemTypeStr = "bfloat16_t"; - else if (elemType.isF32()) elemTypeStr = "float"; - else if (elemType.isInteger(8)) elemTypeStr = cast(elemType).isUnsigned() ? "uint8_t" : "int8_t"; - else if (elemType.isInteger(16)) elemTypeStr = cast(elemType).isUnsigned() ? "uint16_t" : "int16_t"; - else if (elemType.isInteger(32)) elemTypeStr = cast(elemType).isUnsigned() ? "uint32_t" : "int32_t"; - else if (elemType.isInteger(64)) elemTypeStr = cast(elemType).isUnsigned() ? "uint64_t" : "int64_t"; - - std::string dimStr; - auto dimToString = [](int64_t dim, const char* symbol) -> std::string { - return (dim == ShapedType::kDynamic) ? std::string(symbol) : std::to_string(dim); - }; - - if (role == TileRole::Left) dimStr = dimToString(shape[0], "M") + ", " + dimToString(shape[1], "K"); - else if (role == TileRole::Right) dimStr = dimToString(shape[0], "K") + ", " + dimToString(shape[1], "N"); - else if (role == TileRole::Bias) dimStr = "1, " + dimToString(shape[1], "N"); - else dimStr = dimToString(shape[0], "M") + ", " + dimToString(shape[1], "N"); - - // 3. 
Role Token - const char *roleTok = "TileType::Vec"; - switch (role) { - case TileRole::Left: roleTok = "TileType::Left"; break; - case TileRole::Right: roleTok = "TileType::Right"; break; - case TileRole::Acc: roleTok = "TileType::Acc"; break; - case TileRole::Bias: roleTok = "TileType::Bias"; break; - case TileRole::Mat: roleTok = "TileType::Mat"; break; - case TileRole::Vec: roleTok = "TileType::Vec"; break; - case TileRole::Scaling: roleTok = "TileType::Scaling"; break; - } - - // 4. Config & Layout (support BLayoutAttr/SLayoutAttr/PadValueAttr after namespace change) - std::string layoutParams = "BLayout::RowMajor"; - std::string extraParams = ""; - if (auto configOpt = op.getConfig()) { - auto config = *configOpt; - int32_t blVal = 0; - if (auto attr = dyn_cast(config.getBLayout())) - blVal = static_cast(attr.getValue()); - - if (blVal == 1) layoutParams = "BLayout::ColMajor"; - - int32_t slVal = 0; - if (auto attr = dyn_cast(config.getSLayout())) - slVal = static_cast(attr.getValue()); - - std::string slStr = (slVal == 1) ? "SLayout::RowMajor" : (slVal == 2) ? 
"SLayout::ColMajor" : "SLayout::NoneBox"; - - int32_t frVal = 0; - if (auto attr = dyn_cast(config.getSFractalSize())) frVal = attr.getInt(); - - int32_t padVal = 0; - if (auto attr = dyn_cast(config.getPad())) - padVal = static_cast(attr.getValue()); - - std::string padStr = "PadValue::Null"; - switch (padVal) { - case 1: padStr = "PadValue::Zero"; break; - case 2: padStr = "PadValue::Max"; break; - case 3: padStr = "PadValue::Min"; break; - } - - if (!slStr.empty()) { - extraParams += ", " + slStr + ", " + std::to_string(frVal) + ", " + padStr; - } - } - - // [核心修改] Valid Dims 处理逻辑 (支持混合静态/动态) - std::string vrowTok, vcolTok; - bool useConstructor = false; - - // 引入标志位,明确记录哪个维度是动态的 - bool rowIsDynamic = false; - bool colIsDynamic = false; - - SmallVector constructorArgs; - - Value vRow = op.getValidRow(); - Value vCol = op.getValidCol(); - Value vRowEmitC = adaptor.getValidRow(); - Value vColEmitC = adaptor.getValidCol(); - - int64_t cRow, cCol; - - // --- Row 逻辑 --- - if (vRow && isConstant(vRow, cRow)) { - // Case A: 静态常量 (e.g., 32) - vrowTok = std::to_string(cRow); - } else if (vRow) { - // Case B: 动态变量 (e.g., %arg0) - vrowTok = "-1"; - rowIsDynamic = true; // 标记为动态 - useConstructor = true; - } else { - // Case C: 默认静态 (Shape) - vrowTok = std::to_string(shape[0]); - } - - // --- Col 逻辑 --- - if (vCol && isConstant(vCol, cCol)) { - // Case A: 静态常量 - vcolTok = std::to_string(cCol); - } else if (vCol) { - // Case B: 动态变量 - vcolTok = "-1"; - colIsDynamic = true; // 标记为动态 - useConstructor = true; - } else { - // Case C: 默认静态 - vcolTok = std::to_string(shape[1]); - } - - // --- 收集构造参数 --- - // [修复] 只收集被标记为 Dynamic 的维度的值 - if (useConstructor) { - if (rowIsDynamic && vRowEmitC) constructorArgs.push_back(vRowEmitC); - if (colIsDynamic && vColEmitC) constructorArgs.push_back(vColEmitC); - } - - // 5. 
生成 Tile 类型字符串 - std::string tileTypeStr = - std::string("Tile<") + roleTok + ", " + elemTypeStr + ", " + dimStr + ", " + - layoutParams + ", " + vrowTok + ", " + vcolTok + extraParams + ">"; - - auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); - Value resultValue; - - if (useConstructor) { - // 使用 CallOpaqueOp 生成构造函数调用 (Tile v = Tile(...)) - auto ctorOp = rewriter.create( - loc, - tileType, // Result Type - tileTypeStr, // Callee Name (类名) - ArrayAttr{}, // args - ArrayAttr{}, // template_args - ValueRange(constructorArgs) // operands - ); - resultValue = ctorOp.getResult(0); - } else { - // 静态情况 (Tile v;) - auto varOp = rewriter.create( - loc, - tileType, - emitc::OpaqueAttr::get(ctx, "") - ); - resultValue = varOp.getResult(); - } - - // TASSIGN: pto-isa expects an integral address. - Value addr = adaptor.getAddrs()[0]; - if (isa(addr.getType()) || - (isa(addr.getType()) && - cast(addr.getType()).getValue().ends_with("*"))) { - auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); - auto rcU64 = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "uint64_t")}); - addr = rewriter.create( - loc, u64Ty, "reinterpret_cast", - /*args=*/ArrayAttr{}, /*templateArgs=*/rcU64, - /*operands=*/ValueRange{addr}) - .getResult(0); - } - - rewriter.create( - loc, TypeRange{}, "TASSIGN", - ArrayAttr{}, ArrayAttr{}, - ValueRange{resultValue, addr}); - - rewriter.replaceOp(op, resultValue); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.load_dps / pto.store_dps lowering (FIX: keep optional result) -//===----------------------------------------------------------------------=== - -struct PTOTLoadToTLOAD : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TLoadOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!op.getDst()) - return rewriter.notifyMatchFailure(op, "expected outs(dst) on pto.tload"); - 
- Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - Value srcArg = src; - if (auto srcMrTy = dyn_cast(op.getSrc().getType())) { - bool isGlobal = true; - if (auto asAttr = dyn_cast_or_null(srcMrTy.getMemorySpace())) { - auto as = asAttr.getAddressSpace(); - isGlobal = (as == pto::AddressSpace::GM || as == pto::AddressSpace::Zero); - } - if (isGlobal) { - if (Value gt = buildGlobalTensorFromMemref(rewriter, op.getLoc(), src, srcMrTy, - op.getOperation())) - srcArg = gt; - } - } - - rewriter.create( - op.getLoc(), TypeRange{}, "TLOAD", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, srcArg}); - - if (op->getNumResults() == 1) { - rewriter.replaceOp(op, dst); - } else { - rewriter.eraseOp(op); - } - return success(); - } -}; - -struct PTOTStoreToTSTORE : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TStoreOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!op.getDst()) - return rewriter.notifyMatchFailure(op, "expected outs(dst) on pto.tstore"); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - Value dstArg = dst; - if (auto dstMrTy = dyn_cast(op.getDst().getType())) { - bool isGlobal = true; - if (auto asAttr = dyn_cast_or_null(dstMrTy.getMemorySpace())) { - auto as = asAttr.getAddressSpace(); - isGlobal = (as == pto::AddressSpace::GM || as == pto::AddressSpace::Zero); - } - if (isGlobal) { - if (Value gt = buildGlobalTensorFromMemref(rewriter, op.getLoc(), dst, dstMrTy, - op.getOperation())) - dstArg = gt; - } - } - - rewriter.create( - op.getLoc(), TypeRange{}, "TSTORE", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dstArg, src}); - - if (op->getNumResults() == 1) { - rewriter.replaceOp(op, dst); - } else { - rewriter.eraseOp(op); - } - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.matmul_dps 
lowering (Simplified: No internal copy/sync) -//===----------------------------------------------------------------------===// -struct PTOTMatmulToTMATMUL : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMatmulOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - // 1. 获取操作数 (剥离 Cast) - Value lhs = peelUnrealized(adaptor.getLhs()); // A (Left) - Value rhs = peelUnrealized(adaptor.getRhs()); // B (Right) - Value dst = peelUnrealized(adaptor.getDst()); // C (Acc) - - // 2. 直接生成函数调用 TMATMUL(dst, lhs, rhs) - // 假设输入已经在对应的 L0 Buffer 中 - rewriter.create( - op.getLoc(), TypeRange{}, "TMATMUL", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, lhs, rhs}); - - // 3. 处理 Op 替换/删除 - if (op->getNumResults() == 1) { - rewriter.replaceOp(op, dst); - } else { - rewriter.eraseOp(op); - } - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.tgemv lowering -//===----------------------------------------------------------------------===// -struct PTOTGemvToTGEMV : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TGemvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - // 1. 获取操作数 (剥离 Cast) - Value lhs = peelUnrealized(adaptor.getLhs()); // A (Matrix) - Value rhs = peelUnrealized(adaptor.getRhs()); // B (Vector) - Value dst = peelUnrealized(adaptor.getDst()); // C (Result) - - // 2. 直接生成函数调用 TGEMV(dst, lhs, rhs) - rewriter.create( - op.getLoc(), TypeRange{}, "TGEMV", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, lhs, rhs}); - - // 3. 
处理 Op 替换/删除 - if (op->getNumResults() == 1) { - rewriter.replaceOp(op, dst); - } else { - rewriter.eraseOp(op); - } - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.tgemv.acc lowering -//===----------------------------------------------------------------------===// -struct PTOTGemvAccToTGEMVACC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TGemvAccOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!op.getDst()) - return rewriter.notifyMatchFailure(op, "expected outs(dst) for pto.tgemv.acc"); - - // 1. 获取操作数 - Value accIn = peelUnrealized(adaptor.getAccIn()); // AccOld - Value lhs = peelUnrealized(adaptor.getLhs()); // A (Matrix) - Value rhs = peelUnrealized(adaptor.getRhs()); // B (Vector) - Value dst = peelUnrealized(adaptor.getDst()); // AccNew - - // 2. 直接生成函数调用 TGEMV_ACC(dst, accIn, lhs, rhs) - rewriter.create( - op.getLoc(), TypeRange{}, "TGEMV_ACC", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, accIn, lhs, rhs}); - - // 3. 处理 Op 替换/删除 - if (op->getNumResults() == 1) { - rewriter.replaceOp(op, dst); - } else { - rewriter.eraseOp(op); - } - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.matmul_acc_dps lowering (Simplified: No internal copy/sync) -//===----------------------------------------------------------------------===// -struct PTOTMatmulAccToTMATMULACC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMatmulAccOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!op.getDst()) - return rewriter.notifyMatchFailure(op, "expected outs(dst) for pto.tmatmul.acc"); - - // 1. 
获取操作数 - Value accIn = peelUnrealized(adaptor.getAccIn()); // AccOld - Value lhs = peelUnrealized(adaptor.getLhs()); // A (Left) - Value rhs = peelUnrealized(adaptor.getRhs()); // B (Right) - Value dst = peelUnrealized(adaptor.getDst()); // AccNew - - // 2. 直接生成函数调用 TMATMUL_ACC(dst, accIn, lhs, rhs) - rewriter.create( - op.getLoc(), TypeRange{}, "TMATMUL_ACC", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, accIn, lhs, rhs}); - - // 3. 处理 Op 替换/删除 - if (op->getNumResults() == 1) { - rewriter.replaceOp(op, dst); - } else { - rewriter.eraseOp(op); - } - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// Return lowering -//===----------------------------------------------------------------------=== - -struct ReturnToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(func::ReturnOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto vals = adaptor.getOperands(); - if (vals.empty()) { - rewriter.replaceOpWithNewOp(op, Value{}); - return success(); - } - if (vals.size() == 1) { - rewriter.replaceOpWithNewOp(op, vals[0]); - return success(); - } - return rewriter.notifyMatchFailure(op, "EmitC cannot return multiple values"); - } -}; - -//===----------------------------------------------------------------------===// -// Sync lowering -//===----------------------------------------------------------------------=== - -static std::string getPipeName(pto::PIPE pipe) { - switch (pipe) { - case pto::PIPE::PIPE_S: return "PIPE_S"; - case pto::PIPE::PIPE_V: return "PIPE_V"; - case pto::PIPE::PIPE_M: return "PIPE_M"; - case pto::PIPE::PIPE_MTE1: return "PIPE_MTE1"; - case pto::PIPE::PIPE_MTE2: return "PIPE_MTE2"; - case pto::PIPE::PIPE_MTE3: return "PIPE_MTE3"; - case pto::PIPE::PIPE_ALL: return "PIPE_ALL"; - case pto::PIPE::PIPE_MTE4: return "PIPE_MTE4"; - case pto::PIPE::PIPE_MTE5: return "PIPE_MTE5"; - case 
pto::PIPE::PIPE_V2: return "PIPE_V2"; - case pto::PIPE::PIPE_FIX: return "PIPE_FIX"; - case pto::PIPE::VIRTUAL_PIPE_MTE2_L1A: return "VIRTUAL_PIPE_MTE2_L1A"; - case pto::PIPE::VIRTUAL_PIPE_MTE2_L1B: return "VIRTUAL_PIPE_MTE2_L1B"; - // 默认回退 - default: return "PIPE_ALL"; - } -} - -//===----------------------------------------------------------------------===// -// pto.barrier lowering -> pipe_barrier(...) -//===----------------------------------------------------------------------===// -struct PTOBarrierToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::BarrierOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto *ctx = rewriter.getContext(); - - // [FIX] op.getPipe() returns PipeAttr. - // We must call .getPipe() on the attribute to get the actual Enum value. - pto::PIPE pipeEnum = op.getPipe().getPipe(); - - // Convert Enum to String (e.g., PIPE_ALL -> "PIPE_ALL") - std::string pipeStr = pto::stringifyPIPE(pipeEnum).str(); - - auto args = rewriter.getArrayAttr({ - emitc::OpaqueAttr::get(ctx, pipeStr) - }); - - rewriter.replaceOpWithNewOp( - op, - TypeRange{}, // void return - "pipe_barrier", // function name - args, // arguments - ArrayAttr{}, // template args - ValueRange{} // operands - ); - - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// Sync lowering (robust for bracket form pto.set_flag[...] / pto.wait_flag[...]) -// Replace your PTOSyncToRuntimeCall with the code below. 
-//===----------------------------------------------------------------------===// - -static LogicalResult extractSyncTripletTokens(Operation *op, - std::string &srcTok, - std::string &dstTok, - std::string &evtTok, - ConversionPatternRewriter &rewriter) { - auto *ctx = rewriter.getContext(); - - auto pipeToTok = [](mlir::Attribute a, std::string &out) -> bool { - if (!a) return false; - if (auto p = dyn_cast(a)) { - out = mlir::pto::stringifyPIPE(p.getPipe()).str(); - return true; - } - if (auto s = dyn_cast(a)) { - out = s.getValue().str(); // expects already like "PIPE_MTE2" - return true; - } - return false; - }; - - auto evtToTok = [](mlir::Attribute a, std::string &out) -> bool { - if (!a) return false; - if (auto e = dyn_cast(a)) { - out = mlir::pto::stringifyEVENT(e.getEvent()).str(); - return true; - } - if (auto s = dyn_cast(a)) { - out = s.getValue().str(); // expects already like "EVENT_ID0" - return true; - } - return false; - }; - - auto tryNamed = [&](StringRef s0, StringRef s1, StringRef e0) -> bool { - std::string st, dt, et; - if (!pipeToTok(op->getAttr(s0), st)) return false; - if (!pipeToTok(op->getAttr(s1), dt)) return false; - if (!evtToTok(op->getAttr(e0), et)) return false; - srcTok = std::move(st); - dstTok = std::move(dt); - evtTok = std::move(et); - return true; - }; - - // 1) Most common named-attr encodings - if (tryNamed("src_pipe", "dst_pipe", "event_id")) return success(); - if (tryNamed("srcPipe", "dstPipe", "eventId")) return success(); - if (tryNamed("src", "dst", "event")) return success(); - - // 2) Bracket-form / custom-asm often packs them into an ArrayAttr under some key - auto tryArrayKey = [&](StringRef key) -> bool { - auto arr = op->getAttrOfType(key); - if (!arr || arr.size() < 3) return false; - - std::string st, dt, et; - if (!pipeToTok(arr[0], st)) return false; - if (!pipeToTok(arr[1], dt)) return false; - if (!evtToTok(arr[2], et)) return false; - srcTok = std::move(st); - dstTok = std::move(dt); - evtTok = 
std::move(et); - return true; - }; - - if (tryArrayKey("args") || tryArrayKey("pipes") || tryArrayKey("sync") || - tryArrayKey("triplet") || tryArrayKey("attrs")) - return success(); - - // 3) Last resort: scan everything and pick 2 Pipe + 1 Event in encounter order. - std::vector pipes; - std::string event; - for (auto &na : op->getAttrs()) { - Attribute a = na.getValue(); - std::string tok; - if (pipeToTok(a, tok)) { - pipes.push_back(std::move(tok)); - continue; - } - if (evtToTok(a, tok)) { - event = std::move(tok); - continue; - } - } - - if (pipes.size() >= 2 && !event.empty()) { - srcTok = pipes[0]; - dstTok = pipes[1]; - evtTok = event; - return success(); - } - - return rewriter.notifyMatchFailure(op, "cannot extract PIPE/PIPE/EVENT tokens from pto.{set,wait}_flag"); -} -static inline std::string pipeTokFromPipeEnum(mlir::pto::PIPE p) { - return mlir::pto::stringifyPIPE(p).str(); -} -static inline std::string evtTokFromEventEnum(mlir::pto::EVENT e) { - return mlir::pto::stringifyEVENT(e).str(); -} -static inline std::string pipeTokFromPipeAttr(mlir::pto::PipeAttr a) { - return mlir::pto::stringifyPIPE(a.getPipe()).str(); -} -static inline std::string evtTokFromEventAttr(mlir::pto::EventAttr a) { - return mlir::pto::stringifyEVENT(a.getEvent()).str(); -} - -template -struct HasGetSrcPipe : std::false_type {}; -template -struct HasGetSrcPipe().getSrcPipe())>> : std::true_type {}; - -template -struct HasGetDstPipe : std::false_type {}; -template -struct HasGetDstPipe().getDstPipe())>> : std::true_type {}; - -template -struct HasGetEventId : std::false_type {}; -template -struct HasGetEventId().getEventId())>> : std::true_type {}; - -template -struct HasGetSrcPipeAttr : std::false_type {}; -template -struct HasGetSrcPipeAttr().getSrcPipeAttr())>> : std::true_type {}; - -template -struct HasGetDstPipeAttr : std::false_type {}; -template -struct HasGetDstPipeAttr().getDstPipeAttr())>> : std::true_type {}; - -template -struct HasGetEventIdAttr : std::false_type 
{}; -template -struct HasGetEventIdAttr().getEventIdAttr())>> : std::true_type {}; - -template -static LogicalResult extractSyncTokens(SyncOpT op, - std::string &srcTok, - std::string &dstTok, - std::string &evtTok, - ConversionPatternRewriter &rewriter) { - if constexpr (HasGetSrcPipe::value && - HasGetDstPipe::value && - HasGetEventId::value) { - auto s = op.getSrcPipe(); - auto d = op.getDstPipe(); - auto e = op.getEventId(); - - if constexpr (std::is_same::value) srcTok = pipeTokFromPipeEnum(s); - else srcTok = pipeTokFromPipeAttr(s); - - if constexpr (std::is_same::value) dstTok = pipeTokFromPipeEnum(d); - else dstTok = pipeTokFromPipeAttr(d); - - if constexpr (std::is_same::value) evtTok = evtTokFromEventEnum(e); - else evtTok = evtTokFromEventAttr(e); - - return success(); - } - - if constexpr (HasGetSrcPipeAttr::value && - HasGetDstPipeAttr::value && - HasGetEventIdAttr::value) { - auto s = op.getSrcPipeAttr(); - auto d = op.getDstPipeAttr(); - auto e = op.getEventIdAttr(); - srcTok = pipeTokFromPipeAttr(s); - dstTok = pipeTokFromPipeAttr(d); - evtTok = evtTokFromEventAttr(e); - return success(); - } - - return extractSyncTripletTokens(op.getOperation(), srcTok, dstTok, evtTok, rewriter); -} -struct PTOSetFlagToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(mlir::pto::SetFlagOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - (void)adaptor; - auto *ctx = rewriter.getContext(); - - std::string srcTok, dstTok, evtTok; - if (failed(extractSyncTokens(op, srcTok, dstTok, evtTok, rewriter))) - return failure(); - - auto argsAttr = rewriter.getArrayAttr({ - emitc::OpaqueAttr::get(ctx, srcTok), - emitc::OpaqueAttr::get(ctx, dstTok), - emitc::OpaqueAttr::get(ctx, evtTok), - }); - - rewriter.replaceOpWithNewOp( - op, TypeRange{}, "set_flag", - /*args=*/argsAttr, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{}); - return success(); - } -}; - -struct 
PTOWaitFlagToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(mlir::pto::WaitFlagOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - (void)adaptor; - auto *ctx = rewriter.getContext(); - - std::string srcTok, dstTok, evtTok; - if (failed(extractSyncTokens(op, srcTok, dstTok, evtTok, rewriter))) - return failure(); - - auto argsAttr = rewriter.getArrayAttr({ - emitc::OpaqueAttr::get(ctx, srcTok), - emitc::OpaqueAttr::get(ctx, dstTok), - emitc::OpaqueAttr::get(ctx, evtTok), - }); - - rewriter.replaceOpWithNewOp( - op, TypeRange{}, "wait_flag", - /*args=*/argsAttr, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{}); - return success(); - } -}; - -struct PTOGetBufToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(mlir::pto::GetBufOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - (void)adaptor; - auto *ctx = rewriter.getContext(); - - std::string pipeTok = pipeTokFromPipeAttr(op.getPipe()); - auto argsAttr = rewriter.getArrayAttr({ - emitc::OpaqueAttr::get(ctx, pipeTok), - op.getBufIdAttr(), - op.getModeAttr(), - }); - - rewriter.replaceOpWithNewOp( - op, TypeRange{}, "get_buf", - /*args=*/argsAttr, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{}); - return success(); - } -}; - -struct PTORlsBufToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(mlir::pto::RlsBufOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - (void)adaptor; - auto *ctx = rewriter.getContext(); - - std::string pipeTok = pipeTokFromPipeAttr(op.getPipe()); - auto argsAttr = rewriter.getArrayAttr({ - emitc::OpaqueAttr::get(ctx, pipeTok), - op.getBufIdAttr(), - op.getModeAttr(), - }); - - rewriter.replaceOpWithNewOp( - op, TypeRange{}, "rls_buf", - /*args=*/argsAttr, - 
/*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{}); - return success(); - } -}; - -struct PTOSyncSetToEmitC : public OpConversionPattern { - PTOSyncSetToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, - PTOArch targetArch) - : OpConversionPattern(typeConverter, ctx), - targetArch(targetArch) {} - - LogicalResult - matchAndRewrite(mlir::pto::SyncSetOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - (void)adaptor; - auto *ctx = rewriter.getContext(); - auto loc = op->getLoc(); - - std::string pipeTok = pipeTokFromPipeAttr(op.getPipe()); - auto argsAttr = rewriter.getArrayAttr( - {emitc::OpaqueAttr::get(ctx, pipeTok), op.getEventIdAttr()}); - const char *kSyncSetCallee = (targetArch == PTOArch::A3) - ? "ffts_cross_core_sync" - : "set_intra_block"; - rewriter.create(loc, TypeRange{}, kSyncSetCallee, - /*args=*/argsAttr, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{}); - - rewriter.eraseOp(op); - return success(); - } - - PTOArch targetArch; -}; - -struct PTOSyncWaitToEmitC : public OpConversionPattern { - PTOSyncWaitToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, - PTOArch targetArch) - : OpConversionPattern(typeConverter, ctx), - targetArch(targetArch) {} - - LogicalResult - matchAndRewrite(mlir::pto::SyncWaitOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - (void)adaptor; - auto *ctx = rewriter.getContext(); - auto loc = op->getLoc(); - - std::string pipeTok = pipeTokFromPipeAttr(op.getPipe()); - auto argsAttr = rewriter.getArrayAttr( - {emitc::OpaqueAttr::get(ctx, pipeTok), op.getEventIdAttr()}); - const char *kSyncWaitCallee = - (targetArch == PTOArch::A3) ? 
"wait_flag_dev" : "wait_intra_block"; - rewriter.create(loc, TypeRange{}, kSyncWaitCallee, - argsAttr, ArrayAttr{}, ValueRange{}); - - rewriter.eraseOp(op); - return success(); - } - - PTOArch targetArch; -}; - -// GetBlockIdxOp Lowering (pto.get_block_idx -> get_block_idx()) -struct PTOGetBlockIdxToEmitC - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(mlir::pto::GetBlockIdxOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - - rewriter.replaceOpWithNewOp( - op, op.getType(), "get_block_idx", ValueRange{}, ArrayAttr{}, - ArrayAttr{}); - - return success(); - } -}; - -// GetBlockNumOp Lowering (pto.get_block_num -> get_block_num()) -struct PTOGetBlockNumToEmitC - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(mlir::pto::GetBlockNumOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - - rewriter.replaceOpWithNewOp( - op, op.getType(), "get_block_num", ValueRange{}, ArrayAttr{}, - ArrayAttr{}); - - return success(); - } -}; - -// GetSubBlockIdxOp Lowering (pto.get_block_idx -> get_subblockid()) -struct PTOGetSubBlockIdxToEmitC - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(mlir::pto::GetSubBlockIdxOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - - rewriter.replaceOpWithNewOp( - op, op.getType(), "get_subblockid", ValueRange{}, ArrayAttr{}, - ArrayAttr{}); - - return success(); - } -}; - -// GetSubBlockNumOp Lowering (pto.get_block_num -> get_subblockdim()) -struct PTOGetSubBlockNumToEmitC - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(mlir::pto::GetSubBlockNumOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - - rewriter.replaceOpWithNewOp( - op, op.getType(), 
"get_subblockdim", ValueRange{}, ArrayAttr{}, - ArrayAttr{}); - - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.mscatter lowering -> MSCATTER(mem, src, idx) -// pto.mscatter %src, %mem, %idx : memref<...>, memref<...>, memref<...> -//===----------------------------------------------------------------------===// - -struct PTOMScatterToMSCATTER : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::MScatterOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value src = peelUnrealized(adaptor.getSrc()); - Value mem = peelUnrealized(adaptor.getMem()); - - // pto-isa currently has no NPU implementation for MGATHER/MSCATTER. - // Fallback to a smoke-friendly lowering to keep compile/run coverage. - rewriter.create( - op.getLoc(), TypeRange{}, "TSTORE", - ArrayAttr{}, ArrayAttr{}, - ValueRange{mem, src}); - - rewriter.eraseOp(op); - return success(); - } -}; -struct PTOSetValToSETVAL : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSetValOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value dst = peelUnrealized(adaptor.getDst()); - Value val = peelUnrealized(adaptor.getVal()); - - // ---- offset: SSA index operand ---- - Value offset = peelUnrealized(adaptor.getOffset()); - - // NOTE: EmitC has no direct member-call op today. 
We emit a marker call - // and post-process ptoas output to rewrite it into: - // dst.SetValue(offset, val); - rewriter.create( - op.getLoc(), TypeRange{}, "PTOAS__TILE_SET_VALUE", - ArrayAttr{}, ArrayAttr{}, ValueRange{dst, offset, val}); - - rewriter.eraseOp(op); - return success(); - } -}; -struct PTOGetValToGETVAL : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TGetValOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value src = peelUnrealized(adaptor.getSrc()); - - // ---- offset: SSA index operand ---- - Value offset = peelUnrealized(adaptor.getOffset()); - - // NOTE: EmitC has no direct member-call op today. We emit a marker call - // and post-process ptoas output to rewrite it into: - // auto x = src.GetValue(offset); - Type dstTy = getTypeConverter()->convertType(op.getDst().getType()); - if (!dstTy) - return failure(); - auto call = rewriter.create( - op.getLoc(), - TypeRange{dstTy}, - "PTOAS__TILE_GET_VALUE", - ArrayAttr{}, ArrayAttr{}, - ValueRange{src, offset}); - - rewriter.replaceOp(op, call.getResults()); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.load_scalar / pto.store_scalar lowering -> ptr[offset] -//===----------------------------------------------------------------------===// - -struct PTOLoadScalarToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::LoadScalarOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value ptr = peelUnrealized(adaptor.getPtr()); - Value offset = peelUnrealized(adaptor.getOffset()); - - Type dstTy = getTypeConverter()->convertType(op.getValue().getType()); - if (!dstTy) - return failure(); - - auto call = rewriter.create( - op.getLoc(), TypeRange{dstTy}, "PTOAS__PTR_LOAD", - ArrayAttr{}, ArrayAttr{}, ValueRange{ptr, offset}); - - 
rewriter.replaceOp(op, call.getResults()); - return success(); - } -}; - -struct PTOStoreScalarToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::StoreScalarOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value ptr = peelUnrealized(adaptor.getPtr()); - Value offset = peelUnrealized(adaptor.getOffset()); - Value val = peelUnrealized(adaptor.getValue()); - - rewriter.create( - op.getLoc(), TypeRange{}, "PTOAS__PTR_STORE", - ArrayAttr{}, ArrayAttr{}, ValueRange{ptr, offset, val}); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.tabs lowering -> TABS(dst, src) -//===----------------------------------------------------------------------===// - -struct PTOTAbsToTABS : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TAbsOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - // intrinsic: TABS(dst, src) - rewriter.create( - op.getLoc(), TypeRange{}, "TABS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.tadd lowering -> TADD(dst, src0, src1) -//===----------------------------------------------------------------------===// - -struct PTOTAddToTADD : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TAddOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - rewriter.create( - 
op.getLoc(), TypeRange{}, "TADD", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src0, src1}); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// populate patterns -//===----------------------------------------------------------------------=== -struct ReinterpretCastToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(memref::ReinterpretCastOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - auto resMrTy = dyn_cast(op.getType()); - if (!resMrTy) - return failure(); - - auto asAttr = dyn_cast_or_null(resMrTy.getMemorySpace()); - const bool isGm = (!asAttr || asAttr.getAddressSpace() == pto::AddressSpace::GM); - - bool emitAddPtrTrace = op->hasAttr("pto.addptr_trace"); - Value source = peelUnrealized(adaptor.getSource()); - auto offsets = adaptor.getOffsets(); - Value offsetVal = offsets.empty() ? Value() : offsets[0]; - - // GM: keep pointer arithmetic. - if (isGm) { - if (!offsetVal) { - rewriter.replaceOp(op, source); - return success(); - } - - Type resultType = getTypeConverter()->convertType(op.getType()); - if (!resultType) - return failure(); - - auto addOp = rewriter.create(loc, resultType, source, offsetVal); - if (emitAddPtrTrace) { - rewriter.setInsertionPointAfter(addOp); - rewriter.create( - loc, TypeRange{}, "PTOAS__ADDPTR_TRACE", - ArrayAttr{}, ArrayAttr{}, - ValueRange{addOp.getResult(), source, offsetVal}); - } - rewriter.replaceOp(op, addOp.getResult()); - return success(); - } - - // UB/L1/L0 tiles: materialize a new Tile view by assigning an adjusted - // underlying pointer (in elements). - pto::AddressSpace as = asAttr.getAddressSpace(); - - // Element type token. 
- std::string elemTok = "float"; - Type elemTy = resMrTy.getElementType(); - int64_t elemBytes = 4; - if (elemTy.isF16()) - elemBytes = 2, - elemTok = "half"; - else if (elemTy.isBF16()) - elemBytes = 2, - elemTok = "bfloat16_t"; - else if (elemTy.isF32()) - elemBytes = 4, - elemTok = "float"; - else if (elemTy.isInteger(8)) - elemBytes = 1, - elemTok = cast(elemTy).isUnsigned() ? "uint8_t" : "int8_t"; - else if (elemTy.isInteger(16)) - elemBytes = 2, - elemTok = cast(elemTy).isUnsigned() ? "uint16_t" : "int16_t"; - else if (elemTy.isInteger(32)) - elemBytes = 4, - elemTok = cast(elemTy).isUnsigned() ? "uint32_t" : "int32_t"; - else if (elemTy.isInteger(64)) - elemBytes = 8, - elemTok = cast(elemTy).isUnsigned() ? "uint64_t" : "int64_t"; - - // Tile role. - const char *roleTok = "TileType::Vec"; - switch (as) { - case pto::AddressSpace::VEC: - roleTok = "TileType::Vec"; - break; - case pto::AddressSpace::MAT: - roleTok = "TileType::Mat"; - break; - case pto::AddressSpace::LEFT: - roleTok = "TileType::Left"; - break; - case pto::AddressSpace::RIGHT: - roleTok = "TileType::Right"; - break; - case pto::AddressSpace::ACC: - roleTok = "TileType::Acc"; - break; - case pto::AddressSpace::BIAS: - roleTok = "TileType::Bias"; - break; - case pto::AddressSpace::GM: - roleTok = "TileType::Vec"; - break; - } - - // Shape (fallback to 32x32). - int64_t rows = 32, cols = 32; - if (resMrTy.getRank() >= 2 && resMrTy.hasStaticShape()) { - rows = resMrTy.getDimSize(0); - cols = resMrTy.getDimSize(1); - } - - // Keep a conservative default config for now. 
- std::string tileTypeStr = - std::string("Tile<") + roleTok + ", " + elemTok + ", " + - std::to_string(rows) + ", " + std::to_string(cols) + - ", BLayout::RowMajor, " + std::to_string(rows) + ", " + - std::to_string(cols) + ", SLayout::NoneBox, 512, PadValue::Null>"; - - auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); - Value tile = rewriter - .create(loc, tileType, - emitc::OpaqueAttr::get(ctx, "")) - .getResult(); - - // Compute an integer address and assign it to the new tile. - // NOTE: pto-isa TASSIGN requires an integral address (not a pointer). - auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); - auto rcU64 = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "uint64_t")}); - - // Non-GM reinterpret_cast operands come from UB/L1/L0 tiles. - // We need the underlying address, but `__cce_get_tile_ptr()` is only valid - // inside `__tf__` functions. Use `tile.data()` (via a post-processed marker) - // and compute the adjusted address in bytes. - Value rawPtr = source; - if (auto ot = dyn_cast(source.getType())) { - // Only Tiles have a `.data()` member. For plain address-space pointers - // (e.g. `__ubuf__ float*`), use the pointer value directly. 
- if (ot.getValue().starts_with("Tile<")) { - std::string rawPtrTok = - std::string(addrSpaceQualifier(as)) + " " + elemTok + "*"; - auto rawPtrTy = emitc::OpaqueType::get(ctx, rawPtrTok); - rawPtr = rewriter - .create(loc, rawPtrTy, - "PTOAS__TILE_DATA", ArrayAttr{}, - ArrayAttr{}, ValueRange{source}) - .getResult(0); - } - } - - Value baseAddr = rewriter - .create(loc, u64Ty, "reinterpret_cast", - /*args=*/ArrayAttr{}, - /*templateArgs=*/rcU64, - /*operands=*/ValueRange{rawPtr}) - .getResult(0); - - Value addr = baseAddr; - if (offsetVal) { - Value offU64 = offsetVal; - if (offU64.getType() != u64Ty) - offU64 = rewriter.create(loc, u64Ty, offU64).getResult(); - - auto bytesAttr = emitc::OpaqueAttr::get(ctx, std::to_string(elemBytes)); - Value bytesVal = rewriter.create(loc, u64Ty, bytesAttr); - Value byteOff = rewriter.create(loc, u64Ty, offU64, bytesVal); - addr = rewriter.create(loc, u64Ty, baseAddr, byteOff); - } - - rewriter.create(loc, TypeRange{}, "TASSIGN", - /*args=*/ArrayAttr{}, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{tile, addr}); - - rewriter.replaceOp(op, tile); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.taddc lowering -> TADDC(dst, src0, src1, src2) -//===----------------------------------------------------------------------===// - -struct PTOTAddCToTADDC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TAddCOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value src2 = peelUnrealized(adaptor.getSrc2()); - Value dst = peelUnrealized(adaptor.getDst()); - - // pto-isa does not provide NPU implementation for TADDC yet. 
- // Decompose: dst = src0 + src1 + src2 - rewriter.create( - loc, TypeRange{}, "TADD", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src0, src1}); - rewriter.create( - loc, TypeRange{}, "TADD", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, dst, src2}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.tadds lowering -> TADDS(dst, src, scalar) -//===----------------------------------------------------------------------===// - -struct PTOAddSToTADDS : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TAddSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - Value scalar = peelUnrealized(adaptor.getScalar()); - - rewriter.create( - op.getLoc(), TypeRange{}, "TADDS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src, scalar}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.taddsc lowering -> TADDSC(dst, src0, scalar, src1) -//===----------------------------------------------------------------------===// - -struct PTOAddSCToTADDSC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TAddSCOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value scalar = peelUnrealized(adaptor.getScalar()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - // pto-isa does not provide NPU implementation for TADDSC yet. 
- // Decompose: dst = src0 + scalar + src1 - rewriter.create( - loc, TypeRange{}, "TADDS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src0, scalar}); - rewriter.create( - loc, TypeRange{}, "TADD", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, dst, src1}); - - rewriter.eraseOp(op); - return success(); - } -}; -struct PTOTAndToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TAndOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value a = peelUnrealized(adaptor.getSrc0()); - Value b = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - rewriter.create( - op.getLoc(), TypeRange{}, "TAND", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, a, b}); - - rewriter.eraseOp(op); - return success(); - } -}; -struct PTOAndSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TAndSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value src = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - Value dst = peelUnrealized(adaptor.getDst()); - - rewriter.create( - op.getLoc(), TypeRange{}, "TANDS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src, scalar}); - - rewriter.eraseOp(op); - return success(); - } -}; - - -struct PTOTCIToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TCIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value dst = peelUnrealized(adaptor.getDst()); - Value S = peelUnrealized(adaptor.getS()); - - // scalar cpp type token - std::string scalarTok = "int32_t"; - if (auto it = S.getType().dyn_cast()) { - scalarTok = (it.getWidth() == 16) ? 
"int16_t" : "int32_t"; - } - - // descending -> "0"/"1" - std::string descTok = op.getDescending() ? "1" : "0"; - - ArrayAttr targs; - if (auto ot = dst.getType().dyn_cast()) { - std::string tileTok = ot.getValue().str(); // "Tile<...>" - targs = rewriter.getArrayAttr({ - emitc::OpaqueAttr::get(ctx, tileTok), - emitc::OpaqueAttr::get(ctx, scalarTok), - emitc::OpaqueAttr::get(ctx, descTok), - }); - } else { - targs = rewriter.getArrayAttr({}); - } - - rewriter.create( - loc, TypeRange{}, "TCI", - /*args=*/ArrayAttr{}, - /*templateArgs=*/targs, - /*operands=*/ValueRange{dst, S}); - - rewriter.eraseOp(op); - return success(); - } -}; -static std::string cmpModeTok(pto::CmpModeAttr a) { - // 生成 "CmpMode::GT" 这种 token - auto m = a.getValue(); // 取 enum - switch (m) { - case pto::CmpMode::EQ: return "CmpMode::EQ"; - case pto::CmpMode::NE: return "CmpMode::NE"; - case pto::CmpMode::LT: return "CmpMode::LT"; - case pto::CmpMode::LE: return "CmpMode::LE"; - case pto::CmpMode::GT: return "CmpMode::GT"; - case pto::CmpMode::GE: return "CmpMode::GE"; - } - return "CmpMode::EQ"; -} -struct PTOColExpandToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TColExpandOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value dst = peelUnrealized(adaptor.getDst()); - Value src = peelUnrealized(adaptor.getSrc()); - - rewriter.create( - loc, TypeRange{}, "TCOLEXPAND", - /*args=*/ArrayAttr(), - /*templateArgs=*/ArrayAttr(), - /*operands=*/ValueRange{dst, src}); - - rewriter.eraseOp(op); - return success(); - } -}; - -struct PTOCmpToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TCmpOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value dst = 
peelUnrealized(adaptor.getDst()); - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - - std::string tok = "CmpMode::EQ"; - if (auto a = op.getCmpModeAttr()) - tok = cmpModeTok(a); - - auto modeTy = emitc::OpaqueType::get(ctx, "CmpMode"); - Value modeVal = rewriter.create( - loc, modeTy, emitc::OpaqueAttr::get(ctx, tok)); - - auto argsAttr = rewriter.getArrayAttr({}); - - rewriter.create( - loc, - TypeRange{}, - "TCMP", - /*args=*/ArrayAttr{}, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src0, src1, modeVal}); - - rewriter.eraseOp(op); - return success(); - } -}; - -struct PTOCmpSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TCmpSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value dst = peelUnrealized(adaptor.getDst()); - Value src = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - - // cmpMode -> token - auto cmpAttr = op.getCmpModeAttr(); // PTO_CmpModeAttr - std::string tok = cmpModeTok(cmpAttr); - - auto modeTy = emitc::OpaqueType::get(ctx, "CmpMode"); - Value modeVal = rewriter.create( - loc, modeTy, emitc::OpaqueAttr::get(ctx, tok)); - - rewriter.create( - loc, - TypeRange{}, - "TCMPS", - /*args=*/ArrayAttr{}, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src, scalar, modeVal}); - - rewriter.eraseOp(op); - return success(); - } -}; - - -struct PTOColMaxToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TColMaxOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - // intrinsic: TCOLMAX(dst, 
src) - rewriter.create( - loc, TypeRange{}, "TCOLMAX", - /*args=*/ArrayAttr{}, // default: print all operands - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src}); - - rewriter.eraseOp(op); - return success(); - } -}; -struct PTOColMinToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TColMinOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - // intrinsic: TCOLMIN(dst, src) - rewriter.create( - loc, TypeRange{}, "TCOLMIN", - /*args=*/ArrayAttr{}, // default: print all operands - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src}); - - rewriter.eraseOp(op); - return success(); - } -}; -struct PTOColSumToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TColSumOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - // Check if tmp exists before accessing it - if (op.getTmp()) { - // Format 2: with tmp and isBinary - Value tmp = peelUnrealized(adaptor.getTmp()); - bool isBinary = false; - if (auto a = op.getIsBinaryAttr()) - isBinary = a.getValue(); - - auto boolTy = emitc::OpaqueType::get(ctx, "bool"); - auto tok = isBinary ? 
"true" : "false"; - Value isBinaryVal = rewriter.create( - loc, boolTy, emitc::OpaqueAttr::get(ctx, tok)); - - rewriter.create( - loc, TypeRange{}, "TCOLSUM", - /*args=*/ArrayAttr(), - /*templateArgs=*/ArrayAttr(), - /*operands=*/ValueRange{dst, src, tmp, isBinaryVal}); - } else { - // Format 1: without tmp and isBinary - rewriter.create( - loc, TypeRange{}, "TCOLSUM", - /*args=*/ArrayAttr(), - /*templateArgs=*/ArrayAttr(), - /*operands=*/ValueRange{dst, src}); - } - - rewriter.eraseOp(op); - return success(); - } -}; -static std::string roundModeTok(mlir::pto::RoundModeAttr attr) { - using RM = mlir::pto::RoundMode; - switch (attr.getValue()) { - case RM::NONE: return "RoundMode::CAST_NONE"; - case RM::RINT: return "RoundMode::CAST_RINT"; - case RM::ROUND: return "RoundMode::CAST_ROUND"; - case RM::FLOOR: return "RoundMode::CAST_FLOOR"; - case RM::CEIL: return "RoundMode::CAST_CEIL"; - case RM::TRUNC: return "RoundMode::CAST_TRUNC"; - case RM::ODD: return "RoundMode::CAST_ODD"; - case RM::CAST_RINT: return "RoundMode::CAST_RINT"; - } - return "RoundMode::CAST_RINT"; -} -struct PTOCvtToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TCvtOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - // rmode default: CAST_RINT - pto::RoundModeAttr rmAttr = op.getRmodeAttr(); - std::string rmTok = rmAttr ? 
roundModeTok(rmAttr) - : std::string("RoundMode::CAST_RINT"); - - // 生成: TCVT(dst, src, RoundMode::XXX) - auto rmodeTy = emitc::OpaqueType::get(ctx, "RoundMode"); - Value rmodeVal = rewriter.create( - loc, rmodeTy, emitc::OpaqueAttr::get(ctx, rmTok)); - - // 这里 args 被清空,只保留 operands,包括 src, dst 和 rmode - rewriter.create( - loc, TypeRange{}, "TCVT", - /*args=*/ArrayAttr{}, // 不使用 args - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src, rmodeVal}); // 传递 dst, src 和 rmode - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.tdiv lowering -> TDIV(dst, src0, src1) -//===----------------------------------------------------------------------===// - -struct PTODivToTDIV : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TDivOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - rewriter.create( - op.getLoc(), TypeRange{}, "TDIV", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src0, src1}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.tdivs lowering -> TDIVS(dst, src, scalar) or TDIVS(dst, scalar, src) -// Order is determined by operand types: if src is tile_buf, order is (tile, scalar) -// Otherwise, order is (scalar, tile) -//===----------------------------------------------------------------------===// - -struct PTODivSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TDivSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - // Check types BEFORE conversion (using original op types, not 
adaptor types) - // The adaptor types may already be converted to emitc.opaque - Value origSrc = op.getSrc(); - Value origScalar = op.getScalar(); - - // Determine order based on original operand types - // Check if src is memref/tensor/partition_tensor_view/tile (not scalar) - bool srcIsMemref = (isa(origSrc.getType()) || - isa(origSrc.getType()) || - isa(origSrc.getType()) || - isa(origSrc.getType())); - // Check if scalar is memref/tensor/partition_tensor_view/tile (not scalar) - bool scalarIsMemref = (isa(origScalar.getType()) || - isa(origScalar.getType()) || - isa(origScalar.getType()) || - isa(origScalar.getType())); - - Value src = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - Value dst = peelUnrealized(adaptor.getDst()); - - if (srcIsMemref && !scalarIsMemref) { - // memref/scalar: TDIVS(dst, src, scalar) - normal order - rewriter.create( - loc, TypeRange{}, "TDIVS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src, scalar}); - } else if (!srcIsMemref && scalarIsMemref) { - // scalar/memref: TDIVS(dst, scalar, src) - swapped order - rewriter.create( - loc, TypeRange{}, "TDIVS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, scalar, src}); - } else { - // This should not happen if verifier is correct, but provide a fallback - return op.emitError("TDivSOp: expected exactly one memref/tensor operand and one scalar operand"); - } - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// pto.tdivs (TDivSOp) lowering -> TDIVS(dst, src, scalar) or TDIVS(dst, scalar, src) -// Order is determined by operand types: if src is tile_buf, order is (tile, scalar) -// Otherwise, order is (scalar, tile) -//===----------------------------------------------------------------------===// - -struct PTOTDivSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TDivSOp op, 
OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - Value dst = peelUnrealized(adaptor.getDst()); - - // Determine order based on operand types - bool srcIsTile = isa(src.getType()); - bool scalarIsTile = isa(scalar.getType()); - - if (srcIsTile && !scalarIsTile) { - // tile/scalar: TDIVS(dst, src, scalar) - rewriter.create( - loc, TypeRange{}, "TDIVS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src, scalar}); - } else if (!srcIsTile && scalarIsTile) { - // scalar/tile: TDIVS(dst, scalar, src) - rewriter.create( - loc, TypeRange{}, "TDIVS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, scalar, src}); - } else { - // Default: assume src is tile (should not happen if types are correct) - rewriter.create( - loc, TypeRange{}, "TDIVS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src, scalar}); - } - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.texp lowering -> TEXP(dst, src) -//===----------------------------------------------------------------------===// - -struct PTOExpToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TExpOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - rewriter.create( - loc, TypeRange{}, "TEXP", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, src}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.texpands lowering -> TEXPANDS(dst, scalar) -//===----------------------------------------------------------------------===// - -struct PTOExpandsToEmitC : public 
OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TExpandsOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value scalar = peelUnrealized(adaptor.getScalar()); - Value dst = peelUnrealized(adaptor.getDst()); - - rewriter.create( - loc, TypeRange{}, "TEXPANDS", - ArrayAttr{}, ArrayAttr{}, - ValueRange{dst, scalar}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.textract lowering -> TEXTRACT(dst, src, indexRow, indexCol) -//===----------------------------------------------------------------------===// - -struct PTOExtractToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TExtractOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - Value r0 = peelUnrealized(adaptor.getIndexRow()); - Value c0 = peelUnrealized(adaptor.getIndexCol()); - - rewriter.create( - loc, TypeRange{}, "TEXTRACT", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src, r0, c0}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.tfillpad lowering -> TFILLPAD_EXPAND(dst, src) -//===----------------------------------------------------------------------===// - -struct PTOFillPadToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TFillPadOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - 
rewriter.create( - loc, TypeRange{}, "TFILLPAD_EXPAND", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// pto.tgather lowering -// - Index form: TGATHER(dst, src0, indices) -// - Mask form : TGATHER(dst, src0) -//===----------------------------------------------------------------------===// - -static std::string maskPatternTok(mlir::pto::MaskPatternAttr a) { - - auto v = a.getValue(); // enum - return (std::string("pto::MaskPattern::") + mlir::pto::stringifyMaskPattern(v).str()); -} - -struct PTOGatherToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TGatherOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value dst = peelUnrealized(adaptor.getDst()); - Value src0 = peelUnrealized(adaptor.getSrc()); - - // Case 1: index-based TGATHER(dst, src0, indices) - if (Value idx = adaptor.getIndices()) { - idx = peelUnrealized(idx); - - rewriter.create( - loc, TypeRange{}, "TGATHER", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src0, idx}); - - rewriter.eraseOp(op); - return success(); - } - - // Case 2: mask-pattern TGATHER(dst, src0) - auto mp = op.getMaskPatternAttr(); - if (!mp) - return rewriter.notifyMatchFailure(op, "expected maskPattern when indices is absent"); - - auto getOpaqueTok = [&](Value v, StringRef name) -> FailureOr { - if (auto ot = v.getType().dyn_cast()) - return ot.getValue().str(); - return rewriter.notifyMatchFailure(op, (name + " must be emitc::OpaqueType (tile)").str()); - }; - - auto dstTokOr = getOpaqueTok(dst, "dst"); - auto srcTokOr = getOpaqueTok(src0, "src0"); - if (failed(dstTokOr) || failed(srcTokOr)) - return failure(); - - // mp is an EnumAttr; 
stringify name is "P0101" etc. - // We emit MaskPattern::P0101 (because generated C++ has `using namespace pto;`) - std::string mpTok = std::string("MaskPattern::") + - mlir::pto::stringifyMaskPattern(mp.getValue()).str(); - - auto targs = rewriter.getArrayAttr({ - emitc::OpaqueAttr::get(ctx, *dstTokOr), - emitc::OpaqueAttr::get(ctx, *srcTokOr), - emitc::OpaqueAttr::get(ctx, mpTok), - }); - - rewriter.create( - loc, TypeRange{}, "TGATHER", - /*args=*/ArrayAttr{}, - /*templateArgs=*/targs, - /*operands=*/ValueRange{dst, src0}); - - rewriter.eraseOp(op); - return success(); - } -}; - - -struct PTOGatherbToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TGatherBOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value offsets = peelUnrealized(adaptor.getOffsets()); - Value dst = peelUnrealized(adaptor.getDst()); - - rewriter.create( - loc, TypeRange{}, "TGATHERB", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src, offsets}); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// TLOG lowering to EmitC (PTOConvert.cpp) -//===----------------------------------------------------------------------===// - -struct PTOLogToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TLogOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TLOG", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - - 
- -//===----------------------------------------------------------------------===// -// TLRELU lowering to EmitC (PTOConvert.cpp) -//===----------------------------------------------------------------------===// - - struct PTOLReluToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TLReluOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value slope = peelUnrealized(adaptor.getSlope()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src, slope}; - - rewriter.create( - loc, TypeRange{}, "TLRELU", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// TMAX lowering to EmitC (PTOConvert.cpp) -//===----------------------------------------------------------------------===// - -struct PTOMaxToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMaxOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TMAX", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// TMAXS lowering to EmitC (PTOConvert.cpp) -//===----------------------------------------------------------------------===// - - struct PTOMaxSToEmitC : public OpConversionPattern { - using 
OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMaxSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, scalar}; - rewriter.create( - loc, TypeRange{}, "TMAXS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - - -//===----------------------------------------------------------------------===// -// TMIN lowering to EmitC (PTOConvert.cpp) -//===----------------------------------------------------------------------===// - -struct PTOMinToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMinOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TMIN", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// TMINS lowering to EmitC (PTOConvert.cpp) -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// TMINS lowering to EmitC (fix APFloat -> FloatAttr) (PTOConvert.cpp) -//===----------------------------------------------------------------------===// - -struct PTOMinsToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult 
matchAndRewrite(pto::TMinSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - Value scalar = peelUnrealized(adaptor.getScalar()); - - SmallVector operands{dst, src, scalar}; - rewriter.create( - loc, TypeRange{}, "TMINS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering for TMOV op -> EmitC) -//===----------------------------------------------------------------------===// - -struct PTOMovToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMovOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TMOV", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TMOV_FP DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOMovFPToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMovFPOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - Value dst = peelUnrealized(adaptor.getDst()); - Value src = peelUnrealized(adaptor.getSrc()); - Value fp = peelUnrealized(adaptor.getFp()); - - // 
TMOV_FP(dstTileData, cTile, fbTile) - ArrayAttr templateArgs; - auto dstOT = dst.getType().dyn_cast(); - auto srcOT = src.getType().dyn_cast(); - auto fpOT = fp.getType().dyn_cast(); - if (dstOT && srcOT && fpOT) { - templateArgs = rewriter.getArrayAttr({ - emitc::OpaqueAttr::get(ctx, dstOT.getValue().str()), - emitc::OpaqueAttr::get(ctx, srcOT.getValue().str()), - emitc::OpaqueAttr::get(ctx, fpOT.getValue().str()), - }); - } else { - templateArgs = ArrayAttr{}; - } - - SmallVector operands{dst, src, fp}; - rewriter.create( - loc, TypeRange{}, "TMOV_FP", - /*args=*/ArrayAttr{}, /*templateArgs=*/templateArgs, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TMRGSORT DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOMrgSortToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMrgSortOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - if (op.isFormat1()) { - Value src = peelUnrealized(adaptor.getSrcs().front()); - Value dst = peelUnrealized(adaptor.getDsts().front()); - Value blockLen = peelUnrealized(adaptor.getBlockLen()); - - SmallVector operands{dst, src, blockLen}; - rewriter.create( - loc, TypeRange{}, "TMRGSORT", - ArrayAttr{}, ArrayAttr{}, operands); - } else if (op.isFormat2()) { - // pto-isa API: - // TMRGSORT( - // dst, executedNumList, tmp, src0, src1, src2, src3); - auto *ctx = rewriter.getContext(); - - Value dst = peelUnrealized(adaptor.getDsts()[0]); - Value tmp = peelUnrealized(adaptor.getDsts()[1]); - Value excuted = peelUnrealized(adaptor.getExcuted()); - - SmallVector srcs; - srcs.reserve(4); - for (Value v : adaptor.getSrcs()) - srcs.push_back(peelUnrealized(v)); - - auto dstOT = 
dst.getType().dyn_cast(); - auto tmpOT = tmp.getType().dyn_cast(); - if (!dstOT || !tmpOT || srcs.size() != 4) - return op.emitOpError("format2 expects (dst,tmp) tilebufs and exactly 4 srcs"); - - SmallVector targs; - targs.reserve(7); - targs.push_back(emitc::OpaqueAttr::get(ctx, dstOT.getValue().str())); - targs.push_back(emitc::OpaqueAttr::get(ctx, tmpOT.getValue().str())); - for (Value v : srcs) { - auto ot = v.getType().dyn_cast(); - if (!ot) - return op.emitOpError("format2 expects tilebuf srcs"); - targs.push_back(emitc::OpaqueAttr::get(ctx, ot.getValue().str())); - } - targs.push_back(emitc::OpaqueAttr::get(ctx, op.getExhausted() ? "true" : "false")); - ArrayAttr templateArgs = rewriter.getArrayAttr(targs); - - SmallVector operands{dst, excuted, tmp}; - operands.append(srcs.begin(), srcs.end()); - - rewriter.create( - loc, TypeRange{}, "TMRGSORT", - /*args=*/ArrayAttr{}, /*templateArgs=*/templateArgs, operands); - } else { - return op.emitOpError("unsupported mrgsort_dps format"); - } - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TMUL DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOMulToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMulOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TMUL", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; 
-//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TMULS DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOMulsToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMulSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc0()); - Value dst = peelUnrealized(adaptor.getDst()); - Value scalar = peelUnrealized(adaptor.getScalar()); - - SmallVector operands{dst, src, scalar}; - rewriter.create( - loc, TypeRange{}, "TMULS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TNEG DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTONegToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TNegOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TNEG", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TNOT DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTONotToEmitC : public OpConversionPattern { - using 
OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TNotOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TNOT", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TOR DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOOrToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TOrOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TOR", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TORS DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOOrsToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TOrSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - // NOTE: The conversion type system 
may materialize integers as emitc.opaque - // (e.g. "int32_t"). For EmitC call emission we can pass the scalar through - // directly without arith casts here. - Value s = adaptor.getScalar(); - - SmallVector operands{dst, src0, s}; - rewriter.create( - loc, TypeRange{}, "TORS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TPARTADD DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOPartAddToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TPartAddOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TPARTADD", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TPARTMAX DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOPartMaxToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TPartMaxOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - 
rewriter.create( - loc, TypeRange{}, "TPARTMAX", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TPARTMIN DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOPartMinToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TPartMinOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TPARTMIN", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TPRELU DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOPreluToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TPReluOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - // pto-isa TPRELU requires a tmp tile argument. Current NPU implementation - // does not use tmp, so we safely pass dst as tmp for compatibility. 
- SmallVector operands{dst, src0, src1, dst}; - rewriter.create( - loc, TypeRange{}, "TPRELU", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TRECIP DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORecipToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRecipOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TRECIP", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TRELU DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOReluToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TReluOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TRELU", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TREM 
DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORemToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRemOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TREM", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TREMS DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORemSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRemSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - Value scalar = peelUnrealized(adaptor.getScalar()); - - SmallVector operands{dst, src, scalar}; - rewriter.create( - loc, TypeRange{}, "TREMS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TROWEXPAND DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORowExpandToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult 
matchAndRewrite(pto::TRowExpandOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TROWEXPAND", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TROWEXPANDDIV DPS/memref op) -//===----------------------------------------------------------------------===// -// Helper: replace or erase based on whether op has results. -static void replaceOrEraseWithOpaqueCall(Operation *op, - StringRef callee, - ArrayRef args, - ConversionPatternRewriter &rewriter) { - TypeRange resultTypes = op->getResultTypes(); - auto call = rewriter.create( - op->getLoc(), resultTypes, callee, ArrayAttr{}, ArrayAttr{}, ValueRange(args)); - if (resultTypes.empty()) - rewriter.eraseOp(op); - else - rewriter.replaceOp(op, call.getResults()); -} - -// ---------- TOp ---------- -struct PTOTGemvBiasToTGEMV_BIAS - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TGemvBiasOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value a = peelUnrealized(adaptor.getA()); - Value b = peelUnrealized(adaptor.getB()); - Value bias = peelUnrealized(adaptor.getBias()); - Value dst = peelUnrealized(adaptor.getDst()); - - replaceOrEraseWithOpaqueCall(op.getOperation(), "TGEMV_BIAS", - {dst, a, b, bias}, rewriter); - return success(); - } -}; - -struct PTOTMatmulBiasToTMATMUL_BIAS - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMatmulBiasOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) 
const override { - Value a = peelUnrealized(adaptor.getA()); - Value b = peelUnrealized(adaptor.getB()); - Value bias = peelUnrealized(adaptor.getBias()); - Value dst = peelUnrealized(adaptor.getDst()); - - replaceOrEraseWithOpaqueCall(op.getOperation(), "TMATMUL_BIAS", - {dst, a, b, bias}, rewriter); - return success(); - } -}; - -struct PTOTMatmulMXToTMATMUL_MX - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMatmulMxOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value a = peelUnrealized(adaptor.getA()); - Value aScale = peelUnrealized(adaptor.getAScale()); - Value b = peelUnrealized(adaptor.getB()); - Value bScale = peelUnrealized(adaptor.getBScale()); - Value dst = peelUnrealized(adaptor.getDst()); - - replaceOrEraseWithOpaqueCall(op.getOperation(), "TMATMUL_MX", - {dst, a, aScale, b, bScale}, rewriter); - return success(); - } -}; - -struct PTOTMatmulMXAccToTMATMUL_MX_ACC - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMatmulMxAccOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value cIn = peelUnrealized(adaptor.getCIn()); - Value a = peelUnrealized(adaptor.getA()); - Value aScale = peelUnrealized(adaptor.getAScale()); - Value b = peelUnrealized(adaptor.getB()); - Value bScale = peelUnrealized(adaptor.getBScale()); - Value dst = peelUnrealized(adaptor.getDst()); - - replaceOrEraseWithOpaqueCall(op.getOperation(), "TMATMUL_MX_ACC", - {dst, cIn, a, aScale, b, bScale}, rewriter); - return success(); - } -}; - -struct PTOTMatmulMXBiasToTMATMUL_MX_BIAS - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TMatmulMxBiasOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value a = peelUnrealized(adaptor.getA()); - Value aScale = 
peelUnrealized(adaptor.getAScale()); - Value b = peelUnrealized(adaptor.getB()); - Value bScale = peelUnrealized(adaptor.getBScale()); - Value bias = peelUnrealized(adaptor.getBias()); - Value dst = peelUnrealized(adaptor.getDst()); - - replaceOrEraseWithOpaqueCall(op.getOperation(), "TMATMUL_MX_BIAS", - {dst, a, aScale, b, bScale, bias}, rewriter); - return success(); - } -}; - -struct PTORowExpandDivToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRowExpandDivOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TROWEXPANDDIV", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TROWEXPANDMUL DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORowExpandMulToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRowExpandMulOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TROWEXPANDMUL", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - 
-//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TROWEXPANDSUB DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORowExpandSubToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRowExpandSubOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TROWEXPANDSUB", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TROWMAX DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORowMaxToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRowMaxOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value tmp = peelUnrealized(adaptor.getTmp()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src, tmp}; - rewriter.create( - loc, TypeRange{}, "TROWMAX", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TROWMIN DPS/memref op) 
-//===----------------------------------------------------------------------===// - -struct PTORowMinToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRowMinOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value tmp = peelUnrealized(adaptor.getTmp()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src, tmp}; - rewriter.create( - loc, TypeRange{}, "TROWMIN", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TROWSUM DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORowSumToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRowSumOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value tmp = peelUnrealized(adaptor.getTmp()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src, tmp}; - rewriter.create( - loc, TypeRange{}, "TROWSUM", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TRSQRT DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTORsqrtToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TRsqrtOp op, OpAdaptor 
adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TRSQRT", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSCATTER DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOScatterToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TScatterOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value idx = peelUnrealized(adaptor.getIndexes()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src, idx}; - rewriter.create( - loc, TypeRange{}, "TSCATTER", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSEL DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSelToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSelOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value mask = peelUnrealized(adaptor.getMask()); - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector 
operands{dst, mask, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TSEL", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSELS DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSelSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSelSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value selectMode = peelUnrealized(adaptor.getSelectMode()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1, selectMode}; - rewriter.create( - loc, TypeRange{}, "TSELS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSHL DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOShlSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TShlOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TSHL", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - 
return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSHR DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOShrSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TShrOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TSHR", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering for TSHLS/TSHRS DPS: shift by scalar) -//===----------------------------------------------------------------------===// - -struct PTOShlSConstToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TShlSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - Value dst = peelUnrealized(adaptor.getDst()); - Value src = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - SmallVector operands{dst, src, scalar}; - rewriter.create( - loc, TypeRange{}, "TSHLS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - rewriter.eraseOp(op); - return success(); - } -}; - -struct PTOShrSConstToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TShrSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override 
{ - auto loc = op.getLoc(); - Value dst = peelUnrealized(adaptor.getDst()); - Value src = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - SmallVector operands{dst, src, scalar}; - rewriter.create( - loc, TypeRange{}, "TSHRS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSORT32 DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSORT32SToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSort32Op op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - Value idx = peelUnrealized(adaptor.getIdx()); - - SmallVector operands{dst, src, idx}; - rewriter.create( - loc, TypeRange{}, "TSORT32", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSQRT DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSqrtSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSqrtOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src}; - rewriter.create( - loc, TypeRange{}, "TSQRT", - /*args=*/ArrayAttr{}, 
/*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSTORE_FP DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOStoreFPSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TStoreFPOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value fp = peelUnrealized(adaptor.getFp()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src, fp}; - rewriter.create( - loc, TypeRange{}, "TSTORE_FP", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSUB DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSubSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSubOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src0, src1}; - rewriter.create( - loc, TypeRange{}, "TSUB", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSUBC 
DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSubCSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSubCOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value src2 = peelUnrealized(adaptor.getSrc2()); - Value dst = peelUnrealized(adaptor.getDst()); - - // pto-isa does not provide NPU implementation for TSUBC yet. - // Decompose: dst = src0 - src1 + src2 - rewriter.create( - loc, TypeRange{}, "TSUB", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src0, src1}); - rewriter.create( - loc, TypeRange{}, "TADD", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, dst, src2}); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSUBS DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSubSSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSubSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src, scalar}; - rewriter.create( - loc, TypeRange{}, "TSUBS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + 
patterns.add for TSUBSC DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSubSCToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSubSCOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value scalar = peelUnrealized(adaptor.getScalar()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - // pto-isa does not provide NPU implementation for TSUBSC yet. - // Decompose: dst = src0 - scalar + src1 - rewriter.create( - loc, TypeRange{}, "TSUBS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, src0, scalar}); - rewriter.create( - loc, TypeRange{}, "TADD", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{dst, dst, src1}); - - rewriter.eraseOp(op); - return success(); - } -}; - - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TXOR DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOXORToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TXorOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src0 = peelUnrealized(adaptor.getSrc0()); - Value src1 = peelUnrealized(adaptor.getSrc1()); - Value dst = peelUnrealized(adaptor.getDst()); - - // pto-isa TXOR requires a tmp tile argument. Current NPU implementation - // does not use tmp, so we safely pass dst as tmp for compatibility. 
- SmallVector operands{dst, src0, src1, dst}; - rewriter.create( - loc, TypeRange{}, "TXOR", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -struct PTOTTransToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TTransOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value tmp = peelUnrealized(adaptor.getTmp()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, src, tmp}; - rewriter.create( - loc, TypeRange{}, "TTRANS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TXORS DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOXORSToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TXorSOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - Value scalar = peelUnrealized(adaptor.getScalar()); - Value dst = peelUnrealized(adaptor.getDst()); - - // pto-isa TXORS requires a tmp tile argument. Current NPU implementation - // does not use tmp, so we safely pass dst as tmp for compatibility. 
- SmallVector operands{dst, src, scalar, dst}; - rewriter.create( - loc, TypeRange{}, "TXORS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - struct PTOPrintToTPRINT : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TPrintOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value src = peelUnrealized(adaptor.getSrc()); - - SmallVector operands{src}; - rewriter.create( - loc, TypeRange{}, "TPRINT", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -// pto.print "format", %scalar -> PRINTF("format", scalar) -struct PTOPrintOpToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::PrintOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - - std::string fmt = op.getFormat().str(); - if (fmt.empty()) - fmt = "%f"; - std::string quoted = "\""; - for (char c : fmt) { - if (c == '"' || c == '\\') - quoted += '\\'; - else if (c == '\n') - quoted += "\\n"; - else if (c == '\t') - quoted += "\\t"; - else - quoted += c; - } - quoted += "\""; - - Value scalar = peelUnrealized(adaptor.getScalar()); - auto argsAttr = rewriter.getArrayAttr( - {emitc::OpaqueAttr::get(ctx, quoted), - IntegerAttr::get(IndexType::get(ctx), 0)}); - rewriter.create( - loc, TypeRange{}, "cce::printf", - /*args=*/argsAttr, - /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{scalar}); - - rewriter.eraseOp(op); - return success(); - } -}; - -// pto.trap -> TRAP() -struct PTOTrapOpToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TrapOp op, OpAdaptor adaptor, - 
ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - rewriter.create( - loc, TypeRange{}, "trap", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/ValueRange{}); - - rewriter.eraseOp(op); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// PTOConvert.cpp (add lowering + patterns.add for TSYNC DPS/memref op) -//===----------------------------------------------------------------------===// - -struct PTOSYNCToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite(pto::TSyncOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value events = peelUnrealized(adaptor.getEvents()); - Value dst = peelUnrealized(adaptor.getDst()); - - SmallVector operands{dst, events}; - rewriter.create( - loc, TypeRange{}, "TSYNC", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); - - rewriter.eraseOp(op); - return success(); - } -}; - -// ============================================================================= -// 2. 
BindTileOp Lowering (FIX: Trace back to physical address) -// ============================================================================= -struct PTOBindTileToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - static bool getIndexConst(Value v, int64_t &out) { - if (!v) - return false; - if (auto cst = v.getDefiningOp()) { - if (auto ia = dyn_cast(cst.getValue())) { - out = ia.getValue().getSExtValue(); - return true; - } - } - return false; - } - - LogicalResult matchAndRewrite(pto::BindTileOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto *ctx = rewriter.getContext(); - auto configAttr = op.getConfigAttr(); - auto viewSemantics = op->getAttrOfType("pto.view_semantics"); - - auto peelAllCasts = [](Value v) { - while (auto castOp = v.getDefiningOp()) - v = castOp.getOperand(0); - if (auto castOp = v.getDefiningOp()) - v = castOp.getOperand(); - return v; - }; - auto isTileLike = [](Value v) -> bool { - auto ot = dyn_cast(v.getType()); - if (!ot) - return false; - StringRef s = ot.getValue(); - return s.contains("Tile<") || s.contains("ConvTile<"); - }; - - auto buildTileValue = [&]() -> FailureOr { - auto resMrTy = dyn_cast(op.getType()); - if (!resMrTy) - return failure(); - - const char *roleTok = "TileType::Vec"; - if (auto asAttr = - dyn_cast_or_null(resMrTy.getMemorySpace())) { - switch (asAttr.getAddressSpace()) { - case pto::AddressSpace::VEC: - roleTok = "TileType::Vec"; - break; - case pto::AddressSpace::MAT: - roleTok = "TileType::Mat"; - break; - case pto::AddressSpace::LEFT: - roleTok = "TileType::Left"; - break; - case pto::AddressSpace::RIGHT: - roleTok = "TileType::Right"; - break; - case pto::AddressSpace::ACC: - roleTok = "TileType::Acc"; - break; - case pto::AddressSpace::BIAS: - roleTok = "TileType::Bias"; - break; - case pto::AddressSpace::SCALING: - roleTok = "TileType::Scaling"; - break; - case pto::AddressSpace::GM: - case 
pto::AddressSpace::Zero: - roleTok = "TileType::Vec"; - break; - } - } - - Type elemTy = resMrTy.getElementType(); - Type emitElemTy = getTypeConverter()->convertType(elemTy); - if (!emitElemTy) - return failure(); - auto emitElemOpaque = dyn_cast(emitElemTy); - if (!emitElemOpaque) - return failure(); - std::string elemTypeStr = emitElemOpaque.getValue().str(); - - if (resMrTy.getRank() < 2) - return failure(); - int64_t rows = resMrTy.getDimSize(0); - int64_t cols = resMrTy.getDimSize(1); - if (rows == ShapedType::kDynamic || cols == ShapedType::kDynamic) - return failure(); - - std::string blTok = "BLayout::RowMajor"; - if (auto blAttr = dyn_cast(configAttr.getBLayout())) { - if (static_cast(blAttr.getValue()) == 1) - blTok = "BLayout::ColMajor"; - } - - std::string slTok = "SLayout::NoneBox"; - if (auto slAttr = dyn_cast(configAttr.getSLayout())) { - int32_t slVal = static_cast(slAttr.getValue()); - slTok = (slVal == 1) ? "SLayout::RowMajor" - : (slVal == 2) ? "SLayout::ColMajor" - : "SLayout::NoneBox"; - } - - int32_t fractal = 512; - if (auto frAttr = dyn_cast(configAttr.getSFractalSize())) - fractal = frAttr.getInt(); - - std::string padTok = "PadValue::Null"; - if (auto padAttr = dyn_cast(configAttr.getPad())) { - switch (static_cast(padAttr.getValue())) { - case 1: - padTok = "PadValue::Zero"; - break; - case 2: - padTok = "PadValue::Max"; - break; - case 3: - padTok = "PadValue::Min"; - break; - default: - padTok = "PadValue::Null"; - break; - } - } - - std::string vrowTok, vcolTok; - bool useConstructor = false; - bool rowIsDynamic = false; - bool colIsDynamic = false; - SmallVector constructorArgs; - - Value vRow = op.getValidRow(); - Value vCol = op.getValidCol(); - Value vRowEmitC = adaptor.getValidRow(); - Value vColEmitC = adaptor.getValidCol(); - int64_t cRow = 0, cCol = 0; - - if (vRow && getIndexConst(vRow, cRow)) { - vrowTok = std::to_string(cRow); - } else if (vRow) { - vrowTok = "-1"; - rowIsDynamic = true; - useConstructor = true; - } else { 
- vrowTok = std::to_string(rows); - } - - if (vCol && getIndexConst(vCol, cCol)) { - vcolTok = std::to_string(cCol); - } else if (vCol) { - vcolTok = "-1"; - colIsDynamic = true; - useConstructor = true; - } else { - vcolTok = std::to_string(cols); - } - - if (useConstructor) { - if (rowIsDynamic && vRowEmitC) - constructorArgs.push_back(vRowEmitC); - if (colIsDynamic && vColEmitC) - constructorArgs.push_back(vColEmitC); - } - - std::string tileTypeStr = std::string("Tile<") + roleTok + ", " + - elemTypeStr + ", " + std::to_string(rows) + - ", " + std::to_string(cols) + ", " + blTok + - ", " + vrowTok + ", " + vcolTok + ", " + slTok + - ", " + std::to_string(fractal) + ", " + padTok + - ">"; - - auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); - if (useConstructor) { - return rewriter - .create(loc, tileType, tileTypeStr, ArrayAttr{}, - ArrayAttr{}, ValueRange(constructorArgs)) - .getResult(0); - } - - return rewriter - .create(loc, tileType, emitc::OpaqueAttr::get(ctx, "")) - .getResult(); - }; - - auto emitElemTypeToString = [&](Type elemTy) -> std::string { - if (elemTy.isF16()) - return "half"; - if (elemTy.isBF16()) - return "bfloat16_t"; - if (elemTy.isF32()) - return "float"; - if (elemTy.isF64()) - return "double"; - if (elemTy.isInteger(8)) { - if (elemTy.isSignlessInteger(8) || elemTy.isSignedInteger(8)) - return "int8_t"; - return "uint8_t"; - } - if (elemTy.isInteger(16)) { - if (elemTy.isSignlessInteger(16) || elemTy.isSignedInteger(16)) - return "int16_t"; - return "uint16_t"; - } - if (elemTy.isInteger(32)) { - if (elemTy.isSignlessInteger(32) || elemTy.isSignedInteger(32)) - return "int32_t"; - return "uint32_t"; - } - if (elemTy.isInteger(64)) { - return cast(elemTy).isUnsigned() ? 
"uint64_t" : "int64_t"; - } - return "float"; - }; - - auto buildIntegralAddress = [&](Value sourceValue) -> FailureOr { - auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); - auto rcU64 = - rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "uint64_t")}); - - Value rawPtr = sourceValue; - if (auto ot = dyn_cast(sourceValue.getType())) { - StringRef tyStr = ot.getValue(); - if (tyStr.contains("Tile<") || tyStr.contains("ConvTile<")) { - auto srcMrTy = dyn_cast(op.getSource().getType()); - if (!srcMrTy) - return failure(); - std::string elemTok = emitElemTypeToString(srcMrTy.getElementType()); - pto::AddressSpace as = pto::AddressSpace::GM; - if (auto asAttr = - dyn_cast_or_null(srcMrTy.getMemorySpace())) - as = asAttr.getAddressSpace(); - std::string rawPtrTok = - std::string(addrSpaceQualifier(as)) + " " + elemTok + "*"; - auto rawPtrTy = emitc::OpaqueType::get(ctx, rawPtrTok); - rawPtr = rewriter - .create( - loc, rawPtrTy, "PTOAS__TILE_DATA", ArrayAttr{}, - ArrayAttr{}, ValueRange{sourceValue}) - .getResult(0); - } - } - - if (isa(rawPtr.getType()) || - (isa(rawPtr.getType()) && - cast(rawPtr.getType()).getValue().ends_with("*"))) { - return rewriter - .create(loc, u64Ty, "reinterpret_cast", - ArrayAttr{}, rcU64, ValueRange{rawPtr}) - .getResult(0); - } - - if (rawPtr.getType() == u64Ty) - return rawPtr; - return rewriter.create(loc, u64Ty, rawPtr).getResult(); - }; - - Value tileCandidate = peelAllCasts(adaptor.getSource()); - if (viewSemantics && viewSemantics.getValue() == "bitcast" && - isTileLike(tileCandidate)) { - FailureOr dstTile = buildTileValue(); - if (failed(dstTile)) - return failure(); - FailureOr addr = buildIntegralAddress(tileCandidate); - if (failed(addr)) - return failure(); - - rewriter.create(loc, TypeRange{}, "TASSIGN", - ArrayAttr{}, ArrayAttr{}, - ValueRange{*dstTile, *addr}); - rewriter.replaceOp(op, *dstTile); - return success(); - } - - if (viewSemantics && viewSemantics.getValue() == "treshape" && - isTileLike(tileCandidate)) { - 
FailureOr dstTile = buildTileValue(); - if (failed(dstTile)) - return failure(); - - rewriter.create(loc, TypeRange{}, "TRESHAPE", - ArrayAttr{}, ArrayAttr{}, - ValueRange{*dstTile, tileCandidate}); - rewriter.replaceOp(op, *dstTile); - return success(); - } - - SmallVector physAddrs; - Value source = op.getSource(); - - while (auto castOp = source.getDefiningOp()) - source = castOp.getOperand(0); - - if (auto upstreamCast = source.getDefiningOp()) { - auto upstreamOperands = upstreamCast.getAddrs(); - physAddrs.append(upstreamOperands.begin(), upstreamOperands.end()); - } else { - physAddrs.push_back(adaptor.getSource()); - } - - Value vRow = op.getValidRow(); - Value vCol = op.getValidCol(); - - rewriter.replaceOpWithNewOp( - op, op.getType(), physAddrs, vRow ? vRow : Value(), - vCol ? vCol : Value(), configAttr); - - return success(); - } -}; - -// ============================================================================= -// Arith CmpI -> EmitC Cmp -// ============================================================================= -class ArithCmpIToEmitC : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(arith::CmpIOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - // 将 arith.cmpi 转换为 emitc.cmp - // 映射 Predicate: eq -> equal, slt -> less, etc. 
- emitc::CmpPredicate emitcPred; - const bool isUnsignedPred = - op.getPredicate() == arith::CmpIPredicate::ult || - op.getPredicate() == arith::CmpIPredicate::ule || - op.getPredicate() == arith::CmpIPredicate::ugt || - op.getPredicate() == arith::CmpIPredicate::uge; - switch (op.getPredicate()) { - case arith::CmpIPredicate::eq: emitcPred = emitc::CmpPredicate::eq; break; - case arith::CmpIPredicate::ne: emitcPred = emitc::CmpPredicate::ne; break; - case arith::CmpIPredicate::slt: emitcPred = emitc::CmpPredicate::lt; break; - case arith::CmpIPredicate::sle: emitcPred = emitc::CmpPredicate::le; break; - case arith::CmpIPredicate::sgt: emitcPred = emitc::CmpPredicate::gt; break; - case arith::CmpIPredicate::sge: emitcPred = emitc::CmpPredicate::ge; break; - // ... 处理无符号比较 (ult, ule 等) ... - case arith::CmpIPredicate::ult: emitcPred = emitc::CmpPredicate::lt; break; - case arith::CmpIPredicate::ule: emitcPred = emitc::CmpPredicate::le; break; - case arith::CmpIPredicate::ugt: emitcPred = emitc::CmpPredicate::gt; break; - case arith::CmpIPredicate::uge: emitcPred = emitc::CmpPredicate::ge; break; - default: return failure(); - } - - Type resTy = getTypeConverter()->convertType(op.getType()); - if (!resTy) - return failure(); - - Value lhs = adaptor.getLhs(); - Value rhs = adaptor.getRhs(); - if (isUnsignedPred) { - Type opTy = op.getLhs().getType(); - auto intTy = dyn_cast(opTy); - const bool isIndex = isa(opTy); - if (!intTy && !isIndex) - return rewriter.notifyMatchFailure( - op, "expected scalar integer or index operands"); - - const unsigned bitWidth = - intTy ? 
intTy.getWidth() : static_cast(kPTOIndexBitWidth); - if (bitWidth != 1) { - lhs = castSignlessIntToUnsignedSameWidth(rewriter, loc, lhs, bitWidth); - rhs = castSignlessIntToUnsignedSameWidth(rewriter, loc, rhs, bitWidth); - } - } - - rewriter.replaceOpWithNewOp( - op, - /*resultType=*/resTy, // i1 -> bool/i1 - emitcPred, - lhs, - rhs - ); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// Section Op Lowering -//===----------------------------------------------------------------------===// -template -struct SectionToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - std::string getMacroName() const { - if (std::is_same::value) - return "__DAV_CUBE__"; - if (std::is_same::value) - return "__DAV_VEC__"; - return "UNKNOWN_MACRO"; - } - - LogicalResult - matchAndRewrite(SectionOpTy op, typename SectionOpTy::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - - std::string startMacro = "\n#if defined(" + getMacroName() + ")"; - rewriter.create(loc, startMacro); - - if constexpr (std::is_same_v) { - // Vector mask is a global HW state and may be modified by previous kernels - // (or earlier sections). Reset it to a well-defined state for deterministic - // execution of VEC ops. - rewriter.create(loc, "set_mask_norm();"); - rewriter.create(loc, "set_vector_mask(-1, -1);"); - } - - Block &innerBlock = op.getBody().front(); - if (!innerBlock.empty()) { - rewriter.inlineBlockBefore(&innerBlock, op.getOperation(), ValueRange{}); - } - - std::string endMacro = "#endif // " + getMacroName() + "\n"; - rewriter.create(loc, endMacro); - - rewriter.eraseOp(op); - - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// SCF Control-Flow Pre-Lowering -// -// EmitC translation supports `emitc.for`/`emitc.if` plus CFG-style -// `cf.br`/`cf.cond_br`. 
Upstream SCFToEmitC patterns only cover `scf.for` and -// `scf.if`, so we pre-lower some SCF ops into those supported forms. -//===----------------------------------------------------------------------===// - -namespace { - -static bool isTriviallyInlineableExecuteRegion(scf::ExecuteRegionOp op) { - Region &r = op.getRegion(); - if (!r.hasOneBlock()) - return false; - Block &b = r.front(); - return isa_and_nonnull(b.getTerminator()); -} - -static bool needsWholeFunctionSCFToCF(func::FuncOp func) { - bool needs = false; - func.walk([&](Operation *op) { - if (!isa(op)) - return WalkResult::advance(); - Operation *parentOp = op->getParentOp(); - - // `scf.execute_region` can legally appear in single-block parents. Only - // require whole-function SCFToCF if we need to lower it into CFG blocks - // (multi-block region / non-trivial terminators). - if (auto exec = dyn_cast(op)) { - if (parentOp && parentOp->hasTrait() && - !isTriviallyInlineableExecuteRegion(exec)) { - needs = true; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - } - - if (parentOp && parentOp->hasTrait()) { - needs = true; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - return needs; -} - -// scf.execute_region is semantically just an inlined region producing results -// via scf.yield. Inline it to the parent block to avoid extra lowering needs. -struct SCFExecuteRegionInline - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(scf::ExecuteRegionOp op, - PatternRewriter &rewriter) const override { - if (op.getRegion().empty()) - return rewriter.notifyMatchFailure(op, "expected non-empty region"); - - Block &innerBlock = op.getRegion().front(); - auto yield = dyn_cast(innerBlock.getTerminator()); - if (!yield) - return rewriter.notifyMatchFailure(op, "expected scf.yield terminator"); - - // Move the body operations before the execute_region op. 
- rewriter.inlineBlockBefore(&innerBlock, op.getOperation(), ValueRange{}); - - // Replace execute_region results with yielded values, then erase the yield. - rewriter.replaceOp(op, yield.getOperands()); - rewriter.eraseOp(yield); - return success(); - } -}; - -// Lower scf.execute_region into CFG blocks with cf.br/cf.cond_br by inlining the -// region blocks into the parent region and rewriting scf.yield to branch into a -// continuation block carrying results. -// -// Note: This requires the parent region to allow multiple blocks (e.g. the -// function body CFG region). For execute_region nested in single-block regions -// (scf.for/scf.if), run SCFToCF first to eliminate the single-block constraint. -struct SCFExecuteRegionToCF : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(scf::ExecuteRegionOp op, - PatternRewriter &rewriter) const override { - if (isTriviallyInlineableExecuteRegion(op)) - return rewriter.notifyMatchFailure(op, "trivially inlineable"); - - Operation *parentOp = op->getParentOp(); - if (parentOp && parentOp->hasTrait()) { - return rewriter.notifyMatchFailure( - op, "cannot lower scf.execute_region inside a single-block parent region"); - } - - if (op.getRegion().empty()) - return rewriter.notifyMatchFailure(op, "expected non-empty region"); - - Location loc = op.getLoc(); - Block *curBlock = op->getBlock(); - Region *parentRegion = curBlock->getParent(); - - // Split the parent block so we can branch to a continuation block with phi - // arguments for the execute_region results. 
- auto execIt = Block::iterator(op.getOperation()); - Block *continueBlock = rewriter.splitBlock(curBlock, std::next(execIt)); - - SmallVector contArgs; - contArgs.reserve(op.getNumResults()); - for (Type t : op.getResultTypes()) - contArgs.push_back(continueBlock->addArgument(t, loc)); - - for (auto it : llvm::enumerate(op.getResults())) - it.value().replaceAllUsesWith(contArgs[it.index()]); - - // Capture blocks before moving the region. - SmallVector movedBlocks; - movedBlocks.reserve(op.getRegion().getBlocks().size()); - for (Block &b : op.getRegion()) - movedBlocks.push_back(&b); - Block *entryBlock = &op.getRegion().front(); - - // Inline the execute_region blocks into the parent region right before the - // continuation block. - rewriter.inlineRegionBefore(op.getRegion(), *parentRegion, - continueBlock->getIterator()); - - // Replace all scf.yield terminators with a branch to the continuation. - for (Block *b : movedBlocks) { - auto yield = dyn_cast(b->getTerminator()); - if (!yield) - continue; - rewriter.setInsertionPoint(yield); - rewriter.create(loc, continueBlock, yield.getOperands()); - rewriter.eraseOp(yield); - } - - // Replace execute_region itself with a branch to the inlined entry block. - rewriter.setInsertionPoint(op); - rewriter.create(loc, entryBlock, ValueRange{}); - rewriter.eraseOp(op); - return success(); - } -}; - -// Lower scf.index_switch into CFG blocks with cf.cond_br/cf.br so that we can -// avoid `scf.if` result materialization quirks (and avoid relying on cf.switch, -// which is not supported by EmitC C++ translation). 
-struct SCFIndexSwitchToCF : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - static LogicalResult cloneYieldingBlockAndBranchTo( - PatternRewriter &rewriter, Location loc, Block &srcBlock, Block *destBlock, - Block *continueBlock) { - rewriter.setInsertionPointToEnd(destBlock); - - IRMapping mapping; - for (Operation &inner : srcBlock.without_terminator()) - rewriter.clone(inner, mapping); - - auto yield = dyn_cast(srcBlock.getTerminator()); - if (!yield) - return failure(); - - SmallVector yieldOperands; - yieldOperands.reserve(yield.getNumOperands()); - for (Value v : yield.getOperands()) - yieldOperands.push_back(mapping.lookupOrDefault(v)); - - rewriter.create(loc, continueBlock, yieldOperands); - return success(); - } - - LogicalResult matchAndRewrite(scf::IndexSwitchOp op, - PatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - Operation *parentOp = op->getParentOp(); - if (parentOp && parentOp->hasTrait()) { - return rewriter.notifyMatchFailure( - op, "cannot lower scf.index_switch inside a single-block parent region"); - } - - Block *curBlock = op->getBlock(); - Region *parentRegion = curBlock->getParent(); - - // Split the parent block so we can branch to a continuation block with phi - // arguments for the switch results. - auto switchIt = Block::iterator(op.getOperation()); - Block *continueBlock = rewriter.splitBlock(curBlock, std::next(switchIt)); - - SmallVector contArgs; - contArgs.reserve(op.getNumResults()); - for (Type t : op.getResultTypes()) - contArgs.push_back(continueBlock->addArgument(t, loc)); - - for (auto it : llvm::enumerate(op.getResults())) - it.value().replaceAllUsesWith(contArgs[it.index()]); - - unsigned numCases = op.getCases().size(); - auto insertPt = continueBlock->getIterator(); - - SmallVector checkBlocks; - SmallVector caseBlocks; - checkBlocks.reserve(numCases); - caseBlocks.reserve(numCases); - - // Create check blocks for each case: check_i compares selector to case_i. 
- for (unsigned i = 0; i < numCases; ++i) - checkBlocks.push_back(rewriter.createBlock(parentRegion, insertPt)); - - // Create one block for default and one block per case to execute the body. - Block *defaultBlock = rewriter.createBlock(parentRegion, insertPt); - for (unsigned i = 0; i < numCases; ++i) - caseBlocks.push_back(rewriter.createBlock(parentRegion, insertPt)); - - Value selector = op.getArg(); - auto cases = op.getCases(); - - // Fill check blocks with chained comparisons. - for (unsigned i = 0; i < numCases; ++i) { - rewriter.setInsertionPointToEnd(checkBlocks[i]); - Value caseVal = rewriter.create(loc, cases[i]); - Value cond = rewriter.create( - loc, arith::CmpIPredicate::eq, selector, caseVal); - Block *falseDest = (i + 1 < numCases) ? checkBlocks[i + 1] : defaultBlock; - rewriter.create(loc, cond, caseBlocks[i], ValueRange{}, - falseDest, ValueRange{}); - } - - // Fill case blocks and default block with cloned bodies + branch to cont. - for (unsigned i = 0; i < numCases; ++i) { - if (failed(cloneYieldingBlockAndBranchTo( - rewriter, loc, op.getCaseBlock(i), caseBlocks[i], continueBlock))) - return rewriter.notifyMatchFailure(op, "expected scf.yield terminator"); - } - if (failed(cloneYieldingBlockAndBranchTo(rewriter, loc, op.getDefaultBlock(), - defaultBlock, continueBlock))) - return rewriter.notifyMatchFailure(op, "expected scf.yield terminator"); - - // Replace the original switch op with a branch into the check chain. - Block *entryDest = numCases ? checkBlocks[0] : defaultBlock; - rewriter.setInsertionPointAfter(op); - rewriter.create(loc, entryDest, ValueRange{}); - rewriter.eraseOp(op); - return success(); - } -}; - -// Lower scf.while into CFG blocks with cf.br/cf.cond_br. -// -// Note: This requires the parent region to allow multiple blocks. In -// particular, scf.if/scf.for regions are single-block and cannot contain this -// lowering. 
-struct SCFWhileToCF : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(scf::WhileOp op, - PatternRewriter &rewriter) const override { - Operation *parentOp = op->getParentOp(); - if (parentOp && parentOp->hasTrait()) { - return rewriter.notifyMatchFailure( - op, "cannot lower scf.while inside a single-block parent region"); - } - - Block *curBlock = op->getBlock(); - - // Only support the common structured form where the while results are used - // in the same block after the op. - for (Value res : op.getResults()) { - for (auto &use : res.getUses()) { - if (use.getOwner()->getBlock() != curBlock) - return rewriter.notifyMatchFailure( - op, "unsupported: while results used outside the parent block"); - } - } - - auto loc = op.getLoc(); - auto whileIt = Block::iterator(op.getOperation()); - Block *afterWhileBlock = rewriter.splitBlock(curBlock, std::next(whileIt)); - - // Add block args to carry while results into the continuation block. - SmallVector exitArgs; - exitArgs.reserve(op.getNumResults()); - for (Type t : op.getResultTypes()) - exitArgs.push_back(afterWhileBlock->addArgument(t, loc)); - - for (auto it : llvm::enumerate(op.getResults())) - it.value().replaceAllUsesWith(exitArgs[it.index()]); - - // Create the CFG blocks before the continuation block. - Region *parentRegion = curBlock->getParent(); - auto insertPt = afterWhileBlock->getIterator(); - - // Header block arguments match the while init operands. - SmallVector headerArgTypes; - for (Value v : op.getInits()) - headerArgTypes.push_back(v.getType()); - SmallVector headerArgLocs(headerArgTypes.size(), loc); - Block *headerBlock = - rewriter.createBlock(parentRegion, insertPt, headerArgTypes, - headerArgLocs); - - // Body block arguments match the "after" region arguments. 
- Block &afterRegionBlock = op.getAfter().front(); - SmallVector bodyArgTypes(afterRegionBlock.getArgumentTypes().begin(), - afterRegionBlock.getArgumentTypes().end()); - SmallVector bodyArgLocs(bodyArgTypes.size(), loc); - insertPt = afterWhileBlock->getIterator(); - Block *bodyBlock = - rewriter.createBlock(parentRegion, insertPt, bodyArgTypes, bodyArgLocs); - - // Move the before/after region bodies into the new CFG blocks. - rewriter.mergeBlocks(&op.getBefore().front(), headerBlock, - headerBlock->getArguments()); - rewriter.mergeBlocks(&afterRegionBlock, bodyBlock, bodyBlock->getArguments()); - - // Replace scf.condition in the header with cf.cond_br. - { - auto condOp = cast(headerBlock->getTerminator()); - rewriter.setInsertionPoint(condOp); - rewriter.create(loc, condOp.getCondition(), - /*trueDest=*/bodyBlock, - /*trueOperands=*/condOp.getArgs(), - /*falseDest=*/afterWhileBlock, - /*falseOperands=*/condOp.getArgs()); - rewriter.eraseOp(condOp); - } - - // Replace scf.yield in the body with cf.br back to the header. - { - auto yieldOp = cast(bodyBlock->getTerminator()); - rewriter.setInsertionPoint(yieldOp); - rewriter.create(loc, headerBlock, yieldOp.getOperands()); - rewriter.eraseOp(yieldOp); - } - - // Replace scf.while itself with a branch to the header. - rewriter.setInsertionPoint(op); - rewriter.create(loc, headerBlock, op.getInits()); - rewriter.eraseOp(op); - return success(); - } -}; - -// Lower cf.switch into chained comparisons and cf.cond_br/cf.br. -// -// EmitC C++ translation currently supports cf.br/cf.cond_br, but not cf.switch. 
-struct CFSwitchToCondBr : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(cf::SwitchOp op, - PatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - Operation *parentOp = op->getParentOp(); - if (parentOp && parentOp->hasTrait()) { - return rewriter.notifyMatchFailure( - op, "cannot lower cf.switch inside a single-block parent region"); - } - - Block *curBlock = op->getBlock(); - Region *parentRegion = curBlock->getParent(); - - Value flag = op.getFlag(); - auto flagTy = dyn_cast(flag.getType()); - if (!flagTy) - return rewriter.notifyMatchFailure(op, "expected integer switch flag"); - - SmallVector defaultOperands(op.getDefaultOperands().begin(), - op.getDefaultOperands().end()); - Block *defaultDest = op.getDefaultDestination(); - - SmallVector caseDests(op.getCaseDestinations().begin(), - op.getCaseDestinations().end()); - SmallVector> caseOperands; - caseOperands.reserve(caseDests.size()); - for (auto range : op.getCaseOperands()) - caseOperands.emplace_back(range.begin(), range.end()); - - if (caseDests.empty()) { - rewriter.replaceOpWithNewOp(op, defaultDest, defaultOperands); - return success(); - } - - std::optional caseValuesAttr = op.getCaseValues(); - if (!caseValuesAttr) - return rewriter.notifyMatchFailure(op, "missing case_values"); - - SmallVector caseValues; - for (APInt v : caseValuesAttr->getValues()) - caseValues.push_back(v); - - if (caseValues.size() != caseDests.size()) - return rewriter.notifyMatchFailure(op, "case_values/destinations mismatch"); - if (caseOperands.size() != caseDests.size()) - return rewriter.notifyMatchFailure(op, "case_operands/destinations mismatch"); - - // Insert check blocks right after the current block. 
- auto insertPt = std::next(curBlock->getIterator()); - SmallVector checkBlocks; - checkBlocks.reserve(caseDests.size()); - for (size_t i = 0; i < caseDests.size(); ++i) - checkBlocks.push_back(rewriter.createBlock(parentRegion, insertPt)); - - // Fill each check block with: - // if (flag == caseVal_i) goto caseDest_i else goto nextCheck/default. - for (size_t i = 0; i < caseDests.size(); ++i) { - rewriter.setInsertionPointToEnd(checkBlocks[i]); - - APInt caseVal = caseValues[i]; - if (caseVal.getBitWidth() != flagTy.getWidth()) { - return rewriter.notifyMatchFailure( - op, "case value bitwidth doesn't match flag type"); - } - - Value caseConst = rewriter.create( - loc, flagTy, rewriter.getIntegerAttr(flagTy, caseVal)); - Value cond = rewriter.create( - loc, arith::CmpIPredicate::eq, flag, caseConst); - - Block *falseDest = - (i + 1 < checkBlocks.size()) ? checkBlocks[i + 1] : defaultDest; - ValueRange falseOperands = - (i + 1 < checkBlocks.size()) ? ValueRange{} : ValueRange(defaultOperands); - - rewriter.create(loc, cond, - /*trueDest=*/caseDests[i], - /*trueOperands=*/caseOperands[i], - /*falseDest=*/falseDest, - /*falseOperands=*/falseOperands); - } - - // Replace the switch terminator with a branch into the first check block. 
- rewriter.setInsertionPoint(op); - rewriter.replaceOpWithNewOp(op, checkBlocks.front(), - ValueRange{}); - return success(); - } -}; - -} // namespace - -static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, - TypeConverter &typeConverter, - MLIRContext *ctx, - DataFlowSolver &solver, - PTOArch targetArch) { - (void)solver; - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - 
patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add>( - typeConverter, ctx); - patterns.add>( - typeConverter, ctx); - patterns.add>( - typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add>(typeConverter, - ctx); - 
patterns.add>(typeConverter, - ctx); - patterns.add>(typeConverter, - ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add>(typeConverter, ctx); - patterns.add>(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add>(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add>(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx, targetArch); - patterns.add(typeConverter, ctx, targetArch); - patterns.add>(typeConverter, ctx); - patterns.add>(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); - patterns.add< - PTOTMatmulBiasToTMATMUL_BIAS, - PTOTMatmulMXToTMATMUL_MX, - PTOTMatmulMXAccToTMATMUL_MX_ACC, - PTOTMatmulMXBiasToTMATMUL_MX_BIAS, - 
PTOTMatmulBiasToTMATMUL_BIAS, - PTOTMatmulMXToTMATMUL_MX, - PTOTMatmulMXAccToTMATMUL_MX_ACC, - PTOTMatmulMXBiasToTMATMUL_MX_BIAS, - PTOTGemvBiasToTGEMV_BIAS, - PTOBarrierToEmitC - >(typeConverter, ctx); - - patterns.add(typeConverter, ctx); - - populateSCFToEmitCConversionPatterns(patterns); - // Keep CFG-style branches type-consistent when block argument types are - // converted (e.g. after lowering scf.while to cf.br/cf.cond_br). - populateBranchOpInterfaceTypeConversionPattern(patterns, typeConverter); - populateCallOpTypeConversionPattern(patterns, typeConverter); -} - -//===----------------------------------------------------------------------===// -// Pass -//===----------------------------------------------------------------------===// - -namespace { -struct EmitPTOManualPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(EmitPTOManualPass) - - PTOArch targetArch; - - EmitPTOManualPass() : targetArch(PTOArch::A3) {} - - explicit EmitPTOManualPass(PTOArch arch) : targetArch(arch) {} - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - void runOnOperation() override { - llvm::errs() << "DEBUG: Start PTOToEmitC Pass\n"; - MLIRContext *ctx = &getContext(); - ModuleOp mop = getOperation(); - - // 1. 插入头文件 - auto loc = mop->getLoc(); - OpBuilder builder(ctx); - builder.setInsertionPointToStart(mop.getBody()); - builder.create( - loc, builder.getStringAttr("pto/pto-inst.hpp"), /*isAngled=*/nullptr); - builder.create( - loc, builder.getStringAttr("using namespace pto;")); - - // Only inject the bitcast helper when we actually lower ops that need it - // (e.g. arith.bitcast or arith.maximumf/minimumf tie-breaking on zeros). 
- bool needsBitcastHelper = false; - mop.walk([&](Operation *op) { - if (isa(op)) { - needsBitcastHelper = true; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (needsBitcastHelper) { - builder.create( - loc, builder.getStringAttr(R"cpp( - template - static inline To ptoas_bitcast(From from) { - static_assert(sizeof(To) == sizeof(From), "ptoas_bitcast: size mismatch"); - To to; - __builtin_memcpy(&to, &from, sizeof(To)); - return to; - } - )cpp")); - } - - // 1.5 Pre-lower SCF constructs not handled by SCFToEmitC. - { - // scf.while / scf.index_switch are lowered via CFG blocks. This is not - // possible inside ops that require single-block regions (e.g. scf.for / - // scf.if). If we see such nesting, lower the entire function to the - // ControlFlow dialect first. - bool needsAnySCFToCF = false; - for (auto func : mop.getOps()) { - if (needsWholeFunctionSCFToCF(func)) { - needsAnySCFToCF = true; - break; - } - } - if (needsAnySCFToCF) { - RewritePatternSet scfToCfPatterns(ctx); - populateSCFToControlFlowConversionPatterns(scfToCfPatterns); - FrozenRewritePatternSet frozenSCFToCF(std::move(scfToCfPatterns)); - - ConversionTarget scfToCfTarget(*ctx); - // Only eliminate the single-block SCF constructs; we'll pre-lower - // scf.while/index_switch/execute_region ourselves afterwards. 
- scfToCfTarget.addIllegalOp(); - scfToCfTarget.markUnknownOpDynamicallyLegal( - [](Operation *) { return true; }); - - for (auto func : mop.getOps()) { - if (!needsWholeFunctionSCFToCF(func)) - continue; - if (failed(applyPartialConversion(func, scfToCfTarget, - frozenSCFToCF))) { - func.emitError() - << "failed to lower nested SCF to ControlFlow (SCFToCF)"; - return signalPassFailure(); - } - } - } - - RewritePatternSet scfLoweringPatterns(ctx); - scfLoweringPatterns.add(ctx); - (void)applyPatternsAndFoldGreedily(mop, std::move(scfLoweringPatterns)); - - bool hasUnsupportedSCF = false; - mop.walk([&](Operation *op) { - if (isa(op)) { - hasUnsupportedSCF = true; - op->emitError() << "Unsupported SCF op remained after pre-lowering"; - return WalkResult::interrupt(); - } - if (isa(op)) { - hasUnsupportedSCF = true; - op->emitError() - << "Unsupported CF op remained after pre-lowering: cf.switch"; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (hasUnsupportedSCF) - return signalPassFailure(); - } - - // 2. 配置转换目标 - PTOToEmitCTypeConverter typeConverter(ctx); - ConversionTarget target(*ctx); - - target.addIllegalDialect(); - target.addIllegalDialect(); - target.addIllegalDialect(); - target.addIllegalDialect(); - - // If we introduced CFG branches (e.g. from scf.while), make sure they are - // updated to use legalized operand types. 
- target.addDynamicallyLegalOp( - [&](Operation *op) { - return isLegalForBranchOpInterfaceTypeConversionPattern(op, - typeConverter); - }); - - // [关键] 允许 Cast 存在,最后统一清理 - target.addLegalOp(); - - target.addIllegalOp(); - target.addIllegalOp(); - target.addIllegalOp(); - - target.addLegalDialect(); - target.addLegalOp(); - - auto solver = std::make_unique(); - solver->load(); - solver->load(); - if (failed(solver->initializeAndRun(getOperation()))) - return signalPassFailure(); - - RewritePatternSet patterns(ctx); - populatePTOToEmitCPatterns(patterns, typeConverter, ctx, *solver, targetArch); - populateCallOpTypeConversionPattern(patterns, typeConverter); - - // 3. 执行转换 - if (failed(applyPartialConversion(mop, target, std::move(patterns)))) { - llvm::errs() << "Conversion FAILED! Rolling back executed.\n"; - return signalPassFailure(); - } - - // ========================================================================= - // 4. [终极清理] - // 顺序至关重要: - // Step A: 先移除所有 Cast,让 Loop 的 Operand 类型变成底层类型 (如 int32) - // Step B: 再根据新的 Operand 类型,修复 Loop IV 的类型 - // ========================================================================= - - // --- Step A: 清理 UnrealizedConversionCastOp --- - // Prefer dropping redundant/unused casts; otherwise lower to emitc.cast - // so the C++ emitter can print it. 
- llvm::SmallVector castsToErase; - bool castCleanupFailed = false; - mop.walk([&](UnrealizedConversionCastOp cast) { - if (castCleanupFailed) - return; - - if (cast->getNumOperands() != 1 || cast->getNumResults() != 1) { - cast.emitError() << "unsupported unrealized_conversion_cast shape"; - castCleanupFailed = true; - return; - } - - Value input = cast.getOperand(0); - Value output = cast.getResult(0); - Type inTy = input.getType(); - Type outTy = output.getType(); - - if (output.use_empty()) { - castsToErase.push_back(cast); - return; - } - - if (inTy == outTy) { - output.replaceAllUsesWith(input); - castsToErase.push_back(cast); - return; - } - - if (emitc::isSupportedEmitCType(inTy) && emitc::isSupportedEmitCType(outTy)) { - OpBuilder builder(cast); - auto c = builder.create(cast.getLoc(), outTy, input); - output.replaceAllUsesWith(c.getResult()); - castsToErase.push_back(cast); - return; - } - - cast.emitError() << "cannot lower unrealized_conversion_cast(" << inTy - << " -> " << outTy << ") to emitc.cast"; - castCleanupFailed = true; - }); - - for (auto cast : castsToErase) - cast.erase(); - - if (castCleanupFailed) - return signalPassFailure(); - - // --- Step A2: Sink casts of emitc.variable "reads" to their use sites --- - // - // SCFToEmitC lowers scf.if/scf.for results via mutable `emitc.variable` and - // `emitc.assign`. During type conversion, casts from the variable handle to - // the converted type may be materialized right after the variable - // declaration, effectively snapshotting the value *before* assignments. That - // produces wrong C++ (use-before-init / stale reads). - // - // Fix by re-materializing the cast at each use site so it reads the variable - // at the point of use. 
- { - SmallVector castOpsToSink; - mop.walk([&](emitc::CastOp castOp) { - if (castOp.getSource().getDefiningOp()) - castOpsToSink.push_back(castOp); - }); - - for (emitc::CastOp castOp : castOpsToSink) { - Value src = castOp.getSource(); - Type dstTy = castOp.getResult().getType(); - Value oldRes = castOp.getResult(); - - // Replace each use with a freshly inserted cast right before the user. - for (OpOperand &use : llvm::make_early_inc_range(oldRes.getUses())) { - Operation *user = use.getOwner(); - OpBuilder b(user); - b.setInsertionPoint(user); - auto newCast = b.create(castOp.getLoc(), dstTy, src); - use.set(newCast.getResult()); - } - - castOp.erase(); - } - } - - // --- Step B: 修复 Loop 归纳变量 (IV) --- - // 此时 emitc.for 的 operand 已经是 int32 了,我们检查 IV 是否匹配,不匹配则修正 - mop.walk([&](emitc::ForOp forOp) { - Type boundTy = forOp.getLowerBound().getType(); - BlockArgument iv = forOp.getBody()->getArgument(0); - - if (iv.getType() != boundTy) { - iv.setType(boundTy); // 强制将 IV 类型 (index) 修改为与边界一致 (int32) - } - }); - - // --- Step C: 消除冗余 Tile 变量 (Dead Code Elimination) [新增] --- - // 逻辑:如果一个 emitc.variable 没有被读取(use_empty), - // 那么它自己,以及给它赋值的 TASSIGN 都可以删除。 - // 注意:TASSIGN(v15, v9) 会把 v15 作为 Operand 0 使用,所以 v15 不是严格的 use_empty。 - // 我们需要检查:v15 是否除了 TASSIGN 之外没有其他 User。 - - llvm::SmallVector deadVars; - mop.walk([&](emitc::VariableOp varOp) { - // 检查该变量的所有 User - bool isRead = false; - for (Operation* user : varOp.getResult().getUsers()) { - // 如果 User 是 TASSIGN 且变量是第0个参数(dst),不算"读取" - if (auto call = dyn_cast(user)) { - if (call.getCallee() == "TASSIGN" && call.getOperand(0) == varOp.getResult()) { - continue; // 这是一个赋值操作,不算有效使用 - } - } - // 如果还有其他用途(如 TLOAD, TMOV, TMATMUL),则该变量有用 - isRead = true; - break; - } - - if (!isRead) { - deadVars.push_back(varOp); - } - }); - - for (auto varOp : deadVars) { - // 1. 
先删除所有使用该变量的 TASSIGN - llvm::SmallVector usersToErase; - for (Operation* user : varOp.getResult().getUsers()) { - // 我们上面已经确认过,剩下的 user 只能是 TASSIGN - usersToErase.push_back(user); - } - for (auto u : usersToErase) u->erase(); - - // 2. 删除变量定义本身 - varOp.erase(); - } - - // ========================================================================= - } - }; -} // namespace - -std::unique_ptr mlir::pto::createEmitPTOManualPass() { - return std::make_unique(); -} - -std::unique_ptr mlir::pto::createEmitPTOManualPass(PTOArch arch) { - return std::make_unique(arch); -} diff --git a/.agent/skills/translate_cpp2py/references/ptoas_source/README.md b/.agent/skills/translate_cpp2py/references/ptoas_source/README.md deleted file mode 100644 index e4690280..00000000 --- a/.agent/skills/translate_cpp2py/references/ptoas_source/README.md +++ /dev/null @@ -1,4 +0,0 @@ -Copy critical references -- https://github.com/huawei-csl/PTOAS/blob/20260309/include/PTO/IR/PTOOps.td -- https://github.com/huawei-csl/PTOAS/blob/20260309/python/pto/dialects/pto.py -- https://github.com/huawei-csl/PTOAS/blob/20260309/lib/PTO/Transforms/PTOToEmitC.cpp diff --git a/.agent/skills/translate_cpp2py/references/ptoas_source/pto.py b/.agent/skills/translate_cpp2py/references/ptoas_source/pto.py deleted file mode 100644 index 68e85bf5..00000000 --- a/.agent/skills/translate_cpp2py/references/ptoas_source/pto.py +++ /dev/null @@ -1,280 +0,0 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from . 
import _pto_ops_gen as _pto_ops_gen -from ._pto_ops_gen import * -from mlir import ir as _ods_ir - - -def _load_local_pto_ext(): - import importlib.util - from pathlib import Path - - lib_dir = Path(__file__).resolve().parent.parent / "_mlir_libs" - for suffix in ("*.so", "*.pyd", "*.dll", "*.dylib"): - for so_path in lib_dir.glob(f"_pto{suffix}"): - spec = importlib.util.spec_from_file_location( - "mlir._mlir_libs._pto", so_path - ) - if spec and spec.loader: - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - return mod - raise ImportError("cannot locate local _pto extension in _mlir_libs") - - -try: - _pto_mod = _load_local_pto_ext() -except Exception: - from .._mlir_libs import _pto as _pto_mod - -register_dialect = _pto_mod.register_dialect -PtrType = _pto_mod.PtrType -TensorViewType = _pto_mod.TensorViewType -PartitionTensorViewType = _pto_mod.PartitionTensorViewType -TileType = _pto_mod.TileType -TileBufType = _pto_mod.TileBufType -AddressSpace = _pto_mod.AddressSpace -AddressSpaceAttr = _pto_mod.AddressSpaceAttr -TileBufConfigAttr = _pto_mod.TileBufConfigAttr -BLayout = _pto_mod.BLayout -BLayoutAttr = _pto_mod.BLayoutAttr -SLayout = _pto_mod.SLayout -SLayoutAttr = _pto_mod.SLayoutAttr -PadValue = _pto_mod.PadValue -PadValueAttr = _pto_mod.PadValueAttr -RoundMode = _pto_mod.RoundMode -RoundModeAttr = _pto_mod.RoundModeAttr -CmpMode = _pto_mod.CmpMode -CmpModeAttr = _pto_mod.CmpModeAttr -PIPE = _pto_mod.PIPE -PipeAttr = _pto_mod.PipeAttr -Layout = _pto_mod.Layout -LayoutAttr = _pto_mod.LayoutAttr -SyncOpType = _pto_mod.SyncOpType -SyncOpTypeAttr = _pto_mod.SyncOpTypeAttr -EVENT = _pto_mod.EVENT -EventAttr = _pto_mod.EventAttr -MaskPattern = _pto_mod.MaskPattern -MaskPatternAttr = _pto_mod.MaskPatternAttr - -__all__ = [ - # Dialect utilities - "register_dialect", - # Types - "PtrType", - "TensorViewType", - "PartitionTensorViewType", - "TileType", - "TileBufType", - "AddressSpace", - "AddressSpaceAttr", - "BLayout", - "BLayoutAttr", 
- "SLayout", - "SLayoutAttr", - "PadValue", - "PadValueAttr", - "RoundMode", - "RoundModeAttr", - "CmpMode", - "CmpModeAttr", - "PIPE", - "PipeAttr", - "Layout", - "LayoutAttr", - "SyncOpType", - "SyncOpTypeAttr", - "EVENT", - "EventAttr", - "MaskPattern", - "MaskPatternAttr", - "TileBufConfigAttr", - "TileConfig", - # High-level sync helpers - "record_event", - "wait_event", - "barrier", - # Scalar pointer helpers - "load_scalar", - "store_scalar" - # Aliases for SyncOpType enums (for terse calls) - , - "TLOAD", - "TSTORE_ACC", - "TSTORE_VEC", - "TMOV_M2L", - "TMOV_M2S", - "TMOV_M2B", - "TMOV_M2V", - "TMOV_V2M", - "TMATMUL", - "TVEC", - "TVECWAIT_EVENT" - # Aliases for EVENT enums - , - "EVENT_ID0", - "EVENT_ID1", - "EVENT_ID2", - "EVENT_ID3", - "EVENT_ID4", - "EVENT_ID5", - "EVENT_ID6", - "EVENT_ID7", -] - -# ----------------------------------------------------------------------------- -# Convenience wrappers for high-level sync to allow passing enums directly -# ----------------------------------------------------------------------------- - - -def _ensure_sync_attr(val, ctx): - # Accept SyncOpType enum, SyncOpTypeAttr, or string name ("TMATMUL"/"tmatmul"). 
- if isinstance(val, SyncOpType): - return SyncOpTypeAttr.get(val, ctx) - if isinstance(val, str): - name = val.upper() - try: - enum_val = getattr(SyncOpType, name) - except AttributeError: - raise ValueError(f"Unknown SyncOpType name: {val}") - return SyncOpTypeAttr.get(enum_val, ctx) - return val - - -def _ensure_event_attr(val, ctx): - if isinstance(val, EVENT): - return EventAttr.get(val, ctx) - if isinstance(val, str): - name = val.upper() - try: - enum_val = getattr(EVENT, name) - except AttributeError: - raise ValueError(f"Unknown EVENT name: {val}") - return EventAttr.get(enum_val, ctx) - return val - - -def record_event(src_op, dst_op, event_id, *, loc=None, ip=None): - ctx = loc.context if loc else _ods_ir.Context.current - return _pto_ops_gen.record_event( - _ensure_sync_attr(src_op, ctx), - _ensure_sync_attr(dst_op, ctx), - _ensure_event_attr(event_id, ctx), - loc=loc, - ip=ip, - ) - - -def wait_event(src_op, dst_op, event_id, *, loc=None, ip=None): - ctx = loc.context if loc else _ods_ir.Context.current - return _pto_ops_gen.wait_event( - _ensure_sync_attr(src_op, ctx), - _ensure_sync_attr(dst_op, ctx), - _ensure_event_attr(event_id, ctx), - loc=loc, - ip=ip, - ) - - -def barrier(op, *, loc=None, ip=None): - ctx = loc.context if loc else _ods_ir.Context.current - # If user passes SyncOpType/Attr, route to barrier_sync (maps to PIPE) - if isinstance(op, (SyncOpType, SyncOpTypeAttr, str)): - op_attr = _ensure_sync_attr(op, ctx) - return _pto_ops_gen.barrier_sync(op_attr, loc=loc, ip=ip) - # Otherwise fall back to low-level barrier expecting PipeAttr - return _pto_ops_gen.barrier(op, loc=loc, ip=ip) - - -# ----------------------------------------------------------------------------- -# Scalar pointer helpers (manual wrappers until python ops are regenerated) -# ----------------------------------------------------------------------------- -def load_scalar(result_type, ptr, offset, *, loc=None, ip=None): - operands = [ - 
_pto_ops_gen._get_op_result_or_value(ptr), - _pto_ops_gen._get_op_result_or_value(offset), - ] - op = _ods_ir.Operation.create( - "pto.load_scalar", - results=[result_type], - operands=operands, - loc=loc, - ip=ip, - ) - return op.results[0] - - -def store_scalar(ptr, offset, value, *, loc=None, ip=None): - operands = [ - _pto_ops_gen._get_op_result_or_value(ptr), - _pto_ops_gen._get_op_result_or_value(offset), - _pto_ops_gen._get_op_result_or_value(value), - ] - return _ods_ir.Operation.create( - "pto.store_scalar", - operands=operands, - loc=loc, - ip=ip, - ) - - -# ----------------------------------------------------------------------------- -# Export enum aliases for terse calls: pto.record_event(TLOAD, TLOAD, EVENT_ID0) -# ----------------------------------------------------------------------------- -TLOAD = SyncOpType.TLOAD -TSTORE_ACC = SyncOpType.TSTORE_ACC -TSTORE_VEC = SyncOpType.TSTORE_VEC -TMOV_M2L = SyncOpType.TMOV_M2L -TMOV_M2S = SyncOpType.TMOV_M2S -TMOV_M2B = SyncOpType.TMOV_M2B -TMOV_M2V = SyncOpType.TMOV_M2V -TMOV_V2M = SyncOpType.TMOV_V2M -TMATMUL = SyncOpType.TMATMUL -TVEC = SyncOpType.TVEC -TVECWAIT_EVENT = SyncOpType.TVECWAIT_EVENT - -EVENT_ID0 = EVENT.EVENT_ID0 -EVENT_ID1 = EVENT.EVENT_ID1 -EVENT_ID2 = EVENT.EVENT_ID2 -EVENT_ID3 = EVENT.EVENT_ID3 -EVENT_ID4 = EVENT.EVENT_ID4 -EVENT_ID5 = EVENT.EVENT_ID5 -EVENT_ID6 = EVENT.EVENT_ID6 -EVENT_ID7 = EVENT.EVENT_ID7 - - -class TileConfig: - alignedSize = 32 - fixedRowSize = 16 - fixedColSize = 16 - fixedMxRowSize = 16 - fixedMxColSize = 2 - fractalABSize = 512 - fractalCSize = 1024 - fractalMxSize = 32 - - -# ----------------------------------------------------------------------------- -# Op aliases without "Op" suffix (user-facing) -# ----------------------------------------------------------------------------- -def _install_op_aliases(): - added = [] - for name, obj in _pto_ops_gen.__dict__.items(): - if not isinstance(obj, type): - continue - if not issubclass(obj, _ods_ir.OpView): - continue - 
alias = None - if name.endswith("Op_DPS"): - alias = f"{name[:-6]}_DPS" - elif name.endswith("Op"): - alias = name[:-2] - if not alias or alias in globals(): - continue - globals()[alias] = obj - added.append(alias) - return added - - -__all__.extend(_install_op_aliases()) diff --git a/.agent/skills/translate_cpp2py/references/ptoisa_source/README.md b/.agent/skills/translate_cpp2py/references/ptoisa_source/README.md deleted file mode 100644 index 972afcec..00000000 --- a/.agent/skills/translate_cpp2py/references/ptoisa_source/README.md +++ /dev/null @@ -1,6 +0,0 @@ -If only need one file, use: -- https://gitcode.com/cann/pto-isa/blob/8.5.0/include/pto/common/pto_instr.hpp - -Full references (not put to local dir yet): -- https://gitcode.com/cann/pto-isa/tree/8.5.0/include/pto/npu/a2a3 -- https://gitcode.com/cann/pto-isa/tree/8.5.0/include/pto/common diff --git a/.agent/skills/translate_cpp2py/references/ptoisa_source/pto-inst.hpp b/.agent/skills/translate_cpp2py/references/ptoisa_source/pto-inst.hpp deleted file mode 100644 index ab9b961e..00000000 --- a/.agent/skills/translate_cpp2py/references/ptoisa_source/pto-inst.hpp +++ /dev/null @@ -1,830 +0,0 @@ -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#ifndef PTO_INSTR_HPP -#define PTO_INSTR_HPP - -#include "pto/common/debug.h" -#include "pto/common/pto_instr_impl.hpp" - -#define MAP_INSTR_IMPL(API, ...) 
API##_IMPL(__VA_ARGS__) - -namespace pto { -template -PTO_INST void TASSIGN(T &obj, AddrType addr) { - MAP_INSTR_IMPL(TASSIGN, obj, addr); -} - -#ifndef __CPU_SIM - template - PTO_INST void TSYNC() { - TSYNC_IMPL(); - } -#endif - -template -PTO_INST void TSYNC(WaitEvents&... events) { - WaitAllEvents(events...); -} - -template -PTO_INST RecordEvent TADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TADD, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TABS(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TABS, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSUB, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMUL, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMIN, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMAX, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TEXPANDS(TileData &dst, typename TileData::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TEXPANDS, dst, scalar); - return {}; -} - -template -PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TLOAD, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TCMPS(TileDataDst &dst, TileDataSrc0 &src0, T src1, CmpMode cmpMode, WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCMPS, dst, src0, src1, cmpMode); - return {}; -} - -template -PTO_INST RecordEvent TCMP(TileDataDst &dst, TileDataSrc &src0, TileDataSrc &src1, CmpMode cmpMode, - WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCMP, dst, src0, src1, cmpMode); - return {}; -} - -template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) -{ - TSYNC(events...); - TSTORE_IMPL(dst, src); - return {}; -} - -template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) -{ - TSYNC(events...); - TSTORE_IMPL(dst, src); - return {}; -} - -template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) -{ - TSYNC(events...); - TSTORE_IMPL(dst, src); - return {}; -} - -template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, uint64_t preQuantScalar, WaitEvents &...events) -{ - TSYNC(events...); - TSTORE_IMPL(dst, src, preQuantScalar); - return {}; -} - -template -PTO_INST RecordEvent TSTORE_FP(GlobalData &dst, TileData &src, FpTileData &fp, WaitEvents &...events) -{ - TSYNC(events...); - TSTORE_IMPL(dst, src, fp); - return {}; -} - -template -PTO_INST RecordEvent TDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TDIV, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TREM(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TREM, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TSHL(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSHL, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TSHR(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSHR, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TAND(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TAND, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TOR(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TOR, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TXOR(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TXOR, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TLOG(TileData &dst, TileData &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TLOG, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TNEG(TileData &dst, TileData &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TNEG, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TNOT(TileData &dst, TileData &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TNOT, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TRECIP(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TDIVS, dst, 1, src); - return {}; -} - -template -PTO_INST RecordEvent TRELU(TileData &dst, TileData &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TRELU, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TPRELU(TileData &dst, TileData &src0, TileData &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TPRELU, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TPRINT(TileData &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TPRINT, src); - return {}; -} - -template -PTO_INST RecordEvent TADDC(TileData &dst, TileData &src0, TileData &src1, TileData &src2, WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TADDC, dst, src0, src1, src2); - return {}; -} - -template -PTO_INST RecordEvent TSUBC(TileData &dst, TileData &src0, TileData &src1, TileData &src2, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSUBC, dst, src0, src1, src2); - return {}; -} - -template -PTO_INST RecordEvent TMATMUL_MX( - TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, TileRightScale &bScaleMatrix, WaitEvents&... events) -{ - TSYNC(events...); - MAP_INSTR_IMPL(TMATMUL_MX, cMatrix, aMatrix, aScaleMatrix, bMatrix, bScaleMatrix); - return {}; -} - -template -PTO_INST RecordEvent TMATMUL_MX(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, - TileRight &bMatrix, TileRightScale &bScaleMatrix, WaitEvents&... events) -{ - TSYNC(events...); - MAP_INSTR_IMPL(TMATMUL_MX, cOutMatrix, cInMatrix, aMatrix, aScaleMatrix, bMatrix, bScaleMatrix); - return {}; -} - -template -PTO_INST RecordEvent TMATMUL_MX(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, - TileRightScale &bScaleMatrix, TileBias &biasData, WaitEvents&... events) -{ - TSYNC(events...); - MAP_INSTR_IMPL(TMATMUL_MX, cMatrix, aMatrix, aScaleMatrix, bMatrix, bScaleMatrix, biasData); - return {}; -} - -template -PTO_INST RecordEvent TMATMUL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMATMUL, cMatrix, aMatrix, bMatrix); - return {}; -} - -template -PTO_INST RecordEvent TMATMUL_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix, - WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMATMUL_ACC, cOutMatrix, cInMatrix, aMatrix, bMatrix); - return {}; -} - -template -PTO_INST RecordEvent TMATMUL_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasData, - WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMATMUL_BIAS, cMatrix, aMatrix, bMatrix, biasData); - return {}; -} - -template -PTO_INST RecordEvent -TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executedNumList, - TmpTileData &tmp, Src0TileData &src0, Src1TileData &src1, - Src2TileData &src2, Src3TileData &src3, WaitEvents&... events) { - TSYNC(events...); - TMRGSORT_IMPL( - dst, executedNumList, tmp, src0, src1, src2, src3); - return {}; -} - -template -PTO_INST RecordEvent TMRGSORT(DstTileData &dst, - MrgSortExecutedNumList &executedNumList, - TmpTileData &tmp, Src0TileData &src0, - Src1TileData &src1, Src2TileData &src2, WaitEvents&... events) { - TSYNC(events...); - TMRGSORT_IMPL(dst, executedNumList, tmp, src0, src1, - src2); - return {}; -} - -template -PTO_INST RecordEvent -TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executedNumList, - TmpTileData &tmp, Src0TileData &src0, Src1TileData &src1, WaitEvents&... events) { - TSYNC(events...); - TMRGSORT_IMPL(dst, executedNumList, tmp, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TMRGSORT(DstTileData &dst, SrcTileData &src, - uint32_t blockLen, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMRGSORT, dst, src, blockLen); - return {}; -} - -template -PTO_INST RecordEvent TEXTRACT(DstTileData &dst, SrcTileData &src, - uint16_t indexRow = 0, uint16_t indexCol = 0, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TEXTRACT, dst, src, indexRow, indexCol); - return {}; -} - -template -PTO_INST RecordEvent TFILLPAD(DstTileData &dst, SrcTileData &src, - WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TFILLPAD, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TFILLPAD_INPLACE(DstTileData &dst, SrcTileData &src, - WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TFILLPAD_INPLACE, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TFILLPAD_EXPAND(DstTileData &dst, SrcTileData &src, - WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TFILLPAD_EXPAND, dst, src); - return {}; -} - -// TSORT32不自动实现wait, 需手动TSYNC(events...) -template -PTO_INST RecordEvent TSORT32(DstTileData &dst, SrcTileData &src, IdxTileData &idx) { - MAP_INSTR_IMPL(TSORT32, dst, src, idx); - return {}; -} - -template -PTO_INST RecordEvent TSORT32(DstTileData &dst, SrcTileData &src, IdxTileData &idx, TmpTileData &tmp) { - MAP_INSTR_IMPL(TSORT32, dst, src, idx, tmp); - return {}; -} - -template -PTO_INST RecordEvent TGATHER(TileDataD &dst, TileDataS0 &src0, TileDataS1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TGATHER, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TCI(TileData &dst, T start, WaitEvents&... events) { - TSYNC(events...); - TCI_IMPL(dst, start); - return {}; -} - -template -PTO_INST RecordEvent TTRI(TileData &dst, WaitEvents&... events) { - TSYNC(events...); - TTRI_IMPL(dst); - return {}; -} - -template -PTO_INST RecordEvent TGATHER(DstTileData &dst, SrcTileData &src, WaitEvents&... events) { - TSYNC(events...); - TGATHER_IMPL(dst, src); - return {}; -} - -template -PTO_INST RecordEvent TPARTADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TPARTADD, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TPARTMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TPARTMAX, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TPARTMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TPARTMIN, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, RoundMode mode, WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCVT, dst, src, mode); - return {}; -} - -template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMOV, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents&... events) { - TSYNC(events...); - TMOV_IMPL(dst, src); - return {}; -} - -template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents&... events) { - TSYNC(events...); - TMOV_IMPL(dst, src); - return {}; -} - -template -PTO_INST RecordEvent TMOV_FP(DstTileData &dst, SrcTileData &src, FpTileData &fp, WaitEvents&... events) { - TSYNC(events...); - TMOV_IMPL(dst, src, fp); - return {}; -} - -template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, FpTileData &fp, WaitEvents&... events) { - TSYNC(events...); - TMOV_IMPL(dst, src, fp); - return {}; -} - -template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, WaitEvents&... events) { - TSYNC(events...); - TMOV_IMPL(dst, src, preQuantScalar); - return {}; -} - -template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, WaitEvents&... events) { - TSYNC(events...); - TMOV_IMPL(dst, src, preQuantScalar); - return {}; -} - -template -PTO_INST RecordEvent TROWSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWSUM, dst, src, tmp); - return {}; -} - -template -PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCOLSUM, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, bool isBinary, - WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCOLSUM, dst, src, tmp, isBinary); - return {}; -} - -template -PTO_INST RecordEvent TCOLMAX(TileDataOut &dst, TileDataIn &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCOLMAX, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TROWMAX(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWMAX, dst, src, tmp); - return {}; -} - -template -PTO_INST RecordEvent TRESHAPE(TileDataOut &dst, TileDataIn &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TRESHAPE, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TROWMIN(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWMIN, dst, src, tmp); - return {}; -} - -template -PTO_INST RecordEvent TSELS(TileData &dst, TileData &src0, TileData &src1, uint8_t selectMode, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSELS, dst, src0, src1, selectMode); - return {}; -} - -template -PTO_INST RecordEvent TSEL(TileData &dst, MaskTile &selMask, TileData &src0, TileData &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSEL, dst, selMask, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TTRANS(TileDataDst &dst, TileDataSrc &src, TileDataTmp &tmp, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TTRANS, dst, src, tmp); - return {}; -} - -template -PTO_INST RecordEvent TMINS(TileDataDst &dst, TileDataSrc &src, typename TileDataSrc::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMINS, dst, src, scalar); - return {}; -} - -template -PTO_INST RecordEvent TROWEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWEXPAND, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TROWEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWEXPANDDIV, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TROWEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWEXPANDMUL, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TROWEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWEXPANDSUB, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TROWEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWEXPANDADD, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TROWEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWEXPANDMAX, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TROWEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TROWEXPANDMIN, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TRSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TRSQRT, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSQRT, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TEXP(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TEXP, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TGATHERB(TileDataDst &dst, TileDataSrc &src, TileDataOffset &offset, WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TGATHERB, dst, src, offset); - return {}; -} - -template -PTO_INST RecordEvent TADDS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TADDS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TSUBS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSUBS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TDIVS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TDIVS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TMULS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMULS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TDIVS(TileDataDst &dst, typename TileDataDst::DType scalar, TileDataSrc &src0, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TDIVS, dst, scalar, src0); - return {}; -} - -template -PTO_INST RecordEvent TREMS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TREMS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TMAXS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TMAXS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TANDS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TANDS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TORS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(TORS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TXORS(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TXORS, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TLRELU(TileData &dst, TileData &src0, typename TileData::DType scalar, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TLRELU, dst, src0, scalar); - return {}; -} - -template -PTO_INST RecordEvent TADDSC(TileData &dst, TileData &src0, typename TileData::DType scalar, TileData &src1, - WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TADDSC, dst, src0, scalar, src1); - return {}; -} - -template -PTO_INST RecordEvent TSUBSC(TileData &dst, TileData &src0, typename TileData::DType scalar, TileData &src1, - WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSUBSC, dst, src0, scalar, src1); - return {}; -} - -template -PTO_INST RecordEvent TCOLMIN(TileDataOut &dst, TileDataIn &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCOLMIN, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TSCATTER(TileDataD &dst, TileDataS &src, TileDataI &indexes, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TSCATTER, dst, src, indexes); - return {}; -} - -template -PTO_INST RecordEvent TCOLEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCOLEXPAND, dst, src); - return {}; -} - -template -PTO_INST RecordEvent MGATHER(TileDst &dst, GlobalData &src, TileInd &indexes, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(MGATHER, dst, src, indexes); - return {}; -} - -template -PTO_INST RecordEvent MSCATTER(GlobalData &dst, TileSrc &src, TileInd &indexes, WaitEvents&... 
events) { - TSYNC(events...); - MAP_INSTR_IMPL(MSCATTER, dst, src, indexes); - return {}; -} - -template -PTO_INST RecordEvent TNEG(TileDataDst &dst, TileDataSrc &src, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TNEG, dst, src); - return {}; -} - -template -PTO_INST RecordEvent TCOLEXPANDDIV(TileDataDst &dst, TileDataDst &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCOLEXPANDDIV, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TCOLEXPANDMUL(TileDataDst &dst, TileDataDst &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCOLEXPANDMUL, dst, src0, src1); - return {}; -} - -template -PTO_INST RecordEvent TCOLEXPANDSUB(TileDataDst &dst, TileDataDst &src0, TileDataSrc1 &src1, WaitEvents&... events) { - TSYNC(events...); - MAP_INSTR_IMPL(TCOLEXPANDSUB, dst, src0, src1); - return {}; -} - -} // namespace pto -#endif From 2fa71523c96ec405d4cac4acf8969588af4004fb Mon Sep 17 00:00:00 2001 From: jiawei_zhuang Date: Tue, 17 Mar 2026 17:21:22 +0100 Subject: [PATCH 44/53] collect fast_inverse as translation example --- .../scripts/example_list.json | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.agent/skills/translate_cpp2py/scripts/example_list.json b/.agent/skills/translate_cpp2py/scripts/example_list.json index 8af6ab1a..287fe70a 100644 --- a/.agent/skills/translate_cpp2py/scripts/example_list.json +++ b/.agent/skills/translate_cpp2py/scripts/example_list.json @@ -71,6 +71,33 @@ "pto_file": "hadamard_manual_sync.pto", "cpp_file": "hadamard_manual_sync.cpp" }, + { + "example_dir": "fast_inverse", + "compile_script": "compile.sh", + "py_source": "inverse_builder.py", + "py_command": "python ./inverse_builder.py --matrix-size 16 --kernel-name tri_inv_trick_fp16_16 > ./build_artifacts/inverse_auto_sync_16.pto", + "ptoas_command": "ptoas --enable-insert-sync ./build_artifacts/inverse_auto_sync_16.pto -o 
./build_artifacts/inverse_auto_sync_16.cpp", + "pto_file": "build_artifacts/inverse_auto_sync_16.pto", + "cpp_file": "build_artifacts/inverse_auto_sync_16.cpp" + }, + { + "example_dir": "fast_inverse", + "compile_script": "compile.sh", + "py_source": "inverse_builder.py", + "py_command": "python ./inverse_builder.py --manual-sync --matrix-size 16 --kernel-name tri_inv_trick_fp16_16 > ./build_artifacts/inverse_manual_sync_16.pto", + "ptoas_command": "ptoas ./build_artifacts/inverse_manual_sync_16.pto -o ./build_artifacts/inverse_manual_sync_16.cpp", + "pto_file": "build_artifacts/inverse_manual_sync_16.pto", + "cpp_file": "build_artifacts/inverse_manual_sync_16.cpp" + }, + { + "example_dir": "fast_inverse", + "compile_script": "compile.sh", + "py_source": "inverse_builder.py", + "py_command": "python ./inverse_builder.py --matrix-size 128 --kernel-name tri_inv_trick_fp16_128 > ./build_artifacts/inverse_auto_sync_128.pto", + "ptoas_command": "ptoas --enable-insert-sync ./build_artifacts/inverse_auto_sync_128.pto -o ./build_artifacts/inverse_auto_sync_128.cpp", + "pto_file": "build_artifacts/inverse_auto_sync_128.pto", + "cpp_file": "build_artifacts/inverse_auto_sync_128.cpp" + }, { "example_dir": "matmul_optimization_guide", "compile_script": "compile.sh", From 1248fab537b2e04a2a8377b4ed11ee36687627d3 Mon Sep 17 00:00:00 2001 From: jiawei_zhuang Date: Tue, 17 Mar 2026 17:33:24 +0100 Subject: [PATCH 45/53] fix subdir --- .../scripts/example_list.json | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.agent/skills/translate_cpp2py/scripts/example_list.json b/.agent/skills/translate_cpp2py/scripts/example_list.json index 287fe70a..7c1b6a1b 100644 --- a/.agent/skills/translate_cpp2py/scripts/example_list.json +++ b/.agent/skills/translate_cpp2py/scripts/example_list.json @@ -75,28 +75,28 @@ "example_dir": "fast_inverse", "compile_script": "compile.sh", "py_source": "inverse_builder.py", - "py_command": "python ./inverse_builder.py 
--matrix-size 16 --kernel-name tri_inv_trick_fp16_16 > ./build_artifacts/inverse_auto_sync_16.pto", - "ptoas_command": "ptoas --enable-insert-sync ./build_artifacts/inverse_auto_sync_16.pto -o ./build_artifacts/inverse_auto_sync_16.cpp", - "pto_file": "build_artifacts/inverse_auto_sync_16.pto", - "cpp_file": "build_artifacts/inverse_auto_sync_16.cpp" + "py_command": "python ./inverse_builder.py --matrix-size 16 --kernel-name tri_inv_trick_fp16_16 > ./inverse_auto_sync_16.pto", + "ptoas_command": "ptoas --enable-insert-sync ./inverse_auto_sync_16.pto -o ./inverse_auto_sync_16.cpp", + "pto_file": "inverse_auto_sync_16.pto", + "cpp_file": "inverse_auto_sync_16.cpp" }, { "example_dir": "fast_inverse", "compile_script": "compile.sh", "py_source": "inverse_builder.py", - "py_command": "python ./inverse_builder.py --manual-sync --matrix-size 16 --kernel-name tri_inv_trick_fp16_16 > ./build_artifacts/inverse_manual_sync_16.pto", - "ptoas_command": "ptoas ./build_artifacts/inverse_manual_sync_16.pto -o ./build_artifacts/inverse_manual_sync_16.cpp", - "pto_file": "build_artifacts/inverse_manual_sync_16.pto", - "cpp_file": "build_artifacts/inverse_manual_sync_16.cpp" + "py_command": "python ./inverse_builder.py --manual-sync --matrix-size 16 --kernel-name tri_inv_trick_fp16_16 > ./inverse_manual_sync_16.pto", + "ptoas_command": "ptoas ./inverse_manual_sync_16.pto -o ./inverse_manual_sync_16.cpp", + "pto_file": "inverse_manual_sync_16.pto", + "cpp_file": "inverse_manual_sync_16.cpp" }, { "example_dir": "fast_inverse", "compile_script": "compile.sh", "py_source": "inverse_builder.py", - "py_command": "python ./inverse_builder.py --matrix-size 128 --kernel-name tri_inv_trick_fp16_128 > ./build_artifacts/inverse_auto_sync_128.pto", - "ptoas_command": "ptoas --enable-insert-sync ./build_artifacts/inverse_auto_sync_128.pto -o ./build_artifacts/inverse_auto_sync_128.cpp", - "pto_file": "build_artifacts/inverse_auto_sync_128.pto", - "cpp_file": 
"build_artifacts/inverse_auto_sync_128.cpp" + "py_command": "python ./inverse_builder.py --matrix-size 128 --kernel-name tri_inv_trick_fp16_128 > ./inverse_auto_sync_128.pto", + "ptoas_command": "ptoas --enable-insert-sync ./inverse_auto_sync_128.pto -o ./inverse_auto_sync_128.cpp", + "pto_file": "inverse_auto_sync_128.pto", + "cpp_file": "inverse_auto_sync_128.cpp" }, { "example_dir": "matmul_optimization_guide", From 9c84cfdbdefd03659084fd95d5f5f1648477b446 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:10:06 +0100 Subject: [PATCH 46/53] User-friendly guide for fast-inverse-trick in Python (#93) Cleans up #90 #91 for human readers. --- .../references/example_translation/.gitignore | 1 + .../references/example_translation/.gitkeep | 0 .../references/external_repo/README.md | 2 +- .../scripts/example_list.json | 30 +- docker/Dockerfile | 14 +- .../matmul_dynbatch_multicore_2buf/.gitignore | 1 + examples/aot/fast_inverse/README.md | 7 - .../aot/fast_inverse/basic_dense/.gitignore | 1 + .../aot/fast_inverse/basic_dense/README.md | 27 ++ .../fast_inverse/basic_dense/bench_inverse.py | 200 ++++++++++ .../aot/fast_inverse/basic_dense/caller.cpp | 26 ++ .../aot/fast_inverse/basic_dense/compile.sh | 34 ++ .../basic_dense/inverse_builder.py | 174 ++++++++ .../fast_inverse/basic_dense/run_inverse.py | 155 ++++++++ .../fast_inverse/block_inversion/README.md | 20 + .../fast_inverse/block_inversion/caller.cpp | 24 ++ .../fast_inverse/block_inversion/compile.sh | 33 ++ .../block_inversion/inverse_builder.py | 235 +++++++++++ .../block_inversion/run_inverse.py | 215 ++++++++++ examples/aot/fast_inverse/caller.cpp | 92 ----- examples/aot/fast_inverse/compile.sh | 74 ---- examples/aot/fast_inverse/inverse_builder.py | 370 ------------------ examples/aot/fast_inverse/run_inverse.py | 220 ----------- .../experimental/.gitignore | 2 + examples/aot/print_tile/.gitignore | 1 + 25 files changed, 1172 insertions(+), 
786 deletions(-) create mode 100644 .agent/skills/translate_cpp2py/references/example_translation/.gitignore delete mode 100644 .agent/skills/translate_cpp2py/references/example_translation/.gitkeep create mode 100644 examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/.gitignore delete mode 100644 examples/aot/fast_inverse/README.md create mode 100644 examples/aot/fast_inverse/basic_dense/.gitignore create mode 100644 examples/aot/fast_inverse/basic_dense/README.md create mode 100644 examples/aot/fast_inverse/basic_dense/bench_inverse.py create mode 100644 examples/aot/fast_inverse/basic_dense/caller.cpp create mode 100644 examples/aot/fast_inverse/basic_dense/compile.sh create mode 100644 examples/aot/fast_inverse/basic_dense/inverse_builder.py create mode 100644 examples/aot/fast_inverse/basic_dense/run_inverse.py create mode 100644 examples/aot/fast_inverse/block_inversion/README.md create mode 100644 examples/aot/fast_inverse/block_inversion/caller.cpp create mode 100644 examples/aot/fast_inverse/block_inversion/compile.sh create mode 100644 examples/aot/fast_inverse/block_inversion/inverse_builder.py create mode 100644 examples/aot/fast_inverse/block_inversion/run_inverse.py delete mode 100644 examples/aot/fast_inverse/caller.cpp delete mode 100644 examples/aot/fast_inverse/compile.sh delete mode 100644 examples/aot/fast_inverse/inverse_builder.py delete mode 100644 examples/aot/fast_inverse/run_inverse.py create mode 100644 examples/aot/print_tile/.gitignore diff --git a/.agent/skills/translate_cpp2py/references/example_translation/.gitignore b/.agent/skills/translate_cpp2py/references/example_translation/.gitignore new file mode 100644 index 00000000..72e8ffc0 --- /dev/null +++ b/.agent/skills/translate_cpp2py/references/example_translation/.gitignore @@ -0,0 +1 @@ +* diff --git a/.agent/skills/translate_cpp2py/references/example_translation/.gitkeep b/.agent/skills/translate_cpp2py/references/example_translation/.gitkeep deleted file mode 100644 index 
e69de29b..00000000 diff --git a/.agent/skills/translate_cpp2py/references/external_repo/README.md b/.agent/skills/translate_cpp2py/references/external_repo/README.md index c36bc440..8ac51e5e 100644 --- a/.agent/skills/translate_cpp2py/references/external_repo/README.md +++ b/.agent/skills/translate_cpp2py/references/external_repo/README.md @@ -2,7 +2,7 @@ This directory holds the 3rd-party repos that are used internally by PTO-DSL: - https://github.com/zhangstevenunity/PTOAS: implements "ptoas" command line tool, the PTO MLIR dialect and its Python bindings, and the InjectSync pass to insert set_flag/wait_flag for "auto-sync" mode. Important files are: - `PTOAS/include/PTO/IR/PTOOps.td` defines the MLIR PTO dialect - `PTOAS/python/pto/dialects/pto.py` has low-level Python wrappers of PTO MLIR python binding (more Pythonic wrappers are in pto-dsl package) - - `PTOAS/lib/PTO/Transforms/PTOToEmitC.cpp` the compile pass that converts `*.pto` IR to C++ source code based on PTO-ISA headers. + - `PTOAS/lib/PTO/Transforms/PTOToEmitC.cpp` the compile pass that converts `*.pto` IR to C++ source code based on PTO-ISA headers. - https://gitcode.com/cann/pto-isa: header-only library that defined the C++ APIs of PTO-ISA. It is the target API set for the `PTOToEmitC` pass in PTOAS. 
Important files are: - `pto-isa/include/pto/common/pto_instr.hpp` the top-level interface - `pto-isa/include/pto/common/*` common type definitions diff --git a/.agent/skills/translate_cpp2py/scripts/example_list.json b/.agent/skills/translate_cpp2py/scripts/example_list.json index 7c1b6a1b..453406ff 100644 --- a/.agent/skills/translate_cpp2py/scripts/example_list.json +++ b/.agent/skills/translate_cpp2py/scripts/example_list.json @@ -72,31 +72,31 @@ "cpp_file": "hadamard_manual_sync.cpp" }, { - "example_dir": "fast_inverse", + "example_dir": "fast_inverse/basic_dense", "compile_script": "compile.sh", "py_source": "inverse_builder.py", - "py_command": "python ./inverse_builder.py --matrix-size 16 --kernel-name tri_inv_trick_fp16_16 > ./inverse_auto_sync_16.pto", - "ptoas_command": "ptoas --enable-insert-sync ./inverse_auto_sync_16.pto -o ./inverse_auto_sync_16.cpp", - "pto_file": "inverse_auto_sync_16.pto", - "cpp_file": "inverse_auto_sync_16.cpp" + "py_command": "python ./inverse_builder.py --matrix-size 64 > ./inverse_basic_dense_64.pto", + "ptoas_command": "ptoas --enable-insert-sync ./inverse_basic_dense_64.pto -o ./inverse_basic_dense_64.cpp", + "pto_file": "inverse_basic_dense_64.pto", + "cpp_file": "inverse_basic_dense_64.cpp" }, { - "example_dir": "fast_inverse", + "example_dir": "fast_inverse/block_inversion", "compile_script": "compile.sh", "py_source": "inverse_builder.py", - "py_command": "python ./inverse_builder.py --manual-sync --matrix-size 16 --kernel-name tri_inv_trick_fp16_16 > ./inverse_manual_sync_16.pto", - "ptoas_command": "ptoas ./inverse_manual_sync_16.pto -o ./inverse_manual_sync_16.cpp", - "pto_file": "inverse_manual_sync_16.pto", - "cpp_file": "inverse_manual_sync_16.cpp" + "py_command": "python ./inverse_builder.py --matrix-size 64 > ./inverse_block_inversion_64.pto", + "ptoas_command": "ptoas --enable-insert-sync ./inverse_block_inversion_64.pto -o ./inverse_block_inversion_64.cpp", + "pto_file": "inverse_block_inversion_64.pto", + 
"cpp_file": "inverse_block_inversion_64.cpp" }, { - "example_dir": "fast_inverse", + "example_dir": "fast_inverse/basic_dense", "compile_script": "compile.sh", "py_source": "inverse_builder.py", - "py_command": "python ./inverse_builder.py --matrix-size 128 --kernel-name tri_inv_trick_fp16_128 > ./inverse_auto_sync_128.pto", - "ptoas_command": "ptoas --enable-insert-sync ./inverse_auto_sync_128.pto -o ./inverse_auto_sync_128.cpp", - "pto_file": "inverse_auto_sync_128.pto", - "cpp_file": "inverse_auto_sync_128.cpp" + "py_command": "python ./inverse_builder.py --matrix-size 128 > ./inverse_basic_dense_128.pto", + "ptoas_command": "ptoas --enable-insert-sync ./inverse_basic_dense_128.pto -o ./inverse_basic_dense_128.cpp", + "pto_file": "inverse_basic_dense_128.pto", + "cpp_file": "inverse_basic_dense_128.cpp" }, { "example_dir": "matmul_optimization_guide", diff --git a/docker/Dockerfile b/docker/Dockerfile index 73c20dfc..7eda8bc5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -15,6 +15,13 @@ RUN pip install --no-cache-dir \ pytest pybind11 nanobind setuptools wheel \ ipython jupyterlab matplotlib pandas +# certain operations need latest isa header, not CANN 8.5.0 default +# header on 2026/03/16 +ARG PTOISA_COMMIT=313817be696792a4e16a7ea5994ec98e34391613 +WORKDIR /sources +RUN git clone https://gitcode.com/cann/pto-isa.git \ + && cd pto-isa && git checkout $PTOISA_COMMIT + # cache above layers unrelated to ptoas version change # change this ununsed arg if need to force rebuild later lines @@ -54,10 +61,3 @@ RUN ptoas ./tmatmulk.pto -o ./tmatmulk.cpp RUN python ./abs.py > ./abs.pto RUN ptoas --enable-insert-sync ./abs.pto -o ./abs.cpp - -# certain operations need latest isa header, not CANN 8.5.0 default -# header on 2026/03/16 -ARG PTOISA_COMMIT=313817be696792a4e16a7ea5994ec98e34391613 -WORKDIR /sources -RUN git clone https://gitcode.com/cann/pto-isa.git \ - && cd pto-isa && git checkout $PTOISA_COMMIT diff --git 
a/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/.gitignore b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/.gitignore new file mode 100644 index 00000000..04529667 --- /dev/null +++ b/examples/aot/batch_matmul/matmul_dynbatch_multicore_2buf/.gitignore @@ -0,0 +1 @@ +mul.cpp diff --git a/examples/aot/fast_inverse/README.md b/examples/aot/fast_inverse/README.md deleted file mode 100644 index 204f17c4..00000000 --- a/examples/aot/fast_inverse/README.md +++ /dev/null @@ -1,7 +0,0 @@ -Usage: - -```bash -bash ./compile.sh # generate PTO/CPP and build both auto/manual sync libs -python ./run_inverse.py # default to auto-sync kernel -python ./run_inverse.py --manual-sync # test manual-sync kernel -``` diff --git a/examples/aot/fast_inverse/basic_dense/.gitignore b/examples/aot/fast_inverse/basic_dense/.gitignore new file mode 100644 index 00000000..e33609d2 --- /dev/null +++ b/examples/aot/fast_inverse/basic_dense/.gitignore @@ -0,0 +1 @@ +*.png diff --git a/examples/aot/fast_inverse/basic_dense/README.md b/examples/aot/fast_inverse/basic_dense/README.md new file mode 100644 index 00000000..acc5a93c --- /dev/null +++ b/examples/aot/fast_inverse/basic_dense/README.md @@ -0,0 +1,27 @@ +```bash +bash compile.sh 64 # build -> inverse_lib.so + +# Validate correctness +python run_inverse.py --matrix-size 64 + +# Another matrix size +python run_inverse.py --matrix-size 128 + +# Measure effective bandwidth +python bench_inverse.py --matrix-size 64 --out-png bench_inverse_bandwidth.png +``` + +`bench_inverse.py` reports and plots bandwidth using only: +- read of `in_delta` (`torch_to_ctypes(in_delta)`) +- write of `out` (`torch_to_ctypes(out)`) + +Timing measures only the kernel launch (`lib.call_kernel(...)`) and excludes tensor +preparation (`identity`, `in_delta`, `identity_neg`, `out` creation). 
+ +This dense demo uses input shape `[batch, n, n]` and applies the same fast-inverse recurrence +as the block-diagonal example, with `log2_blocksize = log2(n)` (no extra diagonal block size). +It uses persistent-kernel style launch with fixed `blockDim=24`, and each core loops over +its assigned batch indices at runtime. + +For numerical stability in this educational demo, test inputs are generated as: +`M = I + scale * random`, and the kernel computes `inv(M)` via `A = M - I`. diff --git a/examples/aot/fast_inverse/basic_dense/bench_inverse.py b/examples/aot/fast_inverse/basic_dense/bench_inverse.py new file mode 100644 index 00000000..893f7ddf --- /dev/null +++ b/examples/aot/fast_inverse/basic_dense/bench_inverse.py @@ -0,0 +1,200 @@ +import argparse +import ctypes +import math +import random + +import numpy as np +import torch +import torch_npu # noqa: F401 + +try: + import matplotlib.pyplot as plt +except ImportError: + plt = None + +from ptodsl import do_bench +from ptodsl.test_util import get_test_device + +random.seed(42) +torch.manual_seed(42) +np.random.seed(42) + +SUPPORTED_MATRIX_SIZES = (16, 32, 64, 128) +DEFAULT_BATCH_SIZES = [2**k for k in range(4, 16)] # 16, 32, ..., 32768 +try: + PERSISTENT_BLOCK_DIM = int(torch.npu.get_device_properties("npu").cube_core_num) +except Exception: + PERSISTENT_BLOCK_DIM = 24 + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def _dtype_nbytes(dtype: torch.dtype) -> int: + return torch.empty((), dtype=dtype).element_size() + + +def inverse_io_bytes(in_delta: torch.Tensor, out: torch.Tensor) -> int: + # Requested traffic model: read in_delta + write out only. 
+ return in_delta.numel() * _dtype_nbytes( + in_delta.dtype + ) + out.numel() * _dtype_nbytes(out.dtype) + + +def load_lib(lib_path): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim (fixed core count) + ctypes.c_void_p, # stream + ctypes.c_void_p, # out + ctypes.c_void_p, # in_delta (M - I) + ctypes.c_void_p, # identity_neg + ctypes.c_uint32, # runtime batch_size + ctypes.c_uint32, # log2(matrix_size) + ] + lib.call_kernel.restype = None + return lib + + +def dense_stable_matrix(n, batch, scale=0.02): + eye = np.eye(n, dtype=np.float32) + noise = np.random.uniform(-1.0, 1.0, size=(batch, n, n)).astype(np.float32) + out = eye[None, :, :] + scale * noise + return torch.from_numpy(out) + + +def benchmark_kernel_seconds(kernel_launch_fn, warmup: int, iters: int) -> float: + # Measure kernel launch only (preparation is done outside this function). + return do_bench( + kernel_launch_fn, warmup_iters=warmup, benchmark_iters=iters, unit="s" + ) + + +def run_benchmark( + lib, + *, + label: str, + matrix_size: int, + batch_sizes: list[int], + warmup: int, + iters: int, +): + log2_blocksize = int(math.log2(matrix_size)) + stream_ptr = torch.npu.current_stream()._as_parameter_ + bandwidth_gib_s = [] + + print(f"\n=== benchmark {label} ===") + for batch in batch_sizes: + inp = dense_stable_matrix(n=matrix_size, batch=batch).to(device) + inp_fp16 = inp.to(torch.float16).contiguous() + + # Preparation work excluded from benchmark timing. 
+ identity = torch.eye(matrix_size, dtype=torch.float16, device=device) + in_delta = (inp_fp16 - identity).contiguous() + identity_neg = (-identity).contiguous() + out = torch.zeros_like(inp_fp16, dtype=torch.float32, device=device) + + def launch_only(): + lib.call_kernel( + PERSISTENT_BLOCK_DIM, + stream_ptr, + torch_to_ctypes(out), + torch_to_ctypes(in_delta), + torch_to_ctypes(identity_neg), + batch, + log2_blocksize, + ) + + avg_s = benchmark_kernel_seconds(launch_only, warmup=warmup, iters=iters) + io_bytes = inverse_io_bytes(in_delta, out) + total_traffic_gib = io_bytes / (1024**3) + gib_s = io_bytes / avg_s / (1024**3) + bandwidth_gib_s.append(gib_s) + print( + f"{label:>6s} | batch={batch:5d} | {avg_s * 1e3:.3f} ms | " + f"{gib_s:.2f} GiB/s | traffic={total_traffic_gib:.4f} GiB" + ) + + return bandwidth_gib_s + + +def plot_results( + batch_sizes: list[int], + bw_gib_s: list[float], + out_png: str, + n: int, +) -> None: + if plt is None: + print("Warning: matplotlib is not installed; skipping plot generation.") + return + + plt.figure(figsize=(8, 5)) + plt.plot(batch_sizes, bw_gib_s, "o-", label="kernel") + plt.xlabel("Batch size") + plt.ylabel("Bandwidth (GiB/s)") + plt.title(f"Fast Inverse Bandwidth (n={n})") + plt.xscale("log", base=2) + plt.xticks(batch_sizes, [str(x) for x in batch_sizes]) + plt.grid(True, linestyle="--", alpha=0.6) + plt.legend() + plt.tight_layout() + plt.savefig(out_png) + print(f"Saved plot to {out_png}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--matrix-size", + type=int, + choices=SUPPORTED_MATRIX_SIZES, + default=64, + help="Dense matrix size n.", + ) + parser.add_argument( + "--batch-sizes", + type=int, + nargs="+", + default=DEFAULT_BATCH_SIZES, + help="Batch sizes to benchmark.", + ) + parser.add_argument( + "--warmup", + type=int, + default=20, + help="Number of warmup iterations for each batch size.", + ) + parser.add_argument( + "--iters", + type=int, + default=50, + 
help="Number of measured iterations for each batch size.", + ) + parser.add_argument( + "--lib-path", + type=str, + default="./inverse_lib.so", + help="Shared library path produced by compile.sh.", + ) + parser.add_argument( + "--out-png", + type=str, + default="bench_inverse_bandwidth.png", + help="Output image path for the benchmark figure.", + ) + args = parser.parse_args() + + device = get_test_device() + torch.npu.set_device(device) + + lib = load_lib(args.lib_path) + bw = run_benchmark( + lib, + label="kernel", + matrix_size=args.matrix_size, + batch_sizes=args.batch_sizes, + warmup=args.warmup, + iters=args.iters, + ) + plot_results(args.batch_sizes, bw, args.out_png, args.matrix_size) diff --git a/examples/aot/fast_inverse/basic_dense/caller.cpp b/examples/aot/fast_inverse/basic_dense/caller.cpp new file mode 100644 index 00000000..131846e6 --- /dev/null +++ b/examples/aot/fast_inverse/basic_dense/caller.cpp @@ -0,0 +1,26 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "inverse.cpp" +#endif + +#ifndef KERNEL_FN +#define KERNEL_FN tri_inv_trick_fp16 +#endif + +#include KERNEL_CPP + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *tensor_out, + uint8_t *tensor_in, + uint8_t *identity_in, + uint32_t runtime_batch_size, + uint32_t log2_blocksize) +{ + KERNEL_FN<<>>( + reinterpret_cast(tensor_out), + reinterpret_cast(tensor_in), + reinterpret_cast(identity_in), + static_cast(runtime_batch_size), + static_cast(log2_blocksize)); +} diff --git a/examples/aot/fast_inverse/basic_dense/compile.sh b/examples/aot/fast_inverse/basic_dense/compile.sh new file mode 100644 index 00000000..94619904 --- /dev/null +++ b/examples/aot/fast_inverse/basic_dense/compile.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -euo pipefail + +ARTIFACT_DIR="./build_artifacts" +MATRIX_SIZE="${1:-64}" +if [[ $# -gt 1 ]]; then + echo "Usage: bash compile.sh [matrix_size]" + exit 1 +fi + +mkdir -p "${ARTIFACT_DIR}" +rm -f "${ARTIFACT_DIR}/inverse.pto" 
"${ARTIFACT_DIR}/inverse.cpp" "inverse_lib.so" + +python ./inverse_builder.py --matrix-size "${MATRIX_SIZE}" > "${ARTIFACT_DIR}/inverse.pto" +ptoas --enable-insert-sync "${ARTIFACT_DIR}/inverse.pto" -o "${ARTIFACT_DIR}/inverse.cpp" + +PTO_LIB_PATH=/sources/pto-isa +# PTO_LIB_PATH=$ASCEND_TOOLKIT_HOME + +bisheng \ + -I${PTO_LIB_PATH}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"${ARTIFACT_DIR}/inverse.cpp\"" \ + ./caller.cpp \ + -o "./inverse_lib.so" diff --git a/examples/aot/fast_inverse/basic_dense/inverse_builder.py b/examples/aot/fast_inverse/basic_dense/inverse_builder.py new file mode 100644 index 00000000..498abdc5 --- /dev/null +++ b/examples/aot/fast_inverse/basic_dense/inverse_builder.py @@ -0,0 +1,174 @@ +# pyright: reportUndefinedVariable=false +import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const +SUPPORTED_MATRIX_SIZES = (16, 32, 64, 128) + + +def make_meta_data(n: int): + def meta_data(): + in_dtype = pto.float16 + out_dtype = pto.float32 + i32 = pto.int32 + + in_ptr_type = pto.PtrType(in_dtype) + out_ptr_type = pto.PtrType(out_dtype) + in_tensor_type = pto.TensorType(rank=2, dtype=in_dtype) + out_tensor_type = pto.TensorType(rank=2, dtype=out_dtype) + in_subtensor = pto.SubTensorType(shape=[n, n], dtype=in_dtype) + out_subtensor = pto.SubTensorType(shape=[n, n], dtype=out_dtype) + l1_tile_type = pto.TileBufType( + shape=[n, n], valid_shape=[n, n], dtype=in_dtype, memory_space="MAT" + ) + l0a_tile_type = pto.TileBufType( + shape=[n, n], valid_shape=[n, n], dtype=in_dtype, 
memory_space="LEFT" + ) + l0b_tile_type = pto.TileBufType( + shape=[n, n], valid_shape=[n, n], dtype=in_dtype, memory_space="RIGHT" + ) + l0c_tile_type = pto.TileBufType( + shape=[n, n], valid_shape=[n, n], dtype=out_dtype, memory_space="ACC" + ) + + return { + "in_ptr_type": in_ptr_type, + "out_ptr_type": out_ptr_type, + "i32": i32, + "in_tensor_type": in_tensor_type, + "out_tensor_type": out_tensor_type, + "in_subtensor": in_subtensor, + "out_subtensor": out_subtensor, + "l1_tile_type": l1_tile_type, + "l0a_tile_type": l0a_tile_type, + "l0b_tile_type": l0b_tile_type, + "l0c_tile_type": l0c_tile_type, + } + + return meta_data + + +def build_kernel(matrix_size: int): + @to_ir_module(meta_data=make_meta_data(matrix_size)) + def tri_inv_trick_fp16( + out_ptr: "out_ptr_type", + in_ptr: "in_ptr_type", + i_neg_ptr: "in_ptr_type", + matrix_size_i32: "i32", + log2_blocksize_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + n_c = const(matrix_size) + + batch_size = s.index_cast(matrix_size_i32) + log2_blocksize = s.index_cast(log2_blocksize_i32) + block_idx = s.index_cast(pto.get_block_idx()) + num_cores = s.index_cast(pto.get_block_num()) + total_rows = batch_size * n_c + + # Persistent-kernel work split: base + remainder. 
+ base = batch_size // num_cores + rem = batch_size % num_cores + lt_rem = s.lt(block_idx, rem) + min_bid_rem = s.min_u(block_idx, rem) + b_start = block_idx * base + min_bid_rem + length = base + s.select(lt_rem, c1, c0) + b_end = s.min_u(b_start + length, batch_size) + + tv_m = pto.as_tensor( + in_tensor_type, ptr=in_ptr, shape=[total_rows, n_c], strides=[n_c, c1] + ) + tv_out = pto.as_tensor( + out_tensor_type, ptr=out_ptr, shape=[total_rows, n_c], strides=[n_c, c1] + ) + tv_i_neg = pto.as_tensor( + in_tensor_type, ptr=i_neg_ptr, shape=[n_c, n_c], strides=[n_c, c1] + ) + + sv_i_neg = pto.slice_view( + in_subtensor, source=tv_i_neg, offsets=[c0, c0], sizes=[n_c, n_c] + ) + + i_neg_l1 = pto.alloc_tile(l1_tile_type) + x_l1 = pto.alloc_tile(l1_tile_type) + y_l1 = pto.alloc_tile(l1_tile_type) + i_l1 = pto.alloc_tile(l1_tile_type) + a_l0 = pto.alloc_tile(l0a_tile_type) + b_l0 = pto.alloc_tile(l0b_tile_type) + c_l0 = pto.alloc_tile(l0c_tile_type) + + pto.load(sv_i_neg, i_neg_l1) + # I = (-I) @ (-I) is batch-invariant, so compute it once. + tile.mov(i_neg_l1, a_l0) + tile.mov(i_neg_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, i_l1) + + for b_idx in pto.range(b_start, b_end, c1): + row_offset = b_idx * n_c + sv_m = pto.slice_view( + in_subtensor, + source=tv_m, + offsets=[row_offset, c0], + sizes=[n_c, n_c], + ) + sv_out = pto.slice_view( + out_subtensor, + source=tv_out, + offsets=[row_offset, c0], + sizes=[n_c, n_c], + ) + + # in_ptr carries A = M - I, where M is the dense matrix to invert. 
+ pto.load(sv_m, y_l1) + + tile.mov(y_l1, a_l0) + tile.mov(y_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, y_l1) # y = A @ A + + tile.mov(i_neg_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) # c = -A + + tile.mov(i_neg_l1, a_l0) + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) # c = I - A + tile.mov(c_l0, x_l1) # x = I - A + + # Mirrors: + # for i in range(log2_c - 1): + # X, Y = (X + X @ Y, Y @ Y) + for iter_idx in pto.range(c0, log2_blocksize, c1): + tile.mov(x_l1, a_l0) + tile.mov(i_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + + tile.mov(y_l1, b_l0) + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) # x + x @ y + + with pto.if_context(iter_idx + c1 < log2_blocksize): + tile.mov(c_l0, x_l1) + tile.mov(y_l1, a_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, y_l1) # y = y @ y + + pto.store(c_l0, sv_out) + + return tri_inv_trick_fp16 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--matrix-size", + type=int, + choices=SUPPORTED_MATRIX_SIZES, + default=64, + help="Compile-time specialized dense matrix size.", + ) + args = parser.parse_args() + module = build_kernel(args.matrix_size) + print(module) diff --git a/examples/aot/fast_inverse/basic_dense/run_inverse.py b/examples/aot/fast_inverse/basic_dense/run_inverse.py new file mode 100644 index 00000000..13b51ff6 --- /dev/null +++ b/examples/aot/fast_inverse/basic_dense/run_inverse.py @@ -0,0 +1,155 @@ +import argparse +import ctypes +import math +import random +import warnings + +import numpy as np +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +random.seed(42) +torch.manual_seed(42) +np.random.seed(42) + +SUPPORTED_MATRIX_SIZES = (16, 32, 64, 128) +try: + PERSISTENT_BLOCK_DIM = int(torch.npu.get_device_properties("npu").cube_core_num) +except Exception: + PERSISTENT_BLOCK_DIM = 24 + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path): + lib = ctypes.CDLL(lib_path) + 
lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim (fixed core count) + ctypes.c_void_p, # stream + ctypes.c_void_p, # out + ctypes.c_void_p, # in_delta (M - I) + ctypes.c_void_p, # identity_neg + ctypes.c_uint32, # runtime batch_size + ctypes.c_uint32, # log2(matrix_size) + ] + lib.call_kernel.restype = None + return lib + + +def dense_stable_matrix(n, batch, scale=0.02): + eye = np.eye(n, dtype=np.float32) + noise = np.random.uniform(-1.0, 1.0, size=(batch, n, n)).astype(np.float32) + out = eye[None, :, :] + scale * noise + return torch.from_numpy(out) + + +def run_kernel(lib, inp): + inp_fp16 = inp.to(torch.float16).contiguous() + n = int(inp_fp16.shape[-1]) + batch = int(inp_fp16.shape[0]) + log2_blocksize = int(math.log2(n)) + + identity = torch.eye(n, dtype=torch.float16, device=inp_fp16.device) + in_delta = (inp_fp16 - identity).contiguous() + identity_neg = (-identity).contiguous() + out = torch.zeros_like(inp_fp16, dtype=torch.float32, device=inp_fp16.device) + + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + PERSISTENT_BLOCK_DIM, + stream_ptr, + torch_to_ctypes(out), + torch_to_ctypes(in_delta), + torch_to_ctypes(identity_neg), + batch, + log2_blocksize, + ) + torch.npu.synchronize() + return out + + +def reference_inverse(inp): + inp_cpu = inp.cpu().numpy().astype(np.float64) + inv_ref = np.linalg.inv(inp_cpu) + return torch.from_numpy(inv_ref) + + +def check_case(lib, n, batch, atol, rtol, ftol): + inp = dense_stable_matrix(n=n, batch=batch).to(device) + ref = reference_inverse(inp).to(torch.float64) + out = run_kernel(lib, inp).cpu().to(torch.float64) + + frob_error = torch.sqrt(torch.sum((ref - out) ** 2) / torch.sum(ref**2)) + allclose_ok = np.allclose(out.numpy(), ref.numpy(), atol=atol, rtol=rtol) + frob_ok = bool(frob_error <= ftol) + + nan_count = int(torch.isnan(out).sum().item()) + inf_count = int(torch.isinf(out).sum().item()) + + if allclose_ok and frob_ok: + print(f"[pass] n={n}, batch={batch}, 
frob={float(frob_error):.3e}") + return None + + msg = ( + f"[fail] n={n}, batch={batch}, frob={float(frob_error):.3e}, " + f"nan={nan_count}, inf={inf_count}" + ) + print(msg) + return msg + + +def run_test(lib, n, batch_list): + failures = [] + for batch in batch_list: + failure = check_case( + lib, + n=n, + batch=batch, + atol=6e-3, + rtol=5e-2, + ftol=8e-3, + ) + if failure is not None: + failures.append(failure) + + total = len(batch_list) + print( + f"summary: n={n}, pass={total - len(failures)}, fail={len(failures)}, total={total}" + ) + if failures: + warnings.warn( + f"{len(failures)} cases failed. First: {failures[0]}", + stacklevel=2, + ) + return failures + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--matrix-size", + type=int, + choices=SUPPORTED_MATRIX_SIZES, + default=64, + help="Only validate this dense matrix size n.", + ) + parser.add_argument( + "--lib-path", + type=str, + default="./inverse_lib.so", + help="Shared library path produced by compile.sh.", + ) + args = parser.parse_args() + + device = get_test_device() + torch.npu.set_device(device) + batch_list = [1, 8, 24, 27, 48, 96, 99, 135] + + print(f"\n=== validating kernel: {args.lib_path} ===") + lib = load_lib(args.lib_path) + failures = run_test(lib, n=args.matrix_size, batch_list=batch_list) + print(f"\nfinished tests for n={args.matrix_size}, failures={len(failures)}.") diff --git a/examples/aot/fast_inverse/block_inversion/README.md b/examples/aot/fast_inverse/block_inversion/README.md new file mode 100644 index 00000000..8cf5f345 --- /dev/null +++ b/examples/aot/fast_inverse/block_inversion/README.md @@ -0,0 +1,20 @@ +```bash +bash compile.sh # default matrix size 64 +python run_inverse.py + +bash compile.sh 128 # another supported matrix size +python run_inverse.py --matrix-size 128 --lib-path ./inverse_lib.so +``` + +This demo implements one-level 2x2 block inversion for `inv(I + A)` with input shape +`[batch, n, n]`: + +- `A` is 
interpreted as block-lower-triangular: + `[[A11, 0], [A21, A22]]`, with `A11/A22` size `n/2`. +- `inv(I + A11)` and `inv(I + A22)` are computed by the same fast recurrence used in + the `basic_dense` / `block_diag` demos. +- `A21` block is recovered by `-inv(I + A22) @ A21 @ inv(I + A11)`. + +`run_inverse.py` includes: +- correctness checks on structured random / ill-conditioned generators +- a precision report line in the note style: `c= | error = ...` diff --git a/examples/aot/fast_inverse/block_inversion/caller.cpp b/examples/aot/fast_inverse/block_inversion/caller.cpp new file mode 100644 index 00000000..5ee9b5ca --- /dev/null +++ b/examples/aot/fast_inverse/block_inversion/caller.cpp @@ -0,0 +1,24 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "inverse.cpp" +#endif + +#ifndef KERNEL_FN +#define KERNEL_FN tri_inv_block2x2_fp16 +#endif + +#include KERNEL_CPP + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *tensor_out, + uint8_t *tensor_in, + uint8_t *identity_in, + uint32_t log2_blocksize) +{ + KERNEL_FN<<>>( + reinterpret_cast(tensor_out), + reinterpret_cast(tensor_in), + reinterpret_cast(identity_in), + static_cast(log2_blocksize)); +} diff --git a/examples/aot/fast_inverse/block_inversion/compile.sh b/examples/aot/fast_inverse/block_inversion/compile.sh new file mode 100644 index 00000000..4d60f8c9 --- /dev/null +++ b/examples/aot/fast_inverse/block_inversion/compile.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -euo pipefail + +ARTIFACT_DIR="./build_artifacts" +MATRIX_SIZE="${1:-64}" + +mkdir -p "${ARTIFACT_DIR}" +rm -f "${ARTIFACT_DIR}/inverse.pto" "${ARTIFACT_DIR}/inverse.cpp" inverse_lib.so + +python ./inverse_builder.py \ + --matrix-size "${MATRIX_SIZE}" \ + > "${ARTIFACT_DIR}/inverse.pto" + +ptoas --enable-insert-sync "${ARTIFACT_DIR}/inverse.pto" -o "${ARTIFACT_DIR}/inverse.cpp" + +PTO_LIB_PATH=/sources/pto-isa +# PTO_LIB_PATH=$ASCEND_TOOLKIT_HOME + +bisheng \ + -I${PTO_LIB_PATH}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 
-O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"${ARTIFACT_DIR}/inverse.cpp\"" \ + ./caller.cpp \ + -o ./inverse_lib.so diff --git a/examples/aot/fast_inverse/block_inversion/inverse_builder.py b/examples/aot/fast_inverse/block_inversion/inverse_builder.py new file mode 100644 index 00000000..1aa4ec4a --- /dev/null +++ b/examples/aot/fast_inverse/block_inversion/inverse_builder.py @@ -0,0 +1,235 @@ +# pyright: reportUndefinedVariable=false +import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const +SUPPORTED_MATRIX_SIZES = (16, 32, 64, 128) + + +def make_meta_data(n: int): + h = n // 2 + + def meta_data(): + in_dtype = pto.float16 + out_dtype = pto.float32 + i32 = pto.int32 + + in_ptr_type = pto.PtrType(in_dtype) + out_ptr_type = pto.PtrType(out_dtype) + in_tensor_type = pto.TensorType(rank=2, dtype=in_dtype) + out_tensor_type = pto.TensorType(rank=2, dtype=out_dtype) + + in_subtensor_h = pto.SubTensorType(shape=[h, h], dtype=in_dtype) + out_subtensor_h = pto.SubTensorType(shape=[h, h], dtype=out_dtype) + + l1_tile_type = pto.TileBufType( + shape=[h, h], valid_shape=[h, h], dtype=in_dtype, memory_space="MAT" + ) + l0a_tile_type = pto.TileBufType( + shape=[h, h], valid_shape=[h, h], dtype=in_dtype, memory_space="LEFT" + ) + l0b_tile_type = pto.TileBufType( + shape=[h, h], valid_shape=[h, h], dtype=in_dtype, memory_space="RIGHT" + ) + l0c_tile_type = pto.TileBufType( + shape=[h, h], valid_shape=[h, h], dtype=out_dtype, memory_space="ACC" + ) + + return { + "in_ptr_type": in_ptr_type, + "out_ptr_type": out_ptr_type, + "i32": i32, + 
"in_tensor_type": in_tensor_type, + "out_tensor_type": out_tensor_type, + "in_subtensor_h": in_subtensor_h, + "out_subtensor_h": out_subtensor_h, + "l1_tile_type": l1_tile_type, + "l0a_tile_type": l0a_tile_type, + "l0b_tile_type": l0b_tile_type, + "l0c_tile_type": l0c_tile_type, + } + + return meta_data + + +def build_kernel(matrix_size: int): + assert matrix_size % 2 == 0 and matrix_size >= 16 + + @to_ir_module(meta_data=make_meta_data(matrix_size)) + def tri_inv_block2x2_fp16( + out_ptr: "out_ptr_type", + in_ptr: "in_ptr_type", + i_neg_ptr: "in_ptr_type", + log2_blocksize_i32: "i32", + ) -> None: + with pto.cube_section(): + c0 = const(0) + c1 = const(1) + n_c = const(matrix_size) + h_c = const(matrix_size // 2) + + log2_half = s.index_cast(log2_blocksize_i32) - c1 + block_idx = s.index_cast(pto.get_block_idx()) + num_blocks = s.index_cast(pto.get_block_num()) + + total_rows = num_blocks * n_c + row_offset = block_idx * n_c + row_offset_h = row_offset + h_c + + tv_in = pto.as_tensor( + in_tensor_type, ptr=in_ptr, shape=[total_rows, n_c], strides=[n_c, c1] + ) + tv_out = pto.as_tensor( + out_tensor_type, ptr=out_ptr, shape=[total_rows, n_c], strides=[n_c, c1] + ) + tv_i_neg = pto.as_tensor( + in_tensor_type, ptr=i_neg_ptr, shape=[h_c, h_c], strides=[h_c, c1] + ) + sv_i_neg = pto.slice_view( + in_subtensor_h, source=tv_i_neg, offsets=[c0, c0], sizes=[h_c, h_c] + ) + + sv_a11 = pto.slice_view( + in_subtensor_h, source=tv_in, offsets=[row_offset, c0], sizes=[h_c, h_c] + ) + sv_a21 = pto.slice_view( + in_subtensor_h, + source=tv_in, + offsets=[row_offset_h, c0], + sizes=[h_c, h_c], + ) + sv_a22 = pto.slice_view( + in_subtensor_h, + source=tv_in, + offsets=[row_offset_h, h_c], + sizes=[h_c, h_c], + ) + + sv_out11 = pto.slice_view( + out_subtensor_h, + source=tv_out, + offsets=[row_offset, c0], + sizes=[h_c, h_c], + ) + sv_out21 = pto.slice_view( + out_subtensor_h, + source=tv_out, + offsets=[row_offset_h, c0], + sizes=[h_c, h_c], + ) + sv_out22 = pto.slice_view( + 
out_subtensor_h, + source=tv_out, + offsets=[row_offset_h, h_c], + sizes=[h_c, h_c], + ) + + x11_l1 = pto.alloc_tile(l1_tile_type) + y11_l1 = pto.alloc_tile(l1_tile_type) + x22_l1 = pto.alloc_tile(l1_tile_type) + y22_l1 = pto.alloc_tile(l1_tile_type) + a21_l1 = pto.alloc_tile(l1_tile_type) + neg_i_l1 = pto.alloc_tile(l1_tile_type) + pos_i_l1 = pto.alloc_tile(l1_tile_type) + tmp_l1 = pto.alloc_tile(l1_tile_type) + + a_l0 = pto.alloc_tile(l0a_tile_type) + b_l0 = pto.alloc_tile(l0b_tile_type) + c_l0 = pto.alloc_tile(l0c_tile_type) + + # Build +/- identity tiles for half-size blocks. + # Also seed x11 = x22 = I for the recurrence below. + pto.load(sv_i_neg, neg_i_l1) + tile.mov(neg_i_l1, a_l0) + tile.mov(neg_i_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, pos_i_l1) + tile.mov(c_l0, x11_l1) # x11 = I + tile.mov(c_l0, x22_l1) # x22 = I + + # Invert (I + A11): start the recurrence with y11 = -A11, x11 = I. + # The loop then computes x_{k+1} = x_k(I + y_k), y_{k+1} = y_k^2 + # which gives (I + A11)^{-1} after log2_half steps. + pto.load(sv_a11, y11_l1) + tile.mov(y11_l1, a_l0) + tile.mov(neg_i_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) # c = -A11 + tile.mov(c_l0, y11_l1) # y11 = -A11 + + for iter_idx in pto.range(c0, log2_half, c1): + tile.mov(x11_l1, a_l0) + tile.mov(pos_i_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + + tile.mov(y11_l1, b_l0) + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) + + with pto.if_context(iter_idx + c1 < log2_half): + tile.mov(c_l0, x11_l1) + tile.mov(y11_l1, a_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, y11_l1) + + tile.mov(c_l0, x11_l1) + pto.store(c_l0, sv_out11) + + # Invert (I + A22): start with y22 = -A22, x22 = I (already set above). 
+ pto.load(sv_a22, y22_l1) + tile.mov(y22_l1, a_l0) + tile.mov(neg_i_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) # c = -A22 + tile.mov(c_l0, y22_l1) # y22 = -A22 + + for iter_idx in pto.range(c0, log2_half, c1): + tile.mov(x22_l1, a_l0) + tile.mov(pos_i_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + + tile.mov(y22_l1, b_l0) + tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) + + with pto.if_context(iter_idx + c1 < log2_half): + tile.mov(c_l0, x22_l1) + tile.mov(y22_l1, a_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, y22_l1) + + tile.mov(c_l0, x22_l1) + pto.store(c_l0, sv_out22) + + # A21 term in block inversion: + # X21 = - X22 @ A21 @ X11 + pto.load(sv_a21, a21_l1) + + tile.mov(x22_l1, a_l0) + tile.mov(a21_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, tmp_l1) + + tile.mov(tmp_l1, a_l0) + tile.mov(x11_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + tile.mov(c_l0, tmp_l1) + + tile.mov(neg_i_l1, a_l0) + tile.mov(tmp_l1, b_l0) + tile.matmul(a_l0, b_l0, c_l0) + pto.store(c_l0, sv_out21) + + return tri_inv_block2x2_fp16 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--matrix-size", + type=int, + choices=SUPPORTED_MATRIX_SIZES, + default=64, + help="Compile-time specialized matrix size.", + ) + args = parser.parse_args() + module = build_kernel(args.matrix_size) + print(module) diff --git a/examples/aot/fast_inverse/block_inversion/run_inverse.py b/examples/aot/fast_inverse/block_inversion/run_inverse.py new file mode 100644 index 00000000..85551c08 --- /dev/null +++ b/examples/aot/fast_inverse/block_inversion/run_inverse.py @@ -0,0 +1,215 @@ +import argparse +import ctypes +import math +import random +import warnings + +import numpy as np +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +random.seed(42) +torch.manual_seed(42) +np.random.seed(42) + +SUPPORTED_MATRIX_SIZES = (16, 32, 64, 128) +UNIFORM_ATOL = 1e-3 +UNIFORM_RTOL = 1e-3 +UNIFORM_FTOL = 1e-3 + + +def 
torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim (batch) + ctypes.c_void_p, # stream + ctypes.c_void_p, # out + ctypes.c_void_p, # in_delta + ctypes.c_void_p, # identity_neg_half + ctypes.c_uint32, # log2(matrix_size) + ] + lib.call_kernel.restype = None + return lib + + +def ill_matrix(n, batch, offdiag=0.5): + out = np.zeros((batch, n, n), dtype=np.float32) + for b in range(batch): + out[b] = offdiag * np.tril(np.ones((n, n), dtype=np.float32), k=-1) + return torch.from_numpy(out) + + +def structured_random_matrix(n, batch, scale=0.1): + h = n // 2 + out = np.zeros((batch, n, n), dtype=np.float32) + for b in range(batch): + a11 = scale * np.tril( + np.random.uniform(-1.0, 1.0, size=(h, h)).astype(np.float32), k=-1 + ) + a22 = scale * np.tril( + np.random.uniform(-1.0, 1.0, size=(h, h)).astype(np.float32), k=-1 + ) + a21 = scale * np.random.uniform(-1.0, 1.0, size=(h, h)).astype(np.float32) + out[b, :h, :h] = a11 + out[b, h:, h:] = a22 + out[b, h:, :h] = a21 + return torch.from_numpy(out) + + +def structured_scale_by_n(n): + # Keep larger matrices closer to identity so the trend follows the note: + # medium sizes are very accurate, while the hardest ill-conditioned cases + # degrade only at larger n. + return { + 16: 0.10, + 32: 0.08, + 64: 0.05, + 128: 0.03, + }[n] + + +def ill_offdiag_for_tests(n): + # Use a smaller scale for bigger sizes. 
+ return { + 16: 0.2, + 32: 0.1, + 64: 0.05, + 128: 0.02, + }[n] + + +def run_kernel(lib, inp_delta): + inp_fp16 = inp_delta.to(torch.float16).contiguous() + n = int(inp_fp16.shape[-1]) + batch = int(inp_fp16.shape[0]) + h = n // 2 + log2_blocksize = int(math.log2(n)) + + identity_neg_half = torch.zeros((h, h), dtype=torch.float16, device=inp_fp16.device) + identity_neg_half.fill_diagonal_(-1) + out = torch.zeros((batch, n, n), dtype=torch.float32, device=inp_fp16.device) + + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + batch, + stream_ptr, + torch_to_ctypes(out), + torch_to_ctypes(inp_fp16), + torch_to_ctypes(identity_neg_half), + log2_blocksize, + ) + torch.npu.synchronize() + return out + + +def reference_inverse(inp_delta): + n = inp_delta.shape[-1] + identity = np.eye(n, dtype=np.float64) + inp_cpu = inp_delta.cpu().numpy().astype(np.float64) + return torch.from_numpy(np.linalg.inv(inp_cpu + identity)) + + +def check_case(lib, matrix_gen, n, batch, atol, rtol, ftol): + inp_delta = matrix_gen(n=n, batch=batch).to(device) + ref = reference_inverse(inp_delta).to(torch.float64) + out = run_kernel(lib, inp_delta).cpu().to(torch.float64) + + frob_error = torch.sqrt(torch.sum((ref - out) ** 2) / torch.sum(ref**2)) + allclose_ok = np.allclose(out.numpy(), ref.numpy(), atol=atol, rtol=rtol) + frob_ok = bool(frob_error <= ftol) + + nan_count = int(torch.isnan(out).sum().item()) + inf_count = int(torch.isinf(out).sum().item()) + + if allclose_ok and frob_ok: + print(f"[pass] n={n}, batch={batch}, frob={float(frob_error):.3e}") + return None + + msg = ( + f"[fail] n={n}, batch={batch}, frob={float(frob_error):.3e}, " + f"nan={nan_count}, inf={inf_count}" + ) + print(msg) + return msg + + +def run_test(lib, n): + failures = [] + structured_scale = structured_scale_by_n(n) + ill_offdiag = ill_offdiag_for_tests(n) + atol, rtol, ftol = UNIFORM_ATOL, UNIFORM_RTOL, UNIFORM_FTOL + structured_batches = [1, 4, 16, 24, 27, 48, 96, 99, 135] + 
ill_batches = [1, 4, 27] + + for batch in structured_batches: + failure = check_case( + lib, + matrix_gen=lambda n, batch: structured_random_matrix( + n=n, batch=batch, scale=structured_scale + ), + n=n, + batch=batch, + atol=atol, + rtol=rtol, + ftol=ftol, + ) + if failure is not None: + failures.append(failure) + + for batch in ill_batches: + failure = check_case( + lib, + matrix_gen=lambda n, batch: ill_matrix( + n=n, batch=batch, offdiag=ill_offdiag + ), + n=n, + batch=batch, + atol=atol, + rtol=rtol, + ftol=ftol, + ) + if failure is not None: + failures.append(failure) + + total_cases = len(structured_batches) + len(ill_batches) + print( + f"summary: n={n}, pass={total_cases - len(failures)}, " + f"fail={len(failures)}, total={total_cases}" + ) + + if failures: + warnings.warn( + f"{len(failures)} cases failed. First: {failures[0]}", + stacklevel=2, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--matrix-size", + type=int, + choices=SUPPORTED_MATRIX_SIZES, + default=64, + help="Only validate this matrix size n.", + ) + parser.add_argument( + "--lib-path", + type=str, + default="./inverse_lib.so", + help="Shared library path produced by compile.sh.", + ) + args = parser.parse_args() + + device = get_test_device() + torch.npu.set_device(device) + + kernel_lib = load_lib(args.lib_path) + run_test(kernel_lib, n=args.matrix_size) + print(f"Finished tests for n={args.matrix_size} with {args.lib_path}.") diff --git a/examples/aot/fast_inverse/caller.cpp b/examples/aot/fast_inverse/caller.cpp deleted file mode 100644 index fde1b237..00000000 --- a/examples/aot/fast_inverse/caller.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#ifndef KERNEL_CPP_16 -#define KERNEL_CPP_16 "inverse_auto_sync_16.cpp" -#endif -#ifndef KERNEL_CPP_32 -#define KERNEL_CPP_32 "inverse_auto_sync_32.cpp" -#endif -#ifndef KERNEL_CPP_64 -#define KERNEL_CPP_64 "inverse_auto_sync_64.cpp" -#endif -#ifndef KERNEL_CPP_96 -#define KERNEL_CPP_96 
"inverse_auto_sync_96.cpp" -#endif -#ifndef KERNEL_CPP_128 -#define KERNEL_CPP_128 "inverse_auto_sync_128.cpp" -#endif - -#ifndef KERNEL_FN_16 -#define KERNEL_FN_16 tri_inv_trick_fp16_16 -#endif -#ifndef KERNEL_FN_32 -#define KERNEL_FN_32 tri_inv_trick_fp16_32 -#endif -#ifndef KERNEL_FN_64 -#define KERNEL_FN_64 tri_inv_trick_fp16_64 -#endif -#ifndef KERNEL_FN_96 -#define KERNEL_FN_96 tri_inv_trick_fp16_96 -#endif -#ifndef KERNEL_FN_128 -#define KERNEL_FN_128 tri_inv_trick_fp16_128 -#endif - -#include KERNEL_CPP_16 -#include KERNEL_CPP_32 -#include KERNEL_CPP_64 -#include KERNEL_CPP_96 -#include KERNEL_CPP_128 - -extern "C" void call_kernel( - uint32_t blockDim, - void *stream, - uint8_t *tensor_out, - uint8_t *tensor_in, - uint8_t *identity_in, - uint32_t matrix_size, - uint32_t max_block_size) -{ - switch (matrix_size) { - case 16: - KERNEL_FN_16<<>>( - reinterpret_cast(tensor_out), - reinterpret_cast(tensor_in), - reinterpret_cast(identity_in), - static_cast(matrix_size), - static_cast(max_block_size)); - break; - case 32: - KERNEL_FN_32<<>>( - reinterpret_cast(tensor_out), - reinterpret_cast(tensor_in), - reinterpret_cast(identity_in), - static_cast(matrix_size), - static_cast(max_block_size)); - break; - case 64: - KERNEL_FN_64<<>>( - reinterpret_cast(tensor_out), - reinterpret_cast(tensor_in), - reinterpret_cast(identity_in), - static_cast(matrix_size), - static_cast(max_block_size)); - break; - case 96: - KERNEL_FN_96<<>>( - reinterpret_cast(tensor_out), - reinterpret_cast(tensor_in), - reinterpret_cast(identity_in), - static_cast(matrix_size), - static_cast(max_block_size)); - break; - case 128: - KERNEL_FN_128<<>>( - reinterpret_cast(tensor_out), - reinterpret_cast(tensor_in), - reinterpret_cast(identity_in), - static_cast(matrix_size), - static_cast(max_block_size)); - break; - default: - break; - } -} diff --git a/examples/aot/fast_inverse/compile.sh b/examples/aot/fast_inverse/compile.sh deleted file mode 100644 index d6a65003..00000000 --- 
a/examples/aot/fast_inverse/compile.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ARTIFACT_DIR="./build_artifacts" - -mkdir -p "${ARTIFACT_DIR}" -rm -f \ - "${ARTIFACT_DIR}"/inverse_auto_sync_*.pto "${ARTIFACT_DIR}"/inverse_manual_sync_*.pto \ - "${ARTIFACT_DIR}"/inverse_auto_sync_*.cpp "${ARTIFACT_DIR}"/inverse_manual_sync_*.cpp \ - inverse_auto_sync_lib.so inverse_manual_sync_lib.so - -SIZES=(16 32 64 96 128) - -# Auto-sync path: rely on ptoas synchronization insertion. -for size in "${SIZES[@]}"; do - python ./inverse_builder.py \ - --matrix-size "${size}" \ - --kernel-name "tri_inv_trick_fp16_${size}" \ - > "${ARTIFACT_DIR}/inverse_auto_sync_${size}.pto" - ptoas --enable-insert-sync "${ARTIFACT_DIR}/inverse_auto_sync_${size}.pto" -o "${ARTIFACT_DIR}/inverse_auto_sync_${size}.cpp" -done - -# Manual-sync path: explicit record/wait events from builder. -for size in "${SIZES[@]}"; do - python ./inverse_builder.py \ - --manual-sync \ - --matrix-size "${size}" \ - --kernel-name "tri_inv_trick_fp16_${size}" \ - > "${ARTIFACT_DIR}/inverse_manual_sync_${size}.pto" - ptoas "${ARTIFACT_DIR}/inverse_manual_sync_${size}.pto" -o "${ARTIFACT_DIR}/inverse_manual_sync_${size}.cpp" -done - -PTO_LIB_PATH=/sources/pto-isa -# PTO_LIB_PATH=$ASCEND_TOOLKIT_HOME - -bisheng \ - -I${PTO_LIB_PATH}/include \ - -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ - -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ - -xcce -Xhost-start -Xhost-end \ - -mllvm -cce-aicore-stack-size=0x8000 \ - -mllvm -cce-aicore-function-stack-size=0x8000 \ - -mllvm -cce-aicore-record-overflow=true \ - -mllvm -cce-aicore-addr-transform \ - -mllvm -cce-aicore-dcci-insert-for-scalar=false \ - --npu-arch=dav-2201 -DMEMORY_BASE \ - -std=gnu++17 \ - -DKERNEL_CPP_16="\"${ARTIFACT_DIR}/inverse_auto_sync_16.cpp\"" \ - -DKERNEL_CPP_32="\"${ARTIFACT_DIR}/inverse_auto_sync_32.cpp\"" \ - -DKERNEL_CPP_64="\"${ARTIFACT_DIR}/inverse_auto_sync_64.cpp\"" \ - 
-DKERNEL_CPP_96="\"${ARTIFACT_DIR}/inverse_auto_sync_96.cpp\"" \ - -DKERNEL_CPP_128="\"${ARTIFACT_DIR}/inverse_auto_sync_128.cpp\"" \ - ./caller.cpp \ - -o ./inverse_auto_sync_lib.so - -bisheng \ - -I${PTO_LIB_PATH}/include \ - -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ - -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ - -xcce -Xhost-start -Xhost-end \ - -mllvm -cce-aicore-stack-size=0x8000 \ - -mllvm -cce-aicore-function-stack-size=0x8000 \ - -mllvm -cce-aicore-record-overflow=true \ - -mllvm -cce-aicore-addr-transform \ - -mllvm -cce-aicore-dcci-insert-for-scalar=false \ - --npu-arch=dav-2201 -DMEMORY_BASE \ - -std=gnu++17 \ - -DKERNEL_CPP_16="\"${ARTIFACT_DIR}/inverse_manual_sync_16.cpp\"" \ - -DKERNEL_CPP_32="\"${ARTIFACT_DIR}/inverse_manual_sync_32.cpp\"" \ - -DKERNEL_CPP_64="\"${ARTIFACT_DIR}/inverse_manual_sync_64.cpp\"" \ - -DKERNEL_CPP_96="\"${ARTIFACT_DIR}/inverse_manual_sync_96.cpp\"" \ - -DKERNEL_CPP_128="\"${ARTIFACT_DIR}/inverse_manual_sync_128.cpp\"" \ - ./caller.cpp \ - -o ./inverse_manual_sync_lib.so diff --git a/examples/aot/fast_inverse/inverse_builder.py b/examples/aot/fast_inverse/inverse_builder.py deleted file mode 100644 index 8ec0df20..00000000 --- a/examples/aot/fast_inverse/inverse_builder.py +++ /dev/null @@ -1,370 +0,0 @@ -# pyright: reportUndefinedVariable=false -import argparse - -from ptodsl import pto, tile, to_ir_module -from ptodsl import scalar as s - -const = s.const -SUPPORTED_MATRIX_SIZES = (16, 32, 64, 96, 128) - - -def make_meta_data(matrix_size: int): - def meta_data(): - # Match the hand-written kernel: - # - MAT/LEFT/RIGHT tiles are fp16 - # - ACC and global output are fp32 - # This enables legal TMOV Acc(fp32) -> Mat(fp16) lowering. 
- in_dtype = pto.float16 - out_dtype = pto.float32 - i32 = pto.int32 - - in_ptr_type = pto.PtrType(in_dtype) - out_ptr_type = pto.PtrType(out_dtype) - - in_tensor_type = pto.TensorType(rank=2, dtype=in_dtype) - out_tensor_type = pto.TensorType(rank=2, dtype=out_dtype) - in_subtensor = pto.SubTensorType( - shape=[matrix_size, matrix_size], dtype=in_dtype - ) - out_subtensor = pto.SubTensorType( - shape=[matrix_size, matrix_size], dtype=out_dtype - ) - l1_tile_type = pto.TileBufType( - shape=[matrix_size, matrix_size], - valid_shape=[matrix_size, matrix_size], - dtype=in_dtype, - memory_space="MAT", - ) - l0a_tile_type = pto.TileBufType( - shape=[matrix_size, matrix_size], - valid_shape=[matrix_size, matrix_size], - dtype=in_dtype, - memory_space="LEFT", - ) - l0b_tile_type = pto.TileBufType( - shape=[matrix_size, matrix_size], - valid_shape=[matrix_size, matrix_size], - dtype=in_dtype, - memory_space="RIGHT", - ) - l0c_tile_type = pto.TileBufType( - shape=[matrix_size, matrix_size], - valid_shape=[matrix_size, matrix_size], - dtype=out_dtype, - memory_space="ACC", - ) - - return { - "in_ptr_type": in_ptr_type, - "out_ptr_type": out_ptr_type, - "i32": i32, - "in_tensor_type": in_tensor_type, - "out_tensor_type": out_tensor_type, - "in_subtensor": in_subtensor, - "out_subtensor": out_subtensor, - "l1_tile_type": l1_tile_type, - "l0a_tile_type": l0a_tile_type, - "l0b_tile_type": l0b_tile_type, - "l0c_tile_type": l0c_tile_type, - } - - return meta_data - - -def build_kernel_autosync(matrix_size: int, kernel_name: str): - def tri_inv_trick_fp16_autosync( - out_ptr: "out_ptr_type", - in_ptr: "in_ptr_type", - i_neg_ptr: "in_ptr_type", - matrix_size_i32: "i32", - max_block_size_i32: "i32", - ) -> None: - with pto.cube_section(): - c0 = const(0) - c1 = const(1) - c2 = const(2) - c4 = const(4) - c8 = const(8) - c16 = const(16) - c32 = const(32) - matrix_size_c = const(matrix_size) - - max_block_size = s.index_cast(max_block_size_i32) - block_idx = 
s.index_cast(pto.get_block_idx()) - num_blocks = s.index_cast(pto.get_block_num()) - - total_rows = num_blocks * matrix_size_c - row_offset = block_idx * matrix_size_c - - # Keep the runtime signature unchanged while emitting - # compile-time-specialized tile/subtensor types. - _ = matrix_size_i32 - - tv_m = pto.as_tensor( - in_tensor_type, - ptr=in_ptr, - shape=[total_rows, matrix_size_c], - strides=[matrix_size_c, c1], - ) - tv_out = pto.as_tensor( - out_tensor_type, - ptr=out_ptr, - shape=[total_rows, matrix_size_c], - strides=[matrix_size_c, c1], - ) - tv_i_neg = pto.as_tensor( - in_tensor_type, - ptr=i_neg_ptr, - shape=[matrix_size_c, matrix_size_c], - strides=[matrix_size_c, c1], - ) - - sv_m = pto.slice_view( - in_subtensor, - source=tv_m, - offsets=[row_offset, c0], - sizes=[matrix_size_c, matrix_size_c], - ) - sv_i_neg = pto.slice_view( - in_subtensor, - source=tv_i_neg, - offsets=[c0, c0], - sizes=[matrix_size_c, matrix_size_c], - ) - sv_out = pto.slice_view( - out_subtensor, - source=tv_out, - offsets=[row_offset, c0], - sizes=[matrix_size_c, matrix_size_c], - ) - - x_l1 = pto.alloc_tile(l1_tile_type) - y_l1 = pto.alloc_tile(l1_tile_type) - i_l1 = pto.alloc_tile(l1_tile_type) - a_l0 = pto.alloc_tile(l0a_tile_type) - b_l0 = pto.alloc_tile(l0b_tile_type) - c_l0 = pto.alloc_tile(l0c_tile_type) - - pto.load(sv_m, y_l1) - pto.load(sv_i_neg, x_l1) - - tile.mov(y_l1, a_l0) - - tile.mov(y_l1, b_l0) - - tile.matmul(a_l0, b_l0, c_l0) - tile.mov(c_l0, y_l1) - - tile.mov(x_l1, b_l0) - tile.matmul(a_l0, b_l0, c_l0) - - tile.mov(x_l1, a_l0) - tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) - tile.mov(c_l0, x_l1) - - tile.matmul(a_l0, b_l0, c_l0) - tile.mov(c_l0, i_l1) - - def run_iteration(iter_i): - tile.mov(x_l1, a_l0) - tile.mov(i_l1, b_l0) - tile.matmul(a_l0, b_l0, c_l0) - - tile.mov(y_l1, b_l0) - tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) - - with pto.if_context(iter_i < (max_block_size // c2)): - tile.mov(c_l0, x_l1) - tile.mov(y_l1, a_l0) - tile.matmul(a_l0, b_l0, c_l0) - 
tile.mov(c_l0, y_l1) - - # Mirror C++ `for (i = 1; i < max_block_size; i *= 2)`. - # TODO: simplify this code logic - for loop_i in (c1, c2, c4, c8, c16, c32): - # here only considers max_block_size up to 64 - with pto.if_context(loop_i < max_block_size): - run_iteration(loop_i) - - pto.store(c_l0, sv_out) - - tri_inv_trick_fp16_autosync.__name__ = kernel_name - return to_ir_module(meta_data=make_meta_data(matrix_size))( - tri_inv_trick_fp16_autosync - ) - - -def build_kernel_manualsync(matrix_size: int, kernel_name: str): - def tri_inv_trick_fp16_manualsync( - out_ptr: "out_ptr_type", - in_ptr: "in_ptr_type", - i_neg_ptr: "in_ptr_type", - matrix_size_i32: "i32", - max_block_size_i32: "i32", - ) -> None: - with pto.cube_section(): - c0 = const(0) - c1 = const(1) - c2 = const(2) - c4 = const(4) - c8 = const(8) - c16 = const(16) - c32 = const(32) - matrix_size_c = const(matrix_size) - - max_block_size = s.index_cast(max_block_size_i32) - block_idx = s.index_cast(pto.get_block_idx()) - num_blocks = s.index_cast(pto.get_block_num()) - - total_rows = num_blocks * matrix_size_c - row_offset = block_idx * matrix_size_c - - # Keep the runtime signature unchanged while emitting - # compile-time-specialized tile/subtensor types. 
- _ = matrix_size_i32 - - tv_m = pto.as_tensor( - in_tensor_type, - ptr=in_ptr, - shape=[total_rows, matrix_size_c], - strides=[matrix_size_c, c1], - ) - tv_out = pto.as_tensor( - out_tensor_type, - ptr=out_ptr, - shape=[total_rows, matrix_size_c], - strides=[matrix_size_c, c1], - ) - tv_i_neg = pto.as_tensor( - in_tensor_type, - ptr=i_neg_ptr, - shape=[matrix_size_c, matrix_size_c], - strides=[matrix_size_c, c1], - ) - - sv_m = pto.slice_view( - in_subtensor, - source=tv_m, - offsets=[row_offset, c0], - sizes=[matrix_size_c, matrix_size_c], - ) - sv_i_neg = pto.slice_view( - in_subtensor, - source=tv_i_neg, - offsets=[c0, c0], - sizes=[matrix_size_c, matrix_size_c], - ) - sv_out = pto.slice_view( - out_subtensor, - source=tv_out, - offsets=[row_offset, c0], - sizes=[matrix_size_c, matrix_size_c], - ) - - x_l1 = pto.alloc_tile(l1_tile_type) - y_l1 = pto.alloc_tile(l1_tile_type) - i_l1 = pto.alloc_tile(l1_tile_type) - a_l0 = pto.alloc_tile(l0a_tile_type) - b_l0 = pto.alloc_tile(l0b_tile_type) - c_l0 = pto.alloc_tile(l0c_tile_type) - - pto.load(sv_m, y_l1) - pto.load(sv_i_neg, x_l1) - pto.record_wait_pair("LOAD", "MOV_M2L", event_id=0) - - tile.mov(y_l1, a_l0) - pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - - tile.mov(y_l1, b_l0) - pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - - tile.matmul(a_l0, b_l0, c_l0) - pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) - tile.mov(c_l0, y_l1) - pto.record_wait_pair("MOV_V2M", "MOV_M2L", event_id=0) - - tile.mov(x_l1, b_l0) - pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - tile.matmul(a_l0, b_l0, c_l0) - pto.record_wait_pair("MATMUL", "MOV_M2L", event_id=0) - - tile.mov(x_l1, a_l0) - pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) - pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) - tile.mov(c_l0, x_l1) - pto.record_wait_pair("MOV_V2M", "MATMUL", event_id=0) - - tile.matmul(a_l0, b_l0, c_l0) - pto.record_wait_pair("MATMUL", "MOV_V2M", 
event_id=0) - tile.mov(c_l0, i_l1) - pto.record_wait_pair("MOV_V2M", "MOV_M2L", event_id=0) - - def run_iteration(iter_i): - tile.mov(x_l1, a_l0) - tile.mov(i_l1, b_l0) - pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - tile.matmul(a_l0, b_l0, c_l0) - pto.record_wait_pair("MATMUL", "MOV_M2L", event_id=0) - - tile.mov(y_l1, b_l0) - pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - tile.matmul_acc(c_l0, a_l0, b_l0, c_l0) - - with pto.if_context(iter_i < (max_block_size // c2)): - pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) - tile.mov(c_l0, x_l1) - pto.record_wait_pair("MOV_V2M", "MOV_M2L", event_id=0) - tile.mov(y_l1, a_l0) - pto.record_wait_pair("MOV_M2L", "MATMUL", event_id=0) - tile.matmul(a_l0, b_l0, c_l0) - pto.record_wait_pair("MATMUL", "MOV_V2M", event_id=0) - tile.mov(c_l0, y_l1) - pto.record_wait_pair("MOV_V2M", "MOV_M2L", event_id=0) - - # Mirror C++ `for (i = 1; i < max_block_size; i *= 2)`. - # TODO: simplify this code logic - for loop_i in (c1, c2, c4, c8, c16, c32): - # here only considers max_block_size up to 64 - with pto.if_context(loop_i < max_block_size): - run_iteration(loop_i) - - pto.record_wait_pair("MATMUL", "STORE_ACC", event_id=0) - pto.store(c_l0, sv_out) - - tri_inv_trick_fp16_manualsync.__name__ = kernel_name - return to_ir_module(meta_data=make_meta_data(matrix_size))( - tri_inv_trick_fp16_manualsync - ) - - -def build_kernel(manual_sync: bool, matrix_size: int, kernel_name: str): - if manual_sync: - return build_kernel_manualsync(matrix_size, kernel_name) - return build_kernel_autosync(matrix_size, kernel_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--manual-sync", - action="store_true", - help="Emit explicit record/wait events instead of relying on --enable-insert-sync.", - ) - parser.add_argument( - "--matrix-size", - type=int, - choices=SUPPORTED_MATRIX_SIZES, - default=128, - help="Compile-time specialized matrix size.", - ) - parser.add_argument( - 
"--kernel-name", - type=str, - default=None, - help="Kernel symbol name in emitted module.", - ) - args = parser.parse_args() - kernel_name = args.kernel_name or f"tri_inv_trick_fp16_{args.matrix_size}" - module = build_kernel(args.manual_sync, args.matrix_size, kernel_name) - print(module) diff --git a/examples/aot/fast_inverse/run_inverse.py b/examples/aot/fast_inverse/run_inverse.py deleted file mode 100644 index 7aef879d..00000000 --- a/examples/aot/fast_inverse/run_inverse.py +++ /dev/null @@ -1,220 +0,0 @@ -import argparse -import ctypes -import random -import warnings -from typing import Callable - -import numpy as np -import torch -import torch_npu # noqa: F401 - -from ptodsl.test_util import get_test_device - -random.seed(42) -torch.manual_seed(42) -np.random.seed(42) - - -def torch_to_ctypes(tensor): - return ctypes.c_void_p(tensor.data_ptr()) - - -def load_lib(lib_path): - lib = ctypes.CDLL(lib_path) - lib.call_kernel.argtypes = [ - ctypes.c_uint32, # blockDim - ctypes.c_void_p, # stream - ctypes.c_void_p, # out - ctypes.c_void_p, # in - ctypes.c_void_p, # identity_neg - ctypes.c_uint32, # matrix_size - ctypes.c_uint32, # block_size for block-diag-matrices - ] - lib.call_kernel.restype = None - return lib - - -def random_matrix(n, block_dim_x, block_dim_y, scale=0.01): - # TODO: this data generator is not used yet - return scale * torch.rand((block_dim_x, block_dim_y, n, n)) - - -def blockdiag_ones_matrix(n, block_dim_x, block_dim_y, block_size=16): - block = np.ones((block_size, block_size)) - n_blocks = n // block_size - out = np.zeros((block_dim_x, block_dim_y, n, n)) - for x in range(block_dim_x): - for y in range(block_dim_y): - for i in range(n_blocks): - start = i * block_size - end = start + block_size - out[x, y, start:end, start:end] = block - return torch.from_numpy(np.triu(out, 1)) - - -def blockdiag_random_matrix(n, block_dim_x, block_dim_y, block_size=16): - if block_size == 16: - scale = 0.2 - elif block_size == 32: - scale = 0.05 - elif 
block_size == 64: - scale = 0.01 - else: - raise ValueError("block_size must be 16/32/64") - block = scale * np.random.rand(block_size, block_size) - block = np.triu(block, k=1) - out = np.zeros((block_dim_x, block_dim_y, n, n)) - for x in range(block_dim_x): - for y in range(block_dim_y): - for i in range(0, n, block_size): - out[x, y, i : i + block_size, i : i + block_size] = block.copy() - return torch.from_numpy(out) - - -def run_kernel(lib, inp, blockdiag_size=16): - inp_fp16 = inp.to(torch.float16).contiguous() - n = int(inp_fp16.shape[-1]) - block_dim = int(inp_fp16.shape[0] * inp_fp16.shape[1]) - - # Run true matrix sizes directly (e.g., 32x32, 64x64) without padding. - run_n = n - inp_run = inp_fp16 - - out = torch.zeros_like(inp_run, dtype=torch.float32, device=inp_run.device) - identity_neg = torch.zeros( - (run_n, run_n), dtype=torch.float16, device=inp_run.device - ) - identity_neg.fill_diagonal_(-1) - - stream_ptr = torch.npu.current_stream()._as_parameter_ - lib.call_kernel( - block_dim, - stream_ptr, - torch_to_ctypes(out), - torch_to_ctypes(inp_run), - torch_to_ctypes(identity_neg), - run_n, - blockdiag_size, - ) - torch.npu.synchronize() - return out - - -def reference_inverse(inp): - n = inp.shape[-1] - identity = np.eye(n, dtype=np.double) - golden = np.zeros(inp.shape, dtype=np.double) - inp_cpu = inp.cpu() - for x in range(inp.shape[0]): - for y in range(inp.shape[1]): - golden[x, y] = np.linalg.inv( - inp_cpu[x, y].numpy().astype(np.double) + identity - ) - return torch.from_numpy(golden) - - -def check_case( - lib, matrix_gen: Callable, atol: float, rtol: float, ftol: float, blockdiag_size=16 -): - if blockdiag_size == 16: - n_list = [16, 32, 64, 96, 128] - elif blockdiag_size == 32: - n_list = [32, 64, 128] - elif blockdiag_size == 64: - n_list = [64, 128] - else: - raise ValueError("blockdiag_size must be 16/32/64") - block_dim_x_list = [1, 3, 7, 16] - block_dim_y_list = [1, 2, 4, 16] - failures = [] - passes = 0 - for n in n_list: - for 
block_dim_x in block_dim_x_list: - for block_dim_y in block_dim_y_list: - inp = matrix_gen( - n, block_dim_x, block_dim_y, block_size=blockdiag_size - ).to(device) - ref = reference_inverse(inp).to(torch.float64) - out = run_kernel(lib, inp).cpu().to(torch.float64) - - frob_error = torch.sqrt( - torch.sum((ref - out) * (ref - out)) / torch.sum(ref * ref) - ) - - nan_count = int(torch.isnan(out).sum().item()) - inf_count = int(torch.isinf(out).sum().item()) - - allclose_ok = np.allclose( - out.numpy(), ref.numpy(), atol=atol, rtol=rtol - ) - frob_ok = bool(frob_error <= ftol) - if allclose_ok and frob_ok: - passes += 1 - print( - f"[pass] n={n}, bx={block_dim_x}, by={block_dim_y}, " - f"frob={float(frob_error):.3e}" - ) - else: - msg = ( - f"[fail] n={n}, bx={block_dim_x}, by={block_dim_y}, " - f"frob={float(frob_error):.3e}, nan={nan_count}, inf={inf_count}" - ) - print(msg) - failures.append(msg) - - total = len(n_list) * len(block_dim_x_list) * len(block_dim_y_list) - print(f"summary: pass={passes}, fail={len(failures)}, total={total}") - return failures - - -def run_test(lib): - failures = [] - for blockdiag_size in (16,): - failures.extend( - check_case( - lib, - blockdiag_ones_matrix, - atol=0.0, - rtol=0.0, - ftol=0.0, - blockdiag_size=blockdiag_size, - ) - ) - for blockdiag_size in (16, 32, 64): - failures.extend( - check_case( - lib, - blockdiag_random_matrix, - atol=5e-5, - rtol=0.1, - ftol=1.2e-4, - blockdiag_size=blockdiag_size, - ) - ) - if failures: - warnings.warn( - f"{len(failures)} cases failed. 
First: {failures[0]}", - stacklevel=2, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--manual-sync", - action="store_true", - help="Use manual-sync library instead of the default auto-sync library.", - ) - args = parser.parse_args() - - lib_path = ( - "./inverse_manual_sync_lib.so" - if args.manual_sync - else "./inverse_auto_sync_lib.so" - ) - device = get_test_device() - torch.npu.set_device(device) - - kernel_lib = load_lib(lib_path) - run_test(kernel_lib) - print(f"All tests passed for {lib_path}.") diff --git a/examples/aot/matmul_optimization_guide/experimental/.gitignore b/examples/aot/matmul_optimization_guide/experimental/.gitignore index 03567fc4..7b55c120 100644 --- a/examples/aot/matmul_optimization_guide/experimental/.gitignore +++ b/examples/aot/matmul_optimization_guide/experimental/.gitignore @@ -1 +1,3 @@ outputs +matmul.cpp +matmul.pto diff --git a/examples/aot/print_tile/.gitignore b/examples/aot/print_tile/.gitignore new file mode 100644 index 00000000..1c66f912 --- /dev/null +++ b/examples/aot/print_tile/.gitignore @@ -0,0 +1 @@ +print_gen.cpp From 4526d42da49d4ee3d0c439786133fa2a6a18d3be Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:11:37 +0100 Subject: [PATCH 47/53] remove old ignore --- examples/aot/fast_inverse/.gitignore | 1 - 1 file changed, 1 deletion(-) delete mode 100644 examples/aot/fast_inverse/.gitignore diff --git a/examples/aot/fast_inverse/.gitignore b/examples/aot/fast_inverse/.gitignore deleted file mode 100644 index 2672482f..00000000 --- a/examples/aot/fast_inverse/.gitignore +++ /dev/null @@ -1 +0,0 @@ -build_artifacts From 201ffbb758a1f4f547c12d158dc9d49f04e8b9bc Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Fri, 20 Mar 2026 14:13:38 +0100 Subject: [PATCH 48/53] Add example translation collection check to CI (#94) * Add example translation collection 
check to CI * fix ci path * update ptoas version in CI --------- Co-authored-by: jiawei_zhuang --- .../scripts/collect_example_translate.py | 9 ++++++++- .github/workflows/ci.yml | 13 ++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.agent/skills/translate_cpp2py/scripts/collect_example_translate.py b/.agent/skills/translate_cpp2py/scripts/collect_example_translate.py index a65e4a55..ee273c70 100644 --- a/.agent/skills/translate_cpp2py/scripts/collect_example_translate.py +++ b/.agent/skills/translate_cpp2py/scripts/collect_example_translate.py @@ -133,6 +133,13 @@ def main() -> int: found = len(example_list) results: list[dict[str, str]] = [] + def display_path(path: Path) -> str: + try: + return str(path.relative_to(repo_root)) + except ValueError: + # In CI we may intentionally write outside repo root (e.g. /tmp). + return str(path) + for idx, example in enumerate(example_list, start=1): rel_dir = Path(str(example["example_dir"])) example_dir = aot_dir / rel_dir @@ -260,7 +267,7 @@ def main() -> int: { "name": example_name, "status": "OK", - "reason": f"collected to {dst.relative_to(repo_root)}", + "reason": f"collected to {display_path(dst)}", } ) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index be69c3fa..491c52ef 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,8 +46,9 @@ jobs: image: quay.io/ascend/cann:8.5.0-910b-ubuntu22.04-py3.11 env: - RELEASE_REPO: huawei-csl/PTOAS - RELEASE_TAG: 20260309 + RELEASE_REPO: zhangstevenunity/PTOAS + RELEASE_VER: 0.9 + RELEASE_TAG: v0.9 CLI_DIR: /installers/ptoas-cli PTOISA_COMMIT: 672ee54cb8905bb9f9abbe80ec26ed2054b7a0cc @@ -66,7 +67,7 @@ jobs: - name: Install ptoas wheel run: | - WHEEL_NAME=ptoas-0.1.1-cp311-none-manylinux_2_34_${{ matrix.arch }}.whl + WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${{ matrix.arch }}.whl wget https://github.com/${RELEASE_REPO}/releases/download/${RELEASE_TAG}/${WHEEL_NAME} pip install ./${WHEEL_NAME} 
python -c "import mlir.ir; from mlir.dialects import pto" @@ -103,3 +104,9 @@ jobs: pytest -v -m "not require_npu" ./tests/npu env: TORCH_DEVICE_BACKEND_AUTOLOAD: "0" + + - name: Run example translation collection check + run: | + python ./.agent/skills/translate_cpp2py/scripts/collect_example_translate.py \ + --repo-root . \ + --out-dir /tmp/example_translation From 1479a8f4c2b56594f442070e99afe50a3951431f Mon Sep 17 00:00:00 2001 From: Mirko De Vita <61700769+MirkoDeVita98@users.noreply.github.com> Date: Thu, 26 Mar 2026 10:11:25 +0100 Subject: [PATCH 49/53] mrgsort and sort32 dynamic multicore test + TopK semidynamic example (#96) * mrgsort and sort32 dynamic multicore tests and semidynamic topk example * updated README for topk example --------- Co-authored-by: mirkodevita --- examples/aot/topk/.gitignore | 4 + examples/aot/topk/README.md | 77 +++++ examples/aot/topk/caller.py | 48 +++ examples/aot/topk/compile.sh | 43 +++ examples/aot/topk/run_topk.py | 161 ++++++++++ examples/aot/topk/topk_builder.py | 288 ++++++++++++++++++ ptodsl/api/scalar.py | 2 + ptodsl/api/tile.py | 19 +- ptodsl/api/type_def.py | 3 +- .../npu/mrgsort_dynamic_multicore/builder.py | 160 ++++++++++ tests/npu/mrgsort_dynamic_multicore/caller.py | 37 +++ .../npu/mrgsort_dynamic_multicore/compile.sh | 34 +++ tests/npu/mrgsort_dynamic_multicore/gen_ir.py | 32 ++ .../mrgsort_dynamic_multicore/test_mrgsort.py | 194 ++++++++++++ tests/npu/sort32_dynamic_multicore/builder.py | 194 ++++++++++++ tests/npu/sort32_dynamic_multicore/caller.py | 36 +++ tests/npu/sort32_dynamic_multicore/compile.sh | 34 +++ tests/npu/sort32_dynamic_multicore/gen_ir.py | 28 ++ .../sort32_dynamic_multicore/test_tsort32.py | 146 +++++++++ 19 files changed, 1538 insertions(+), 2 deletions(-) create mode 100644 examples/aot/topk/.gitignore create mode 100644 examples/aot/topk/README.md create mode 100644 examples/aot/topk/caller.py create mode 100755 examples/aot/topk/compile.sh create mode 100644 
examples/aot/topk/run_topk.py create mode 100644 examples/aot/topk/topk_builder.py create mode 100644 tests/npu/mrgsort_dynamic_multicore/builder.py create mode 100644 tests/npu/mrgsort_dynamic_multicore/caller.py create mode 100755 tests/npu/mrgsort_dynamic_multicore/compile.sh create mode 100644 tests/npu/mrgsort_dynamic_multicore/gen_ir.py create mode 100644 tests/npu/mrgsort_dynamic_multicore/test_mrgsort.py create mode 100644 tests/npu/sort32_dynamic_multicore/builder.py create mode 100644 tests/npu/sort32_dynamic_multicore/caller.py create mode 100755 tests/npu/sort32_dynamic_multicore/compile.sh create mode 100644 tests/npu/sort32_dynamic_multicore/gen_ir.py create mode 100644 tests/npu/sort32_dynamic_multicore/test_tsort32.py diff --git a/examples/aot/topk/.gitignore b/examples/aot/topk/.gitignore new file mode 100644 index 00000000..e0e1a224 --- /dev/null +++ b/examples/aot/topk/.gitignore @@ -0,0 +1,4 @@ +caller.cpp +topk_float32.pto +topk_float32.cpp +topk_float32_lib.so diff --git a/examples/aot/topk/README.md b/examples/aot/topk/README.md new file mode 100644 index 00000000..b99e49bd --- /dev/null +++ b/examples/aot/topk/README.md @@ -0,0 +1,77 @@ +# TopK (AOT, dynamic n_rows, float32) + +Finds the top-K largest elements per row of a 2-D `[N_ROWS × N_COLS]` float32 +matrix using a TSORT32 → TMRGSORT → TGATHER pipeline on the NPU vector engine. + +`N_ROWS` is a **runtime** argument — a single compiled `.so` handles any row +count without recompilation. `N_COLS`, `TOPK`, and `BLOCK_DIM` are +compile-time constants because they govern tile buffer sizes and the number of +merge-sort passes, which must be statically known by the hardware. 
+ +## Parameters + +| Symbol | Kind | Default | Meaning | +|------------------|:------------:|--------:|--------------------------------------| +| `N_ROWS` | **runtime** | any | rows in the input matrix | +| `N_COLS` | compile-time | 512 | input elements per row | +| `TOPK` | compile-time | 256 | top-k output count per row | +| `BLOCK_DIM` | compile-time | 24 | number of NPU compute blocks | +| `SORT_BLOCK_LEN` | compile-time | 32 | TSORT32 sorts in blocks of this many | + +Valid `N_COLS` values (with `SORT_BLOCK_LEN=32`): + +| `N_COLS` | `SORT_COLS` | Merge passes | +|---------:|------------:|:------------:| +| 128 | 256 | 1 | +| 512 | 1024 | 2 | +| 2048 | 4096 | 3 | + +## Pipeline (per row) + +``` +input row [1 x N_COLS] --> TSORT32 --> sort buffer [1 x 2*N_COLS] + (interleaved score/idx pairs) + TMRGSORT x passes --> fully sorted [1 x 2*N_COLS] + TMOV (gather window, valid=[1 x 2*TOPK]) + TGATHER P0101 --> tb_scores [1 x TOPK] float32 + TGATHER P1010 --> tb_indices [1 x TOPK] uint32 +``` + +The gather-window tile has `valid_shape=[1, 2*TOPK]`, which limits TGATHER +to exactly `TOPK` outputs even when `TOPK < N_COLS`. 
+ +## Usage + +Compile all configs and validate all 11 test cases: + +```bash +python ./run_topk.py +``` + +To compile a single config manually or skip recompilation: + +```text +# compile one config: N_COLS TOPK BLOCK_DIM +bash ./compile.sh 512 256 24 + +# skip recompilation if .so files already exist +python ./run_topk.py --no-compile +``` + +## Files + +| File | Purpose | +|-------------------|------------------------------------------------------------| +| `topk_builder.py` | PTO-DSL builder – emits MLIR for a given `(N_COLS, TOPK)` | +| `caller.py` | Generates `caller.cpp` with `int32_t n_rows` at call time | +| `compile.sh` | End-to-end build: PTO → MLIR → C++ → `.so` | +| `run_topk.py` | Validates 11 configs against `torch.topk` | + +Generated build artifacts (gitignored): + +| Artifact | Created by | +|----------------------------------|--------------| +| `caller.cpp` | `compile.sh` | +| `topk_c<N_COLS>_k<TOPK>.pto` | `compile.sh` | +| `topk_c<N_COLS>_k<TOPK>.cpp` | `compile.sh` | +| `topk_c<N_COLS>_k<TOPK>_lib.so` | `compile.sh` | diff --git a/examples/aot/topk/caller.py b/examples/aot/topk/caller.py new file mode 100644 index 00000000..7d0f4f12 --- /dev/null +++ b/examples/aot/topk/caller.py @@ -0,0 +1,48 @@ +"""Generate caller.cpp for a given TopK kernel function name. + +The generated file wraps the NPU kernel launch in an ``extern "C"`` function +that can be called from Python via ctypes. + +n_rows is passed at call time as an ``int32_t`` so the same shared library +handles any row count without recompilation. + +Usage +----- + python caller.py topk_c512_k256 + python caller.py topk_c128_k64 --block-dim 24 > caller.cpp +""" + +import argparse + +_DEFAULT_BLOCK_DIM = 24 + + +def generate(fn: str, block_dim: int = _DEFAULT_BLOCK_DIM) -> str: + return f"""\ +// Auto-generated by caller.py – do not edit by hand.
+#include "{fn}.cpp" + +extern "C" void call_{fn}( + void *stream, + uint8_t *src, + uint8_t *inIdx, + uint8_t *out_scores, + uint8_t *out_indices, + int32_t n_rows) +{{ + {fn}<<<{block_dim}, nullptr, stream>>>( + reinterpret_cast(src), + reinterpret_cast(inIdx), + reinterpret_cast(out_scores), + reinterpret_cast(out_indices), + n_rows); +}} +""" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("fn_name", help="e.g. topk_c512_k256") + parser.add_argument("--block-dim", type=int, default=_DEFAULT_BLOCK_DIM) + args = parser.parse_args() + print(generate(args.fn_name, args.block_dim), end="") diff --git a/examples/aot/topk/compile.sh b/examples/aot/topk/compile.sh new file mode 100755 index 00000000..d1ae0db4 --- /dev/null +++ b/examples/aot/topk/compile.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Compile one TopK kernel config into a shared library. +# +# Usage: bash compile.sh [N_COLS] [TOPK] [BLOCK_DIM] +# Defaults: 512 256 24 +# +# N_ROWS is a runtime argument – the same library handles any row count. 
+set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +N_COLS=${1:-512} +TOPK=${2:-256} +BLOCK_DIM=${3:-24} + +FN="topk_c${N_COLS}_k${TOPK}" + +TMP=$(mktemp -d) +trap "rm -rf $TMP" EXIT + +python "$SCRIPT_DIR/topk_builder.py" \ + --n-cols "$N_COLS" --topk "$TOPK" --block-dim "$BLOCK_DIM" \ + > "$TMP/${FN}.pto" +ptoas --enable-insert-sync "$TMP/${FN}.pto" -o "$TMP/${FN}.cpp" + +python "$SCRIPT_DIR/caller.py" "$FN" --block-dim "$BLOCK_DIM" > "$TMP/caller.cpp" + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + "$TMP/caller.cpp" \ + -o "$SCRIPT_DIR/${FN}_lib.so" + +echo "Built ${FN}_lib.so successfully." diff --git a/examples/aot/topk/run_topk.py b/examples/aot/topk/run_topk.py new file mode 100644 index 00000000..edf7e83c --- /dev/null +++ b/examples/aot/topk/run_topk.py @@ -0,0 +1,161 @@ +""" +Run and validate the TopK AOT kernel for multiple configurations. + +The kernel is **dynamic** in n_rows: a single compiled .so handles any row +count. Configs that share the same (n_cols, topk) pair reuse the same library. 
+ +Usage: + python ./run_topk.py # compile + run all configs + python ./run_topk.py --no-compile # skip recompilation (libs already built) + +Valid N_COLS values (SORT_BLOCK_LEN=32) +--------------------------------------- + SORT_COLS = N_COLS*2 must be a power-of-4 multiple of HW_BLOCK_LEN=64: + N_COLS = 128 → 1 merge pass + N_COLS = 512 → 2 merge passes + N_COLS = 2048 → 3 merge passes +""" + +import argparse +import ctypes +import os +import subprocess + +import torch +import torch_npu + +from ptodsl.test_util import get_test_device +from topk_builder import fn_name + +_DIR = os.path.dirname(os.path.abspath(__file__)) + +# ── test configurations ─────────────────────────────────────────────────────── +# (n_rows, n_cols, topk, description) +# n_rows can be any positive integer – divisibility by BLOCK_DIM is NOT required. +# Configs sharing the same (n_cols, topk) reuse the same compiled .so. +_CONFIGS = [ + # 1 merge pass – topk < n_cols + (24, 128, 64, "n_rows=24, 1 pass, topk str: + return os.path.join(_DIR, f"{fn_name(n_cols, topk)}_lib.so") + + +def _compile(n_cols: int, topk: int) -> None: + subprocess.check_call( + ["bash", os.path.join(_DIR, "compile.sh"), str(n_cols), str(topk)], + cwd=_DIR, + ) + + +def _load_fn(n_cols: int, topk: int): + lib = ctypes.CDLL(_lib_path(n_cols, topk)) + fn = getattr(lib, f"call_{fn_name(n_cols, topk)}") + fn.argtypes = [ + ctypes.c_void_p, # stream + ctypes.c_void_p, # src [n_rows, n_cols] float32 + ctypes.c_void_p, # inIdx [n_cols] uint32 + ctypes.c_void_p, # out_scores [n_rows, topk] float32 + ctypes.c_void_p, # out_indices [n_rows, topk] uint32 + ctypes.c_int32, # n_rows (runtime) + ] + fn.restype = None + return fn + + +def _ptr(t: torch.Tensor) -> ctypes.c_void_p: + return ctypes.c_void_p(t.data_ptr()) + + +def _run_one(device: str, n_rows: int, n_cols: int, topk: int, desc: str) -> None: + fn = _load_fn(n_cols, topk) + torch.manual_seed(0) + + src = torch.rand(n_rows, n_cols, dtype=torch.float32, device=device) + inidx 
= torch.arange(n_cols, dtype=torch.int32, device=device) + out_scores = torch.empty(n_rows, topk, dtype=torch.float32, device=device) + out_indices = torch.empty(n_rows, topk, dtype=torch.int32, device=device) + + stream_ptr = torch.npu.current_stream()._as_parameter_ + torch.npu.synchronize() + fn( + stream_ptr, + _ptr(src), + _ptr(inidx), + _ptr(out_scores), + _ptr(out_indices), + ctypes.c_int32(n_rows), + ) + torch.npu.synchronize() + + src_cpu = src.cpu() + + # 1. Scores must exactly match torch.topk (descending, sorted). + ref_vals, _ = torch.topk(src_cpu, topk, dim=-1, largest=True, sorted=True) + torch.testing.assert_close( + out_scores.cpu(), + ref_vals, + rtol=0, + atol=0, + msg=f"scores mismatch ({desc})", + ) + + # 2. Each returned index must point to the correct value in the source row. + # (Don't compare indices directly – hardware may break ties differently.) + gathered = torch.gather(src_cpu, 1, out_indices.cpu().to(torch.int64)) + torch.testing.assert_close( + gathered, + out_scores.cpu(), + rtol=0, + atol=0, + msg=f"index↔score mismatch ({desc})", + ) + + print(f" PASSED {n_rows:5d}×{n_cols:5d} → top-{topk:5d} [{desc}]") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--no-compile", + action="store_true", + help="skip recompilation (assume .so files already exist)", + ) + args = parser.parse_args() + + device = get_test_device() + torch.npu.set_device(device) + + print(f"Running {len(_CONFIGS)} TopK configs on {device}") + print("-" * 70) + + compiled: set = set() + for n_rows, n_cols, topk, desc in _CONFIGS: + if not args.no_compile and (n_cols, topk) not in compiled: + _compile(n_cols, topk) + compiled.add((n_cols, topk)) + _run_one(device, n_rows, n_cols, topk, desc) + + print("-" * 70) + print(f"All {len(_CONFIGS)} configs PASSED.") + + +if __name__ == "__main__": + main() diff --git a/examples/aot/topk/topk_builder.py b/examples/aot/topk/topk_builder.py new file mode 100644 index 00000000..18de316a --- 
/dev/null +++ b/examples/aot/topk/topk_builder.py @@ -0,0 +1,288 @@ +""" +TopK AOT kernel: for each row of an [N_ROWS × N_COLS] float32 matrix, find the +top-TOPK elements and return their values and original column indices. + +Pipeline (per row) +------------------ + 1. TSORT32 – sort within SORT_BLOCK_LEN-element blocks, writing interleaved + (score_f32, idx_u32) pairs to the sort buffer. + 2. TMRGSORT – multi-pass 4-way merge until the sort buffer is fully sorted + descending by score. Unrolled at builder time (static sizes). + 3. TMOV tb_sort → tb_gather_win (valid_shape=[1, 2*TOPK]). + 4. TGATHER P0101 on tb_gather_win – extract top-TOPK scores (even slots). + 5. TGATHER P1010 on tb_gather_win – extract top-TOPK indices (odd slots, + stored as uint32 bit-patterns in a float32 tile). + 6. TSTORE – write scores and indices to global memory. + +The gather-window tile has the same physical shape as the sort buffer but its +valid_shape is limited to [1, 2*TOPK]. This ensures TGATHER P0101/P1010 sees +exactly 2*TOPK elements and produces exactly TOPK outputs, even when TOPK < N_COLS +(without the window, P0101 on a sort_cols-element tile would produce N_COLS +outputs and overflow the TOPK-element destination tile). + +Constraints (verified by assertions in build_topk) +--------------------------------------------------- + * TOPK must be ≤ N_COLS. + * N_ROWS is unconstrained at compile time (any value works at runtime). + * HW_BLOCK_LEN (= SORT_BLOCK_LEN × DST_STRIDE) must be a multiple of 64. + * SORT_COLS (= N_COLS × DST_STRIDE) must be an exact power-of-4 multiple of + HW_BLOCK_LEN (guarantees a clean merge with no tail block). 
+ +Valid N_COLS values (SORT_BLOCK_LEN=32) +--------------------------------------- + SORT_COLS = N_COLS*2 must be a power-of-4 multiple of HW_BLOCK_LEN=64: + N_COLS = 128 → SORT_COLS = 256 (1 merge pass) + N_COLS = 512 → SORT_COLS = 1024 (2 merge passes) + N_COLS = 2048 → SORT_COLS = 4096 (3 merge passes) + +Usage +----- + python topk_builder.py # default: n_cols=512 topk=256 + python topk_builder.py --n-cols 128 --topk 64 +""" + +import argparse + +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +# float32: TSORT32 expands each input element to (score_f32, idx_u32) = 2 words. +_DST_STRIDE = 2 +_SORT_BLOCK_LEN = 32 # TSORT32 sorts within blocks of this many input elements + + +def fn_name(n_cols: int, topk: int) -> str: + """Unique kernel name (n_rows is dynamic and not encoded).""" + return f"topk_c{n_cols}_k{topk}" + + +def build_topk( + n_cols: int = 512, + topk: int = 256, + block_dim: int = 24, + sort_block_len: int = _SORT_BLOCK_LEN, +): + """Return a compiled MLIR module for the given compile-time TopK shape. + + n_rows is NOT a compile-time parameter – it is a runtime ``int32`` argument + (``argN``) passed at each invocation. The kernel uses ``s.ceil_div`` to + distribute rows across blocks and guards the last block with ``if_context``, + so any n_rows value is supported without recompilation. + """ + sort_cols = n_cols * _DST_STRIDE + hw_block_len = sort_block_len * _DST_STRIDE + + assert topk <= n_cols, f"topk={topk} must be ≤ n_cols={n_cols}" + assert ( + hw_block_len % 64 == 0 + ), f"hw_block_len={hw_block_len} must be a multiple of 64" + _blk = hw_block_len + while _blk * 4 <= sort_cols: + _blk *= 4 + assert _blk == sort_cols, ( + f"sort_cols={sort_cols} is not a power-of-4 multiple of hw_block_len={hw_block_len}; " + "tail merging is not implemented in this example." 
+ ) + + def _meta_data(): + f32 = pto.float32 + u32 = pto.uint32 + tile_cfg = pto.TileBufConfig() + return { + "ptr_f32": pto.PtrType(f32), + "ptr_u32": pto.PtrType(u32), + "index_dtype": pto.int32, + "tensor_src": pto.TensorType(rank=2, dtype=f32), + "tensor_inidx": pto.TensorType(rank=2, dtype=u32), + "tensor_scores": pto.TensorType(rank=2, dtype=f32), + "tensor_indices": pto.TensorType(rank=2, dtype=u32), + "sub_src": pto.SubTensorType(shape=[1, n_cols], dtype=f32), + "sub_inidx": pto.SubTensorType(shape=[1, n_cols], dtype=u32), + "sub_scores": pto.SubTensorType(shape=[1, topk], dtype=f32), + "sub_indices": pto.SubTensorType(shape=[1, topk], dtype=u32), + "tile_src": pto.TileBufType( + shape=[1, n_cols], + valid_shape=[1, n_cols], + dtype=f32, + memory_space="VEC", + config=tile_cfg, + ), + "tile_inidx": pto.TileBufType( + shape=[1, n_cols], + valid_shape=[1, n_cols], + dtype=u32, + memory_space="VEC", + config=tile_cfg, + ), + "tile_sort_f32": pto.TileBufType( + shape=[1, sort_cols], + valid_shape=[1, sort_cols], + dtype=f32, + memory_space="VEC", + config=tile_cfg, + ), + "tile_sort_u32": pto.TileBufType( + shape=[1, sort_cols], + valid_shape=[1, sort_cols], + dtype=u32, + memory_space="VEC", + config=tile_cfg, + ), + # Gather window: same physical shape as tile_sort, but valid_shape + # limited to [1, 2*topk] so TGATHER P0101/P1010 produces topk outputs. 
+ "tile_gather_win_f32": pto.TileBufType( + shape=[1, sort_cols], + valid_shape=[1, 2 * topk], + dtype=f32, + memory_space="VEC", + config=tile_cfg, + ), + "tile_gather_win_u32": pto.TileBufType( + shape=[1, sort_cols], + valid_shape=[1, 2 * topk], + dtype=u32, + memory_space="VEC", + config=tile_cfg, + ), + "tile_topk_f32": pto.TileBufType( + shape=[1, topk], + valid_shape=[1, topk], + dtype=f32, + memory_space="VEC", + config=tile_cfg, + ), + "tile_topk_u32": pto.TileBufType( + shape=[1, topk], + valid_shape=[1, topk], + dtype=u32, + memory_space="VEC", + config=tile_cfg, + ), + } + + def _kernel( + src_ptr: "ptr_f32", # [n_rows, n_cols] float32 – input scores + inidx_ptr: "ptr_u32", # [n_cols] uint32 – original column indices + scores_ptr: "ptr_f32", # [n_rows, topk] float32 – output top-k scores + indices_ptr: "ptr_u32", # [n_rows, topk] uint32 – output top-k indices + argN: "index_dtype", # n_rows (runtime) + ) -> None: + c0 = const(0) + c1 = const(1) + c_ncols = const(n_cols) + c_topk = const(topk) + c_bdim = const(block_dim) + + n_rows_dyn = s.index_cast(argN) + bid = s.index_cast(pto.get_block_idx()) + + # Distribute rows across blocks with ceil_div – works for any n_rows. 
+ rows_per_core = s.ceil_div(n_rows_dyn, c_bdim) + row_start = bid * rows_per_core + row_end_raw = row_start + rows_per_core + need_clamp = row_end_raw > n_rows_dyn + rows_this_core = s.select(need_clamp, n_rows_dyn - row_start, rows_per_core) + + with pto.vector_section(): + tv_src = pto.as_tensor( + tensor_src, + ptr=src_ptr, + shape=[n_rows_dyn, c_ncols], + strides=[c_ncols, c1], + ) + tv_inidx = pto.as_tensor( + tensor_inidx, ptr=inidx_ptr, shape=[c1, c_ncols], strides=[c_ncols, c1] + ) + tv_scores = pto.as_tensor( + tensor_scores, + ptr=scores_ptr, + shape=[n_rows_dyn, c_topk], + strides=[c_topk, c1], + ) + tv_indices = pto.as_tensor( + tensor_indices, + ptr=indices_ptr, + shape=[n_rows_dyn, c_topk], + strides=[c_topk, c1], + ) + + tb_src = pto.alloc_tile(tile_src) + tb_inidx = pto.alloc_tile(tile_inidx) + tb_sort = pto.alloc_tile(tile_sort_f32) + tb_sort_tmp = pto.alloc_tile(tile_sort_f32) + tb_gather_win_f = pto.alloc_tile(tile_gather_win_f32) + tb_gather_win_u = pto.alloc_tile(tile_gather_win_u32) + tb_scores = pto.alloc_tile(tile_topk_f32) + tb_indices = pto.alloc_tile(tile_topk_u32) + + # Load shared column-index vector once per core. + sv_inidx = pto.slice_view( + sub_inidx, source=tv_inidx, offsets=[c0, c0], sizes=[c1, c_ncols] + ) + pto.load(sv_inidx, tb_inidx) + + # Guard: blocks beyond n_rows do nothing. + with pto.if_context(row_start < n_rows_dyn): + with pto.if_context(rows_this_core > c0): + for i in pto.range(c0, rows_this_core, c1): + row = i + row_start + + # 1. Load input row. + sv_src = pto.slice_view( + sub_src, + source=tv_src, + offsets=[row, c0], + sizes=[c1, c_ncols], + ) + pto.load(sv_src, tb_src) + + # 2. TSORT32: sort within sort_block_len-element blocks. + tile.sort32(tb_src, tb_sort, tb_inidx) + + # 3. Multi-pass TMRGSORT (unrolled at build time). + cur_block = hw_block_len + while cur_block * 4 <= sort_cols: + tile.mrgsort(tb_sort, tb_sort_tmp, const(cur_block)) + tile.mov(tb_sort_tmp, tb_sort) + cur_block *= 4 + + # 4. 
Copy into gather window (valid_shape=[1, 2*topk]). + tile.mov(tb_sort, tb_gather_win_f) + + # 5. Extract top-topk scores (even slots = score_f32). + tile.gather(tb_gather_win_f, tb_scores, mask_pattern="P0101") + + # 6. Extract top-topk indices (odd slots = idx_u32 bits). + tile.mov(tb_sort, tb_gather_win_u) + tile.gather(tb_gather_win_u, tb_indices, mask_pattern="P1010") + + # 7. Store outputs. + sv_scores = pto.slice_view( + sub_scores, + source=tv_scores, + offsets=[row, c0], + sizes=[c1, c_topk], + ) + pto.store(tb_scores, sv_scores) + + sv_indices = pto.slice_view( + sub_indices, + source=tv_indices, + offsets=[row, c0], + sizes=[c1, c_topk], + ) + pto.store(tb_indices, sv_indices) + + _kernel.__name__ = fn_name(n_cols, topk) + return to_ir_module(meta_data=_meta_data)(_kernel) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Print MLIR IR for a TopK kernel") + parser.add_argument("--n-cols", type=int, default=512) + parser.add_argument("--topk", type=int, default=256) + parser.add_argument("--block-dim", type=int, default=24) + args = parser.parse_args() + print(build_topk(n_cols=args.n_cols, topk=args.topk, block_dim=args.block_dim)) diff --git a/ptodsl/api/scalar.py b/ptodsl/api/scalar.py index 90938daa..7f4e9d54 100644 --- a/ptodsl/api/scalar.py +++ b/ptodsl/api/scalar.py @@ -95,6 +95,8 @@ def __getattr__(name): return IntegerType.get_signless(32) if name == "int16": return IntegerType.get_signless(16) + if name == "uint32": + return IntegerType.get_unsigned(32) raise AttributeError(f"module '{__name__}' has no attribute '{name}'") diff --git a/ptodsl/api/tile.py b/ptodsl/api/tile.py index 0526cd08..2cffe513 100644 --- a/ptodsl/api/tile.py +++ b/ptodsl/api/tile.py @@ -1,5 +1,6 @@ +from mlir.dialects import arith as _arith from mlir.dialects import pto as _pto -from mlir.ir import BoolAttr +from mlir.ir import BoolAttr, IntegerType from .scalar import _unwrap @@ -142,6 +143,20 @@ def col_expand(src, dst): 
_pto.TColExpandOp(src=src, dst=dst) +def mrgsort(src, dst, block_len): + i32 = IntegerType.get_signless(32) + block_len_i32 = _arith.IndexCastOp(i32, _unwrap(block_len)).result + _pto.TMrgSortOp(srcs=[src], dsts=[dst], blockLen=block_len_i32) + + +def sort32(src, dst, idx): + """TSORT32: sort src tile within 32-element blocks, writing interleaved + (score, index) pairs to dst. idx is an input tile of uint32 indices + attached to each src element. For float16 src, dst must have 4x the + columns of src (each element expands to 4 float16 words).""" + _pto.TSort32Op(src, dst, idx) + + def subset(source, offsets, sizes): offset_vals = [_unwrap(v) for v in offsets] return _pto.subset(source, offset_vals, sizes) @@ -183,5 +198,7 @@ def print(source): "col_max", "col_prod", "col_expand", + "mrgsort", + "sort32", "subset", ] diff --git a/ptodsl/api/type_def.py b/ptodsl/api/type_def.py index 4f66eebb..251303f6 100644 --- a/ptodsl/api/type_def.py +++ b/ptodsl/api/type_def.py @@ -6,7 +6,7 @@ def __getattr__(name): # MLIR type factories require an active context, so keep dtype aliases lazy # and resolve them only when user code accesses them inside PTO/MLIR setup. 
- if name in {"bool", "float16", "float32", "int16", "int32"}: + if name in {"bool", "float16", "float32", "int16", "int32", "uint32"}: return getattr(scalar, name) raise AttributeError(f"module '{__name__}' has no attribute '{name}'") @@ -108,4 +108,5 @@ def TileBufType(*, shape, dtype, memory_space, valid_shape=None, config=None): "float32", "int16", "int32", + "uint32", ] diff --git a/tests/npu/mrgsort_dynamic_multicore/builder.py b/tests/npu/mrgsort_dynamic_multicore/builder.py new file mode 100644 index 00000000..3e054ce4 --- /dev/null +++ b/tests/npu/mrgsort_dynamic_multicore/builder.py @@ -0,0 +1,160 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +DTYPES = { + "float32": lambda: pto.float32, + "float16": lambda: pto.float16, +} + +# TMRGSORT's blockLen parameter is in float32-word units: +# hw_block_len = block_len * (sizeof(float) / sizeof(T)) +_TYPE_COEF = {"float32": 1, "float16": 2} + + +def meta_data(dtype=None, tile_length=1024): + if dtype is None: + dtype = "float32" + if isinstance(dtype, str): + dtype = DTYPES[dtype]() + + index_dtype = pto.int32 + ptr_type = pto.PtrType(dtype) + # 2D tensor view: shape [num_tiles, tile_length], matching the expand_builder pattern. + tensor_type = pto.TensorType(rank=2, dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[1, tile_length], dtype=dtype) + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, tile_length], + valid_shape=[1, tile_length], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + return { + "ptr_type": ptr_type, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + "tile_length": tile_length, + } + + +def build_mrgsort_kernel( + fn_name="vec_mrgsort_1d_dynamic_float32", + dtype="float32", + tile_length=1024, + block_len=32, +): + """Build a 1D dynamic multicore merge-sort kernel. 
+ + Each tile of tile_length elements is treated as containing + tile_length // block_len pre-sorted sub-lists of block_len elements. + TMRGSORT merges groups of 4 sub-lists (block_len*4 elements) independently; + repeatTimes = tile_length // (block_len * 4) such groups per tile. + + The hardware blockLen passed to TMRGSORT is scaled by TYPE_COEF + (= sizeof(float) / sizeof(T)) per the instruction's float32-word semantics: + hw_block_len = block_len * TYPE_COEF + + Constraints (enforced by TMRGSORT): + - hw_block_len must be a multiple of 64 + - tile_length must be a multiple of hw_block_len * 4 + - repeatTimes = tile_length / (hw_block_len * 4) must be in [1, 255] + """ + dtype_str = dtype if isinstance(dtype, str) else "float32" + hw_block_len = block_len * _TYPE_COEF.get(dtype_str, 1) + _meta_data = lambda: meta_data(dtype=dtype, tile_length=tile_length) + + def _kernel( + arg0: "ptr_type", # src: input with sorted sub-lists + arg1: "ptr_type", # out: merged sorted output + argN: "index_dtype", # total number of elements (multiple of tile_length) + ) -> None: + assert tile_length % (hw_block_len * 4) == 0 + assert hw_block_len % 64 == 0 + c0 = const(0) + c1 = const(1) + c_tile = const(tile_length) + + total_elements = s.index_cast(argN) + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + vid = cid * sub_bnum + sub_bid + num_blocks = pto.get_block_num() + + vid_idx = s.index_cast(vid) + # Total virtual cores = num_blocks * subblock_num (matches add_dynamic_multicore). + num_cores = s.index_cast(num_blocks * sub_bnum) + + num_tiles_global = s.ceil_div(total_elements, c_tile) + num_tiles_per_core = s.ceil_div(num_tiles_global, num_cores) + tile_offset_this_core = vid_idx * num_tiles_per_core + + with pto.vector_section(): + # 2D tensor views: shape=[num_tiles, tile_length], strides=[tile_length, 1]. + # Mirrors the expand_builder layout where rows are tiles and columns are elements. 
+ tv0 = pto.as_tensor( + tensor_type, + ptr=arg0, + shape=[num_tiles_global, c_tile], + strides=[c_tile, c1], + ) + tv1 = pto.as_tensor( + tensor_type, + ptr=arg1, + shape=[num_tiles_global, c_tile], + strides=[c_tile, c1], + ) + + tb_src = pto.alloc_tile(tile_type) + tb_tmp = pto.alloc_tile(tile_type) + tb_dst = pto.alloc_tile(tile_type) + + with pto.if_context(tile_offset_this_core < num_tiles_global): + tiles_end_this_core = tile_offset_this_core + num_tiles_per_core + need_truncate = tiles_end_this_core > num_tiles_global + remaining_tiles = num_tiles_global - tile_offset_this_core + tiles_to_process = s.select( + need_truncate, remaining_tiles, num_tiles_per_core + ) + + with pto.if_context(tiles_to_process > c0): + for i in pto.range(c0, tiles_to_process, c1): + tile_idx = i + tile_offset_this_core + + sv0 = pto.slice_view( + subtensor_type, + source=tv0, + offsets=[tile_idx, c0], + sizes=[c1, c_tile], + ) + + pto.load(sv0, tb_src) + # Multi-pass merge sort: blockLen quadruples each pass (reference + # MrgsortSingleRow pattern). This loop is unrolled at code-gen + # time since all bounds are Python-level constants.
+ cur_block_len = hw_block_len + while cur_block_len * 4 <= tile_length: + tile.mrgsort(tb_src, tb_tmp, const(cur_block_len)) + tile.mov(tb_tmp, tb_src) + cur_block_len *= 4 + tile.mov(tb_src, tb_dst) + + sv1 = pto.slice_view( + subtensor_type, + source=tv1, + offsets=[tile_idx, c0], + sizes=[c1, c_tile], + ) + pto.store(tb_dst, sv1) + + _kernel.__name__ = fn_name + return to_ir_module(meta_data=_meta_data)(_kernel) + + +if __name__ == "__main__": + print(build_mrgsort_kernel(dtype="float32", tile_length=1024, block_len=64)) diff --git a/tests/npu/mrgsort_dynamic_multicore/caller.py b/tests/npu/mrgsort_dynamic_multicore/caller.py new file mode 100644 index 00000000..eadf4960 --- /dev/null +++ b/tests/npu/mrgsort_dynamic_multicore/caller.py @@ -0,0 +1,37 @@ +"""Generate caller.cpp for the dynamic multicore merge-sort kernel.""" + +import sys + +_DTYPE_TO_CTYPE = { + "float32": "float", + "float16": "half", +} + +_BLOCK_DIM = 24 + + +def fn_name(dtype): + return f"vec_mrgsort_1d_dynamic_{dtype}" + + +def generate_caller(dtype): + ctype = _DTYPE_TO_CTYPE[dtype] + fn = fn_name(dtype) + return f"""\ +#include "{fn}.cpp" + +extern "C" void call_{fn}( + void *stream, uint8_t *src, uint8_t *out, int32_t N) +{{ + {fn}<<<{_BLOCK_DIM}, nullptr, stream>>>( + ({ctype} *)src, ({ctype} *)out, (int32_t)N); +}} +""" + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python caller.py ", file=sys.stderr) + sys.exit(1) + dtype = sys.argv[1] + print(generate_caller(dtype)) diff --git a/tests/npu/mrgsort_dynamic_multicore/compile.sh b/tests/npu/mrgsort_dynamic_multicore/compile.sh new file mode 100755 index 00000000..ccb6aacb --- /dev/null +++ b/tests/npu/mrgsort_dynamic_multicore/compile.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +DTYPE=${1:?Usage: compile.sh } + +FN_NAME="vec_mrgsort_1d_dynamic_${DTYPE}" + +TMP=$(mktemp -d) +trap "rm -rf $TMP" EXIT + +python "$SCRIPT_DIR/gen_ir.py" "$DTYPE" > 
"$TMP/${FN_NAME}.pto" +ptoas --enable-insert-sync "$TMP/${FN_NAME}.pto" -o "$TMP/${FN_NAME}.cpp" + +python "$SCRIPT_DIR/caller.py" "$DTYPE" > "$TMP/caller.cpp" + +PTO_LIB_PATH=/sources/pto-isa +bisheng \ + -I${PTO_LIB_PATH}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + "$TMP/caller.cpp" \ + -o "$SCRIPT_DIR/${FN_NAME}_lib.so" + +echo "Built ${FN_NAME}_lib.so successfully." diff --git a/tests/npu/mrgsort_dynamic_multicore/gen_ir.py b/tests/npu/mrgsort_dynamic_multicore/gen_ir.py new file mode 100644 index 00000000..1326395d --- /dev/null +++ b/tests/npu/mrgsort_dynamic_multicore/gen_ir.py @@ -0,0 +1,32 @@ +"""Print MLIR IR for the dynamic multicore merge-sort kernel. 
+ +Usage: python gen_ir.py +""" + +import sys +import os + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from builder import build_mrgsort_kernel + +TILE_LENGTH = 1024 +BLOCK_LEN = 64 + + +def fn_name(dtype): + return f"vec_mrgsort_1d_dynamic_{dtype}" + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python gen_ir.py ", file=sys.stderr) + sys.exit(1) + dtype = sys.argv[1] + module = build_mrgsort_kernel( + fn_name=fn_name(dtype), + dtype=dtype, + tile_length=TILE_LENGTH, + block_len=BLOCK_LEN, + ) + print(module) diff --git a/tests/npu/mrgsort_dynamic_multicore/test_mrgsort.py b/tests/npu/mrgsort_dynamic_multicore/test_mrgsort.py new file mode 100644 index 00000000..3011e830 --- /dev/null +++ b/tests/npu/mrgsort_dynamic_multicore/test_mrgsort.py @@ -0,0 +1,194 @@ +import os +import ctypes +import subprocess + +import pytest +import torch +from ptodsl.test_util import get_test_device + +torch.manual_seed(0) + +_DIR = os.path.dirname(os.path.abspath(__file__)) +_DEVICE = get_test_device() + +# TMRGSORT single-list constraints (in terms of hw_block_len = BLOCK_LEN * TYPE_COEF): +# hw_block_len % 64 == 0 +# tile_length % (hw_block_len * 4) == 0 +# 1 <= tile_length // (hw_block_len * 4) <= 255 +# TYPE_COEF = sizeof(float) / sizeof(T): 1 for float32, 2 for float16. +TILE_LENGTH = 1024 +BLOCK_LEN = 64 +TYPE_COEFS = {"float32": 1, "float16": 2} + +# TMRGSORT operates on (float32, uint32) interleaved pairs for float16 tiles, +# not on plain float16 values. Sorting plain float16 with TMRGSORT requires +# a TSORT32 pre-pass (to produce the pair format) and a TGATHER post-pass +# (to extract values). The tests here cover the plain-value sort path only, +# which is supported for float32. 
+DTYPES = ["float32"] +SIZES = [1024, 2048, 3072, 4096, 8192, 16384] + +TORCH_DTYPES = { + "float32": torch.float32, + "float16": torch.float16, +} + +_DTYPE_PARAMS = [pytest.param(dtype, id=dtype) for dtype in DTYPES] +_SIZE_PARAMS = [pytest.param(N, id=f"N{N}") for N in SIZES] + + +def _fn_name(dtype: str) -> str: + return f"vec_mrgsort_1d_dynamic_{dtype}" + + +def _lib_path(dtype: str) -> str: + return os.path.join(_DIR, f"{_fn_name(dtype)}_lib.so") + + +def _ctypes_ptr(tensor: torch.Tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +@pytest.fixture(scope="session", params=_DTYPE_PARAMS) +def compiled_lib(request): + dtype = request.param + subprocess.check_call( + ["bash", os.path.join(_DIR, "compile.sh"), dtype], + cwd=_DIR, + ) + yield {"dtype": dtype} + libp = _lib_path(dtype) + if os.path.exists(libp): + os.remove(libp) + + +def _check_preconditions(N: int, block_len: int, tile_length: int): + assert block_len % 64 == 0, f"block_len must be multiple of 64, got {block_len}" + assert tile_length % (block_len * 4) == 0, ( + f"tile_length must be multiple of block_len*4, got " + f"tile_length={tile_length}, block_len={block_len}" + ) + repeat_times = tile_length // (block_len * 4) + assert ( + 1 <= repeat_times <= 255 + ), f"repeat_times must be in [1, 255], got {repeat_times}" + assert N % tile_length == 0, f"N must be a multiple of tile_length, got N={N}" + + +def _make_sorted_sublists( + N: int, + block_len: int, + device, + torch_dtype: torch.dtype, +) -> torch.Tensor: + """ + Create N values split into sorted descending sublists of length block_len. 
+ """ + assert N % block_len == 0 + data = torch.rand(N, dtype=torch.float32) + data = data.view(-1, block_len) + data = torch.sort(data, dim=1, descending=True).values + return data.reshape(-1).to(dtype=torch_dtype, device=device) + + +def _load_fn(dtype: str): + lib = ctypes.CDLL(_lib_path(dtype)) + fn = getattr(lib, f"call_{_fn_name(dtype)}") + fn.argtypes = [ + ctypes.c_void_p, # stream + ctypes.c_void_p, # src + ctypes.c_void_p, # out + ctypes.c_int32, # N + ] + fn.restype = None + return fn + + +def _run_kernel(fn, stream_ptr, src: torch.Tensor, N: int) -> torch.Tensor: + import torch_npu + + out = torch.empty_like(src) + torch.npu.synchronize() + fn(stream_ptr, _ctypes_ptr(src), _ctypes_ptr(out), ctypes.c_int32(N)) + torch.npu.synchronize() + return out + + +def _sort_tiles(x: torch.Tensor, tile_length: int) -> torch.Tensor: + """ + Sort each tile independently descending. + The multi-pass kernel fully sorts each tile (float32) or sorts within + hw_block_len*4 sub-segments (float16); either way, sorted(out_tile) must + equal sorted(src_tile) for a correct permutation sort. + """ + x = x.cpu().float().reshape(-1, tile_length) + x = torch.sort(x, dim=1, descending=True).values + return x.reshape(-1) + + +def test_build_mrgsort(compiled_lib): + dtype = compiled_lib["dtype"] + assert os.path.exists(_lib_path(dtype)) + + +@pytest.mark.require_npu +@pytest.mark.parametrize("N", _SIZE_PARAMS) +def test_mrgsort_equal_after_canonicalization(compiled_lib, N): + """ + Compare input and output after canonicalizing (sorting) each TILE_LENGTH-element tile. + This is the right equality check for the current single-list TMRGSORT behavior.
+ """ + import torch_npu + + dtype = compiled_lib["dtype"] + hw_block_len = BLOCK_LEN * TYPE_COEFS[dtype] + _check_preconditions(N, hw_block_len, TILE_LENGTH) + torch.npu.set_device(_DEVICE) + + torch_dtype = TORCH_DTYPES[dtype] + fn = _load_fn(dtype) + stream_ptr = torch.npu.current_stream()._as_parameter_ + + src = _make_sorted_sublists(N, hw_block_len, _DEVICE, torch_dtype) + out = _run_kernel(fn, stream_ptr, src, N) + + ref = _sort_tiles(src, TILE_LENGTH).to(torch_dtype) + got = _sort_tiles(out, TILE_LENGTH).to(torch_dtype) + + torch.testing.assert_close( + got.cpu(), + ref.cpu(), + msg="sorted output does not match sorted reference", + ) + + +@pytest.mark.require_npu +@pytest.mark.parametrize("N", [1024]) +def test_mrgsort_deterministic(compiled_lib, N): + """ + Same input must produce identical output on two runs. + """ + import torch_npu + + dtype = compiled_lib["dtype"] + hw_block_len = BLOCK_LEN * TYPE_COEFS[dtype] + _check_preconditions(N, hw_block_len, TILE_LENGTH) + torch.npu.set_device(_DEVICE) + + torch_dtype = TORCH_DTYPES[dtype] + fn = _load_fn(dtype) + stream_ptr = torch.npu.current_stream()._as_parameter_ + + src = _make_sorted_sublists(N, hw_block_len, _DEVICE, torch_dtype) + out1 = _run_kernel(fn, stream_ptr, src, N) + out2 = _run_kernel(fn, stream_ptr, src, N) + + torch.testing.assert_close( + out1.cpu(), + out2.cpu(), + msg="kernel output is not deterministic", + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) diff --git a/tests/npu/sort32_dynamic_multicore/builder.py b/tests/npu/sort32_dynamic_multicore/builder.py new file mode 100644 index 00000000..a6c419eb --- /dev/null +++ b/tests/npu/sort32_dynamic_multicore/builder.py @@ -0,0 +1,194 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +# TSORT32 sorts within fixed 32-element blocks. 
+# Each input element expands into (score, index) pairs in the output: +# float16: 4 float16 words [score_f16, zero, idx_lo_u16, idx_hi_u16] +# float32: 2 float32 words [score_f32, idx_u32] +_SORT_BLOCK_LEN = 32 + +_DTYPES = { + "float16": lambda: pto.float16, + "float32": lambda: pto.float32, +} + +# Output words per input element (in units of the src dtype) +_DST_STRIDE = { + "float16": 4, + "float32": 2, +} + + +def meta_data(dtype="float16", tile_length=1024): + if isinstance(dtype, str): + dtype_str = dtype + pto_dtype = _DTYPES[dtype]() + else: + pto_dtype = dtype + dtype_str = "float16" + + dst_stride = _DST_STRIDE[dtype_str] + u32 = pto.uint32 + dst_tile_length = tile_length * dst_stride + + tile_cfg = pto.TileBufConfig() + return { + "ptr_src": pto.PtrType(pto_dtype), + "ptr_u32": pto.PtrType(u32), + "ptr_dst": pto.PtrType(pto_dtype), + "index_dtype": pto.int32, + "tensor_src": pto.TensorType(rank=2, dtype=pto_dtype), + "tensor_u32": pto.TensorType(rank=2, dtype=u32), + "tensor_dst": pto.TensorType(rank=2, dtype=pto_dtype), + "subtensor_src": pto.SubTensorType(shape=[1, tile_length], dtype=pto_dtype), + "subtensor_u32": pto.SubTensorType(shape=[1, tile_length], dtype=u32), + "subtensor_dst": pto.SubTensorType(shape=[1, dst_tile_length], dtype=pto_dtype), + "tile_src": pto.TileBufType( + shape=[1, tile_length], + valid_shape=[1, tile_length], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ), + "tile_u32": pto.TileBufType( + shape=[1, tile_length], + valid_shape=[1, tile_length], + dtype=u32, + memory_space="VEC", + config=tile_cfg, + ), + "tile_dst": pto.TileBufType( + shape=[1, dst_tile_length], + valid_shape=[1, dst_tile_length], + dtype=pto_dtype, + memory_space="VEC", + config=tile_cfg, + ), + "tile_length": tile_length, + "dst_tile_length": dst_tile_length, + } + + +def build_tsort32_kernel( + fn_name="tsort32_1d_dynamic_float16", + dtype="float16", + tile_length=1024, +): + """Build a 1D dynamic multicore TSORT32 kernel. 
+ + For each tile of tile_length elements: + - Reads src (scores) and idx (uint32 indices). + - Calls TSORT32, which sorts within _SORT_BLOCK_LEN-element blocks and + writes interleaved (score, index) pairs to dst. + - dst is dst_stride times wider than src in same-dtype words: + float16: dst_stride=4 → dst is float16[N * 4] + float32: dst_stride=2 → dst is float32[N * 2] + + Constraints: + - tile_length must be a multiple of _SORT_BLOCK_LEN (32) + - N (total input elements) must be a multiple of tile_length + """ + assert ( + tile_length % _SORT_BLOCK_LEN == 0 + ), f"tile_length must be a multiple of {_SORT_BLOCK_LEN}, got {tile_length}" + dtype_str = dtype if isinstance(dtype, str) else "float16" + dst_stride = _DST_STRIDE[dtype_str] + dst_tile_length = tile_length * dst_stride + _meta_data = lambda: meta_data(dtype=dtype, tile_length=tile_length) + + def _kernel( + arg_src: "ptr_src", # input scores [N] + arg_idx: "ptr_u32", # uint32 input indices [N] + arg_dst: "ptr_dst", # output pairs [N * dst_stride] + argN: "index_dtype", # total input elements (multiple of tile_length) + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile = const(tile_length) + c_dst_tile = const(dst_tile_length) + + total_elements = s.index_cast(argN) + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + vid = cid * sub_bnum + sub_bid + num_blocks = pto.get_block_num() + + vid_idx = s.index_cast(vid) + num_cores = s.index_cast(num_blocks * sub_bnum) + + num_tiles_global = s.ceil_div(total_elements, c_tile) + num_tiles_per_core = s.ceil_div(num_tiles_global, num_cores) + tile_offset_this_core = vid_idx * num_tiles_per_core + + with pto.vector_section(): + tv_src = pto.as_tensor( + tensor_src, + ptr=arg_src, + shape=[num_tiles_global, c_tile], + strides=[c_tile, c1], + ) + tv_idx = pto.as_tensor( + tensor_u32, + ptr=arg_idx, + shape=[num_tiles_global, c_tile], + strides=[c_tile, c1], + ) + tv_dst = pto.as_tensor( + tensor_dst, + ptr=arg_dst, + 
shape=[num_tiles_global, c_dst_tile], + strides=[c_dst_tile, c1], + ) + + tb_src = pto.alloc_tile(tile_src) + tb_idx = pto.alloc_tile(tile_u32) + tb_dst = pto.alloc_tile(tile_dst) + + with pto.if_context(tile_offset_this_core < num_tiles_global): + tiles_end_this_core = tile_offset_this_core + num_tiles_per_core + need_truncate = tiles_end_this_core > num_tiles_global + remaining_tiles = num_tiles_global - tile_offset_this_core + tiles_to_process = s.select( + need_truncate, remaining_tiles, num_tiles_per_core + ) + + with pto.if_context(tiles_to_process > c0): + for i in pto.range(c0, tiles_to_process, c1): + ti = i + tile_offset_this_core + + sv_src = pto.slice_view( + subtensor_src, + source=tv_src, + offsets=[ti, c0], + sizes=[c1, c_tile], + ) + sv_idx = pto.slice_view( + subtensor_u32, + source=tv_idx, + offsets=[ti, c0], + sizes=[c1, c_tile], + ) + sv_dst = pto.slice_view( + subtensor_dst, + source=tv_dst, + offsets=[ti, c0], + sizes=[c1, c_dst_tile], + ) + + pto.load(sv_src, tb_src) + pto.load(sv_idx, tb_idx) + tile.sort32(tb_src, tb_dst, tb_idx) + pto.store(tb_dst, sv_dst) + + _kernel.__name__ = fn_name + return to_ir_module(meta_data=_meta_data)(_kernel) + + +if __name__ == "__main__": + import sys + + dtype = sys.argv[1] if len(sys.argv) > 1 else "float16" + print(build_tsort32_kernel(dtype=dtype)) diff --git a/tests/npu/sort32_dynamic_multicore/caller.py b/tests/npu/sort32_dynamic_multicore/caller.py new file mode 100644 index 00000000..1a092dc5 --- /dev/null +++ b/tests/npu/sort32_dynamic_multicore/caller.py @@ -0,0 +1,36 @@ +"""Generate caller.cpp for the dynamic multicore TSORT32 kernel.""" + +import sys + +_DTYPE_TO_CTYPE = { + "float16": "half", + "float32": "float", +} + +_BLOCK_DIM = 24 + + +def fn_name(dtype): + return f"tsort32_1d_dynamic_{dtype}" + + +def generate_caller(dtype): + ctype = _DTYPE_TO_CTYPE[dtype] + fn = fn_name(dtype) + return f"""\ +#include "{fn}.cpp" + +extern "C" void call_{fn}( + void *stream, uint8_t *src, uint8_t *idx, 
uint8_t *dst, int32_t N) +{{ + {fn}<<<{_BLOCK_DIM}, nullptr, stream>>>( + ({ctype} *)src, (uint32_t *)idx, ({ctype} *)dst, (int32_t)N); +}} +""" + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python caller.py ", file=sys.stderr) + sys.exit(1) + print(generate_caller(sys.argv[1])) diff --git a/tests/npu/sort32_dynamic_multicore/compile.sh b/tests/npu/sort32_dynamic_multicore/compile.sh new file mode 100755 index 00000000..1e21d2c0 --- /dev/null +++ b/tests/npu/sort32_dynamic_multicore/compile.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +DTYPE=${1:?Usage: compile.sh } + +FN_NAME="tsort32_1d_dynamic_${DTYPE}" + +TMP=$(mktemp -d) +trap "rm -rf $TMP" EXIT + +python "$SCRIPT_DIR/gen_ir.py" "$DTYPE" > "$TMP/${FN_NAME}.pto" +ptoas --enable-insert-sync "$TMP/${FN_NAME}.pto" -o "$TMP/${FN_NAME}.cpp" + +python "$SCRIPT_DIR/caller.py" "$DTYPE" > "$TMP/caller.cpp" + +PTO_LIB_PATH=/sources/pto-isa +bisheng \ + -I${PTO_LIB_PATH}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + "$TMP/caller.cpp" \ + -o "$SCRIPT_DIR/${FN_NAME}_lib.so" + +echo "Built ${FN_NAME}_lib.so successfully." diff --git a/tests/npu/sort32_dynamic_multicore/gen_ir.py b/tests/npu/sort32_dynamic_multicore/gen_ir.py new file mode 100644 index 00000000..a2632f69 --- /dev/null +++ b/tests/npu/sort32_dynamic_multicore/gen_ir.py @@ -0,0 +1,28 @@ +"""Print MLIR IR for the dynamic multicore TSORT32 kernel. 
+ +Usage: python gen_ir.py +""" + +import sys +import os + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from builder import build_tsort32_kernel + +TILE_LENGTH = 1024 + + +def fn_name(dtype): + return f"tsort32_1d_dynamic_{dtype}" + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python gen_ir.py ", file=sys.stderr) + sys.exit(1) + dtype = sys.argv[1] + module = build_tsort32_kernel( + fn_name=fn_name(dtype), dtype=dtype, tile_length=TILE_LENGTH + ) + print(module) diff --git a/tests/npu/sort32_dynamic_multicore/test_tsort32.py b/tests/npu/sort32_dynamic_multicore/test_tsort32.py new file mode 100644 index 00000000..453db5bc --- /dev/null +++ b/tests/npu/sort32_dynamic_multicore/test_tsort32.py @@ -0,0 +1,146 @@ +import os +import ctypes +import subprocess + +import pytest +import torch +from ptodsl.test_util import get_test_device + +torch.manual_seed(0) + +_DIR = os.path.dirname(os.path.abspath(__file__)) +_DEVICE = get_test_device() + +# TSORT32 sorts within fixed 32-element blocks. +# Each input element expands into (score, index) pairs in the output: +# float16: dst_stride=4 → [score_f16, zero, idx_lo_u16, idx_hi_u16] +# float32: dst_stride=2 → [score_f32, idx_u32] +# tile_length must be a multiple of SORT_BLOCK_LEN. 
+TILE_LENGTH = 1024 +SORT_BLOCK_LEN = 32 +DTYPES = ["float16", "float32"] +SIZES = [1024, 2048, 3072, 4096, 6144, 8192, 16384] + +_DST_STRIDE = {"float16": 4, "float32": 2} +_TORCH_DTYPES = {"float16": torch.float16, "float32": torch.float32} + +_DTYPE_PARAMS = [pytest.param(dtype, id=dtype) for dtype in DTYPES] +_SIZE_PARAMS = [pytest.param(N, id=f"N{N}") for N in SIZES] + + +def _fn_name(dtype): + return f"tsort32_1d_dynamic_{dtype}" + + +def _lib_path(dtype): + return os.path.join(_DIR, f"{_fn_name(dtype)}_lib.so") + + +def _ctypes_ptr(tensor: torch.Tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +@pytest.fixture(scope="session", params=_DTYPE_PARAMS) +def compiled_lib(request): + dtype = request.param + subprocess.check_call( + ["bash", os.path.join(_DIR, "compile.sh"), dtype], + cwd=_DIR, + ) + yield {"dtype": dtype} + libp = _lib_path(dtype) + if os.path.exists(libp): + os.remove(libp) + + +def _load_fn(dtype): + lib = ctypes.CDLL(_lib_path(dtype)) + fn = getattr(lib, f"call_{_fn_name(dtype)}") + fn.argtypes = [ + ctypes.c_void_p, # stream + ctypes.c_void_p, # src + ctypes.c_void_p, # idx (uint32) + ctypes.c_void_p, # dst (N * dst_stride elements) + ctypes.c_int32, # N + ] + fn.restype = None + return fn + + +def _run_kernel( + fn, stream_ptr, src: torch.Tensor, idx: torch.Tensor, N: int, dst_stride: int +) -> torch.Tensor: + import torch_npu + + dst = torch.empty(N * dst_stride, dtype=src.dtype, device=src.device) + torch.npu.synchronize() + fn( + stream_ptr, + _ctypes_ptr(src), + _ctypes_ptr(idx), + _ctypes_ptr(dst), + ctypes.c_int32(N), + ) + torch.npu.synchronize() + return dst + + +def _check_preconditions(N: int): + assert ( + N % TILE_LENGTH == 0 + ), f"N must be a multiple of TILE_LENGTH={TILE_LENGTH}, got {N}" + assert TILE_LENGTH % SORT_BLOCK_LEN == 0 + + +def _extract_scores(dst: torch.Tensor, dst_stride: int) -> torch.Tensor: + """Slot 0 of each output group holds the sorted score.""" + return dst.cpu().reshape(-1, dst_stride)[:, 0] + + 
+def _reference_scores(src: torch.Tensor) -> torch.Tensor: + """Sort each SORT_BLOCK_LEN-element group descending.""" + return ( + src.cpu() + .reshape(-1, SORT_BLOCK_LEN) + .sort(dim=1, descending=True) + .values.reshape(-1) + ) + + +def test_build_tsort32(compiled_lib): + dtype = compiled_lib["dtype"] + assert os.path.exists(_lib_path(dtype)) + + +@pytest.mark.require_npu +@pytest.mark.parametrize("N", _SIZE_PARAMS) +def test_tsort32_scores(compiled_lib, N): + """Scores extracted from TSORT32 output match per-block sorted input.""" + import torch_npu + + dtype = compiled_lib["dtype"] + torch_dtype = _TORCH_DTYPES[dtype] + dst_stride = _DST_STRIDE[dtype] + + _check_preconditions(N) + torch.npu.set_device(_DEVICE) + + fn = _load_fn(dtype) + stream_ptr = torch.npu.current_stream()._as_parameter_ + + src = torch.rand(N, dtype=torch_dtype, device=_DEVICE) + idx = torch.arange(N, dtype=torch.int32, device=_DEVICE) + dst = _run_kernel(fn, stream_ptr, src, idx, N, dst_stride) + + scores_got = _extract_scores(dst, dst_stride) + scores_ref = _reference_scores(src) + + torch.testing.assert_close( + scores_got, + scores_ref, + msg="TSORT32 scores do not match per-block sorted reference", + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) From b836487b53a4ea236294f6b1bb16b7eecb38b5a2 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Thu, 26 Mar 2026 10:19:56 +0100 Subject: [PATCH 50/53] Free up resources in examples and other fixes (#97) --- examples/aot/matmul_optimization_guide/bench_matmul.py | 2 ++ .../experimental/bench_matmul.py | 3 +++ .../matmul_optimization_guide/experimental/run_matmul.py | 8 +++++++- examples/aot/matmul_optimization_guide/run_matmul.py | 8 +++++++- examples/aot/print_tile/compile.sh | 4 +++- 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/examples/aot/matmul_optimization_guide/bench_matmul.py b/examples/aot/matmul_optimization_guide/bench_matmul.py index 476ddb8f..5c6cc83e 100644 --- 
a/examples/aot/matmul_optimization_guide/bench_matmul.py +++ b/examples/aot/matmul_optimization_guide/bench_matmul.py @@ -342,6 +342,8 @@ def main(): args.warmup, args.repeat, ) + del a_list, b_list + torch.npu.empty_cache() flops = 2.0 * m * n * k double_auto_swizzle_tflops = flops / double_auto_swizzle_us / 1e6 diff --git a/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py b/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py index 1c021b5e..6054f407 100644 --- a/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py +++ b/examples/aot/matmul_optimization_guide/experimental/bench_matmul.py @@ -429,6 +429,9 @@ def _custom(a, b, _d=swizzle_direction, _c=swizzle_count): } ) + del a_list, b_list, c, c_ref + torch.npu.empty_cache() + if no_swizzle_time_us is None or no_swizzle_tflops is None: raise RuntimeError( "No no-swizzle baseline result found " diff --git a/examples/aot/matmul_optimization_guide/experimental/run_matmul.py b/examples/aot/matmul_optimization_guide/experimental/run_matmul.py index f21af14a..54a2339f 100644 --- a/examples/aot/matmul_optimization_guide/experimental/run_matmul.py +++ b/examples/aot/matmul_optimization_guide/experimental/run_matmul.py @@ -106,7 +106,7 @@ def run_case(matmul_abt, a, b, c_ref, *, block_dim, swizzle_direction, swizzle_c swizzle_count=swizzle_count, ) torch.npu.synchronize() - return CaseResult( + result = CaseResult( m=int(a.shape[0]), n=int(b.shape[0]), k=int(a.shape[1]), @@ -116,6 +116,9 @@ def run_case(matmul_abt, a, b, c_ref, *, block_dim, swizzle_direction, swizzle_c max_absdiff=float((c - c_ref).abs().max().item()), mean_absdiff=float((c - c_ref).abs().mean().item()), ) + del c + torch.npu.empty_cache() + return result def test_matmul(): @@ -169,6 +172,9 @@ def test_matmul(): ): global_worst = result + del a, b, c_ref + torch.npu.empty_cache() + print( f"(m, n, k)=({m}, {n}, {k}) " f"worst(block_dim, swizzle_direction, swizzle_count)=" diff --git 
a/examples/aot/matmul_optimization_guide/run_matmul.py b/examples/aot/matmul_optimization_guide/run_matmul.py index 29d155b4..99f4669b 100644 --- a/examples/aot/matmul_optimization_guide/run_matmul.py +++ b/examples/aot/matmul_optimization_guide/run_matmul.py @@ -91,7 +91,7 @@ def matmul_abt( def run_case(matmul_abt, a, b, c_ref, *, block_dim): c = matmul_abt(a, b, block_dim=block_dim) torch.npu.synchronize() - return CaseResult( + result = CaseResult( m=int(a.shape[0]), n=int(b.shape[0]), k=int(a.shape[1]), @@ -99,6 +99,9 @@ def run_case(matmul_abt, a, b, c_ref, *, block_dim): max_absdiff=float((c - c_ref).abs().max().item()), mean_absdiff=float((c - c_ref).abs().mean().item()), ) + del c + torch.npu.empty_cache() + return result def test_matmul(): @@ -175,6 +178,9 @@ def test_matmul(): ): global_worst = result + del a, b, c_ref + torch.npu.empty_cache() + print( f"(m, n, k)=({m}, {n}, {k}) " f"worst(block_dim)={shape_worst.block_dim} " diff --git a/examples/aot/print_tile/compile.sh b/examples/aot/print_tile/compile.sh index ce093b4c..9a55a8c4 100644 --- a/examples/aot/print_tile/compile.sh +++ b/examples/aot/print_tile/compile.sh @@ -2,6 +2,8 @@ #!/usr/bin/env bash set -e +ARCH=$(uname -m) + PTO_DIR="$ASCEND_HOME_PATH/include/pto" PTO_BACKUP="$ASCEND_HOME_PATH/include/pto_hidden" PTO_LIB_PATH="/sources/pto-isa" @@ -30,7 +32,7 @@ bisheng \ -xcce -Xhost-start -Xhost-end \ --npu-arch=dav-2201 -DMEMORY_BASE \ -D_DEBUG --cce-enable-print \ - -I${ASCEND_HOME_PATH}/aarch64-linux/pkg_inc/runtime/runtime \ + -I${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/runtime/runtime \ -I${PTO_LIB_PATH}/include \ -std=gnu++17 \ ./caller.cpp \ From ef996b0dbc07eb131cd591d19975afc8194664a1 Mon Sep 17 00:00:00 2001 From: RuoyuZhou Date: Mon, 30 Mar 2026 00:02:45 +0800 Subject: [PATCH 51/53] feat(frontend): add mxfp8 helpers and examples --- .../aot/matmul_mxfp8/matmul_mxfp8_builder.py | 84 +++ .../aot/matmul_mxfp8/mxfp8_ppt_example.py | 80 +++ examples/ppt/mixed_pto_vector_slide.md | 77 +++ 
ptodsl/compiler/jit.py | 10 + ptodsl/language.py | 536 ++++++++++++++++++ tests/frontend/test_caller_gen.py | 27 + tests/frontend/test_mxfp8_frontend.py | 55 ++ 7 files changed, 869 insertions(+) create mode 100644 examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py create mode 100644 examples/aot/matmul_mxfp8/mxfp8_ppt_example.py create mode 100644 examples/ppt/mixed_pto_vector_slide.md create mode 100644 ptodsl/language.py create mode 100644 tests/frontend/test_mxfp8_frontend.py diff --git a/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py b/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py new file mode 100644 index 00000000..5cf713fa --- /dev/null +++ b/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py @@ -0,0 +1,84 @@ +from ptodsl import to_ir_module +import ptodsl.language as pto + + +def build(M=16, K=64, N=32, lhs_variant="e5m2", rhs_variant="e5m2"): + def meta_data(): + mx = pto.make_mxfp8(lhs=lhs_variant, rhs=rhs_variant) + scale_k = mx.scale_k(K) + + ptr_lhs = pto.PtrType(mx.lhs) + ptr_rhs = pto.PtrType(mx.rhs) + ptr_scale = pto.PtrType(mx.scale) + ptr_bias = pto.PtrType(mx.acc) + + lhs_tensor = pto.TensorType(rank=2, dtype=mx.lhs) + rhs_tensor = pto.TensorType(rank=2, dtype=mx.rhs) + lhs_scale_tensor = pto.TensorType(rank=2, dtype=mx.scale) + rhs_scale_tensor = pto.TensorType(rank=2, dtype=mx.scale) + bias_tensor = pto.TensorType(rank=2, dtype=mx.acc) + + lhs_tile_view = pto.SubTensorType(shape=[M, K], dtype=mx.lhs) + rhs_tile_view = pto.SubTensorType(shape=[K, N], dtype=mx.rhs) + lhs_scale_tile_view = pto.SubTensorType(shape=[M, scale_k], dtype=mx.scale) + rhs_scale_tile_view = pto.SubTensorType(shape=[scale_k, N], dtype=mx.scale) + bias_tile_view = pto.SubTensorType(shape=[1, N], dtype=mx.acc) + + lhs_tile = pto.TileBufType(shape=[M, K], dtype=mx.lhs, memory_space="LEFT") + rhs_tile = pto.TileBufType(shape=[K, N], dtype=mx.rhs, memory_space="RIGHT") + lhs_scale_tile = pto.LeftScaleTileBufType(shape=[M, scale_k], dtype=mx.scale) + rhs_scale_tile = 
pto.RightScaleTileBufType(shape=[scale_k, N], dtype=mx.scale) + bias_tile = pto.TileBufType(shape=[1, N], dtype=mx.acc, memory_space="BIAS") + acc_tile = pto.TileBufType(shape=[M, N], dtype=mx.acc, memory_space="ACC") + + return locals() + + const = pto.const + + @to_ir_module(meta_data=meta_data) + def matmul_mxfp8( + a_ptr: "ptr_lhs", + a_scale_ptr: "ptr_scale", + b_ptr: "ptr_rhs", + b_scale_ptr: "ptr_scale", + bias_ptr: "ptr_bias", + ) -> None: + c0 = const(0) + c1 = const(1) + cM = const(M) + cK = const(K) + cN = const(N) + cScaleK = const(scale_k) + + tv_a = pto.as_tensor(lhs_tensor, ptr=a_ptr, shape=[cM, cK], strides=[cK, c1]) + tv_b = pto.as_tensor(rhs_tensor, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) + tv_scale_a = pto.as_tensor(lhs_scale_tensor, ptr=a_scale_ptr, shape=[cM, cScaleK], strides=[cScaleK, c1]) + tv_scale_b = pto.as_tensor(rhs_scale_tensor, ptr=b_scale_ptr, shape=[cScaleK, cN], strides=[cN, c1]) + tv_bias = pto.as_tensor(bias_tensor, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1]) + + sv_a = pto.slice_view(lhs_tile_view, source=tv_a, offsets=[c0, c0], sizes=[cM, cK]) + sv_b = pto.slice_view(rhs_tile_view, source=tv_b, offsets=[c0, c0], sizes=[cK, cN]) + sv_scale_a = pto.slice_view(lhs_scale_tile_view, source=tv_scale_a, offsets=[c0, c0], sizes=[cM, cScaleK]) + sv_scale_b = pto.slice_view(rhs_scale_tile_view, source=tv_scale_b, offsets=[c0, c0], sizes=[cScaleK, cN]) + sv_bias = pto.slice_view(bias_tile_view, source=tv_bias, offsets=[c0, c0], sizes=[c1, cN]) + + with pto.cube_section(): + a_tile = pto.alloc_tile(lhs_tile) + b_tile = pto.alloc_tile(rhs_tile) + a_scale_tile = pto.alloc_tile(lhs_scale_tile) + b_scale_tile = pto.alloc_tile(rhs_scale_tile) + bias_tile_buf = pto.alloc_tile(bias_tile) + acc_tile_buf = pto.alloc_tile(acc_tile) + + pto.load(sv_a, a_tile) + pto.load(sv_b, b_tile) + pto.load(sv_scale_a, a_scale_tile) + pto.load(sv_scale_b, b_scale_tile) + pto.load(sv_bias, bias_tile_buf) + pto.matmul_mx_bias(a_tile, a_scale_tile, b_tile, 
b_scale_tile, bias_tile_buf, acc_tile_buf) + + return matmul_mxfp8 + + +if __name__ == "__main__": + print(build()) diff --git a/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py b/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py new file mode 100644 index 00000000..b7988c9e --- /dev/null +++ b/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py @@ -0,0 +1,80 @@ +from ptodsl import to_ir_module +import ptodsl.language as pto + + +M, K, N = 16, 64, 32 + + +def meta_data(): + # 1) 选择 MXFP8 组合。默认是 lhs=e5m2, rhs=e5m2, scale=e8m0, acc=f32。 + mx = pto.make_mxfp8(lhs="e5m2", rhs="e5m2") + scale_k = mx.scale_k(K) # MXFP8 的 scale 张量沿 K 维按 32:1 压缩 + + # 2) 全局输入指针类型 + a_ptr = pto.PtrType(mx.lhs) + b_ptr = pto.PtrType(mx.rhs) + scale_ptr = pto.PtrType(mx.scale) + + # 3) TensorView 类型 + a_tensor = pto.TensorType(rank=2, dtype=mx.lhs) + b_tensor = pto.TensorType(rank=2, dtype=mx.rhs) + scale_a_tensor = pto.TensorType(rank=2, dtype=mx.scale) + scale_b_tensor = pto.TensorType(rank=2, dtype=mx.scale) + + # 4) TileView / TileBuf 类型 + a_view = pto.SubTensorType(shape=[M, K], dtype=mx.lhs) + b_view = pto.SubTensorType(shape=[K, N], dtype=mx.rhs) + scale_a_view = pto.SubTensorType(shape=[M, scale_k], dtype=mx.scale) + scale_b_view = pto.SubTensorType(shape=[scale_k, N], dtype=mx.scale) + + a_tile = pto.TileBufType(shape=[M, K], dtype=mx.lhs, memory_space="LEFT") + b_tile = pto.TileBufType(shape=[K, N], dtype=mx.rhs, memory_space="RIGHT") + scale_a_tile = pto.LeftScaleTileBufType(shape=[M, scale_k], dtype=mx.scale) + scale_b_tile = pto.RightScaleTileBufType(shape=[scale_k, N], dtype=mx.scale) + acc_tile = pto.TileBufType(shape=[M, N], dtype=mx.acc, memory_space="ACC") + + return locals() + + +@to_ir_module(meta_data=meta_data) +def matmul_mxfp8_core( + a: "a_ptr", + scale_a: "scale_ptr", + b: "b_ptr", + scale_b: "scale_ptr", +) -> None: + c0 = pto.const(0) + c1 = pto.const(1) + cM = pto.const(M) + cK = pto.const(K) + cN = pto.const(N) + cScaleK = pto.const(scale_k) + + tv_a = 
pto.as_tensor(a_tensor, ptr=a, shape=[cM, cK], strides=[cK, c1]) + tv_b = pto.as_tensor(b_tensor, ptr=b, shape=[cK, cN], strides=[cN, c1]) + tv_scale_a = pto.as_tensor(scale_a_tensor, ptr=scale_a, shape=[cM, cScaleK], strides=[cScaleK, c1]) + tv_scale_b = pto.as_tensor(scale_b_tensor, ptr=scale_b, shape=[cScaleK, cN], strides=[cN, c1]) + + sv_a = pto.slice_view(a_view, source=tv_a, offsets=[c0, c0], sizes=[cM, cK]) + sv_b = pto.slice_view(b_view, source=tv_b, offsets=[c0, c0], sizes=[cK, cN]) + sv_scale_a = pto.slice_view(scale_a_view, source=tv_scale_a, offsets=[c0, c0], sizes=[cM, cScaleK]) + sv_scale_b = pto.slice_view(scale_b_view, source=tv_scale_b, offsets=[c0, c0], sizes=[cScaleK, cN]) + + with pto.cube_section(): + ta = pto.alloc_tile(a_tile) + tb = pto.alloc_tile(b_tile) + tsa = pto.alloc_tile(scale_a_tile) + tsb = pto.alloc_tile(scale_b_tile) + tc = pto.alloc_tile(acc_tile) + + pto.load(sv_a, ta) + pto.load(sv_b, tb) + pto.load(sv_scale_a, tsa) + pto.load(sv_scale_b, tsb) + + # 核心调用:MXFP8 data tile + scale tile -> Acc tile + pto.matmul_mx(ta, tsa, tb, tsb, tc) + + +if __name__ == "__main__": + print(matmul_mxfp8_core) diff --git a/examples/ppt/mixed_pto_vector_slide.md b/examples/ppt/mixed_pto_vector_slide.md new file mode 100644 index 00000000..803180dd --- /dev/null +++ b/examples/ppt/mixed_pto_vector_slide.md @@ -0,0 +1,77 @@ +# PTO `t*` + `v*` 混合示例 + +## 一页版表达 + +```text +Outer PTO tile flow: + make_tensor_view -> partition_view -> tload -> [vector inner loop] -> tstore + +Inner vector loop: + vlds -> vlds -> vadd -> vsts +``` + +## PPT 版伪 IR + +```mlir +module { + func.func @vec_add_mixed( + %a: !pto.ptr, + %b: !pto.ptr, + %c: !pto.ptr) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + // 1) 先用 PTO tile op 选出一个 32x32 工作块 + %A = pto.make_tensor_view %a, shape = [%c32, %c32], strides = [%c32, %c1] + : !pto.tensor_view + %B 
= pto.make_tensor_view %b, shape = [%c32, %c32], strides = [%c32, %c1] + : !pto.tensor_view + %C = pto.make_tensor_view %c, shape = [%c32, %c32], strides = [%c32, %c1] + : !pto.tensor_view + + %tileA = pto.partition_view %A, offsets = [%c0, %c0], sizes = [%c32, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32> + %tileB = pto.partition_view %B, offsets = [%c0, %c0], sizes = [%c32, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32> + %tileC = pto.partition_view %C, offsets = [%c0, %c0], sizes = [%c32, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32> + + // 统一记号:!tile 表示 vec-local 32x32 f32 tile_buf + %bufA = pto.alloc_tile : !pto.tile_buf + %bufB = pto.alloc_tile : !pto.tile_buf + %bufC = pto.alloc_tile : !pto.tile_buf + + // 2) tile 级搬运:GM -> local tile + pto.tload ins(%tileA : !pto.partition_tensor_view<32x32xf32>) + outs(%bufA : !pto.tile_buf) + pto.tload ins(%tileB : !pto.partition_tensor_view<32x32xf32>) + outs(%bufB : !pto.tile_buf) + + // 3) vector 级计算:在 local tile 内部按 64-lane 分块 + %ptrA = pto.tile_buf_addr %bufA : !pto.tile_buf<...> -> !llvm.ptr<6> + %ptrB = pto.tile_buf_addr %bufB : !pto.tile_buf<...> -> !llvm.ptr<6> + %ptrC = pto.tile_buf_addr %bufC : !pto.tile_buf<...> -> !llvm.ptr<6> + + scf.for %i = %c0 to %c1024 step %c64 { + %va = pto.vlds %ptrA[%i] : !llvm.ptr<6> -> !pto.vreg<64xf32> + %vb = pto.vlds %ptrB[%i] : !llvm.ptr<6> -> !pto.vreg<64xf32> + %vc = pto.vadd %va, %vb + : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32> + pto.vsts %vc, %ptrC[%i] : !pto.vreg<64xf32>, !llvm.ptr<6> + } + + // 4) tile 级写回:local tile -> GM + pto.tstore ins(%bufC : !pto.tile_buf) + outs(%tileC : !pto.partition_tensor_view<32x32xf32>) + return + } +} +``` + +## 讲解时只强调这两层 + +- `pto.t*` 负责选 tile 和搬 tile:`make_tensor_view -> partition_view -> tload -> tstore` +- `pto.v*` 负责在 tile 内做向量计算:`vlds -> vadd -> vsts` diff --git a/ptodsl/compiler/jit.py b/ptodsl/compiler/jit.py index 820fc00b..9e39ebeb 100644 --- 
a/ptodsl/compiler/jit.py +++ b/ptodsl/compiler/jit.py @@ -21,6 +21,12 @@ def _is_ptr_type(type_obj): def _ptr_elem_cpp_type(type_obj): type_repr = _type_repr(type_obj) + if "e8m0" in type_repr: + return "float8_e8m0_t" + if "e4m3" in type_repr: + return "float8_e4m3_t" + if "e5m2" in type_repr: + return "float8_e5m2_t" if "f32" in type_repr: return "float" if "f16" in type_repr: @@ -52,6 +58,8 @@ def _scalar_cpp_type(type_obj): return "int32_t" if "i64" in type_repr or "index" in type_repr: return "int64_t" + if "e8m0" in type_repr or "e4m3" in type_repr or "e5m2" in type_repr: + return "uint8_t" if "f32" in type_repr: return "float" if "f16" in type_repr: @@ -63,6 +71,8 @@ def _scalar_ctype(type_obj): type_repr = _type_repr(type_obj) if "i64" in type_repr or "index" in type_repr: return ctypes.c_int64 + if "e8m0" in type_repr or "e4m3" in type_repr or "e5m2" in type_repr: + return ctypes.c_uint8 if "f32" in type_repr: return ctypes.c_float if "f16" in type_repr: diff --git a/ptodsl/language.py b/ptodsl/language.py new file mode 100644 index 00000000..7f45ac42 --- /dev/null +++ b/ptodsl/language.py @@ -0,0 +1,536 @@ +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Sequence + +from mlir import ir as mlir_ir +from mlir.dialects import arith, pto, scf +from mlir.ir import F16Type, F32Type, IndexType, InsertionPoint, IntegerType + + +def _unwrap(value): + if isinstance(value, Value): + return value.raw + return value + + +class Value: + # TODO: generalize to more comprehensive wrappers like https://github.com/makslevental/mlir-python-extras/blob/0.0.8.2/mlir/extras/dialects/ext/arith.py + def __init__(self, raw): + self.raw = raw + + def __mul__(self, other): + return Value(arith.MulIOp(_unwrap(self), _unwrap(other)).result) + + def __rmul__(self, other): + return Value(arith.MulIOp(_unwrap(other), _unwrap(self)).result) + + def __add__(self, other): + return Value(arith.AddIOp(_unwrap(self), _unwrap(other)).result) + + def 
__radd__(self, other): + return Value(arith.AddIOp(_unwrap(other), _unwrap(self)).result) + + def __sub__(self, other): + return Value(arith.SubIOp(_unwrap(self), _unwrap(other)).result) + + def __rsub__(self, other): + return Value(arith.SubIOp(_unwrap(other), _unwrap(self)).result) + + def __floordiv__(self, other): + return Value(arith.DivSIOp(_unwrap(self), _unwrap(other)).result) + + def __rfloordiv__(self, other): + return Value(arith.DivSIOp(_unwrap(other), _unwrap(self)).result) + + def __truediv__(self, other): + return Value(arith.DivFOp(_unwrap(self), _unwrap(other)).result) + + def __rtruediv__(self, other): + return Value(arith.DivFOp(_unwrap(other), _unwrap(self)).result) + + def __mod__(self, other): + return Value(arith.RemSIOp(_unwrap(self), _unwrap(other)).result) + + def __rmod__(self, other): + return Value(arith.RemSIOp(_unwrap(other), _unwrap(self)).result) + + @staticmethod + def _cmp(lhs, rhs, predicate): + return Value(arith.CmpIOp(predicate, _unwrap(lhs), _unwrap(rhs)).result) + + def __lt__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.slt) + + def __gt__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.sgt) + + def __le__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.sle) + + def __ge__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.sge) + + def __eq__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.eq) + + def __ne__(self, other): + return Value._cmp(self, other, arith.CmpIPredicate.ne) + + def __getattr__(self, item): + return getattr(self.raw, item) + + +def wrap_value(value): + if isinstance(value, Value): + return value + return Value(value) + + +@dataclass(frozen=True) +class MXFP8DType: + lhs: object + rhs: object + scale: object + acc: object + scale_factor: int = 32 + + @property + def data(self): + return self.lhs + + def scale_k(self, k): + if k % self.scale_factor != 0: + raise ValueError(f"k={k} must be divisible by 
scale_factor={self.scale_factor} for MXFP8.") + return k // self.scale_factor + + +def _get_mlir_float_type(alias_name, *type_names): + for type_name in type_names: + type_ctor = getattr(mlir_ir, type_name, None) + if type_ctor is not None: + return type_ctor.get() + supported = ", ".join(type_names) + raise AttributeError( + f"module '{__name__}' has no attribute '{alias_name}' because the active MLIR " + f"Python bindings do not expose any of: {supported}" + ) + + +def make_mxfp8(*, lhs="e5m2", rhs="e5m2", acc=None, scale_factor=32): + variants = { + "e4m3": __getattr__("fp8_e4m3"), + "e5m2": __getattr__("fp8_e5m2"), + } + if lhs not in variants: + raise ValueError(f"Unsupported lhs variant '{lhs}'. Expected one of: {', '.join(sorted(variants))}.") + if rhs not in variants: + raise ValueError(f"Unsupported rhs variant '{rhs}'. Expected one of: {', '.join(sorted(variants))}.") + return MXFP8DType( + lhs=variants[lhs], + rhs=variants[rhs], + scale=__getattr__("fp8_e8m0"), + acc=__getattr__("float32") if acc is None else acc, + scale_factor=scale_factor, + ) + + +def __getattr__(name): + # Keep aliases conservative and only expose types that map cleanly to MLIR/PTO. 
+ if name == "bool": + return IntegerType.get_signless(1) + if name == "float32": + return F32Type.get() + if name == "float16": + return F16Type.get() + if name == "bfloat16": + return _get_mlir_float_type(name, "BF16Type") + if name in ("fp8_e4m3", "float8_e4m3"): + return _get_mlir_float_type(name, "Float8E4M3FNType", "Float8E4M3FNUZType") + if name in ("fp8_e5m2", "float8_e5m2"): + return _get_mlir_float_type(name, "Float8E5M2Type", "Float8E5M2FNUZType") + if name in ("fp8_e8m0", "float8_e8m0"): + return _get_mlir_float_type(name, "Float8E8M0FNUType", "Float8E8M0FNType") + if name == "mxfp8": + return make_mxfp8(lhs="e5m2", rhs="e5m2") + if name == "mxfp8_e4m3": + return make_mxfp8(lhs="e4m3", rhs="e4m3") + if name == "mxfp8_e5m2": + return make_mxfp8(lhs="e5m2", rhs="e5m2") + if name == "int32": + return IntegerType.get_signless(32) + if name == "int16": + return IntegerType.get_signless(16) + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + + +def PtrType(dtype): + return pto.PtrType.get(dtype) + + +def TensorType(*, rank, dtype): + return pto.TensorViewType.get(rank, dtype) + + +def SubTensorType(*, shape, dtype): + return pto.PartitionTensorViewType.get(shape, dtype) + + +class TileBufConfig: + def __init__(self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null"): + # TODO: expose and validate a broader set of tile buffer knobs if PTO adds + # more layout/padding/fractal settings that should be configurable here. + self._bl = pto.BLayoutAttr.get(getattr(pto.BLayout, blayout)) + self._sl = pto.SLayoutAttr.get(getattr(pto.SLayout, slayout)) + self._pd = pto.PadValueAttr.get(getattr(pto.PadValue, pad)) + self._s_fractal_size = s_fractal_size + + @property + def attr(self): + return pto.TileBufConfigAttr.get(self._bl, self._sl, self._s_fractal_size, self._pd) + + +def _default_tile_config(memory_space, shape): + space = memory_space.upper() + # Defaults mirror the explicit configs used by the verbose matmul builder. 
+ if space == "MAT": + if len(shape) >= 1 and shape[0] == 1: + return TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=pto.TileConfig.fractalABSize) + return TileBufConfig(blayout="ColMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalABSize) + if space == "LEFT": + return TileBufConfig(blayout="RowMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalABSize) + if space == "RIGHT": + return TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=pto.TileConfig.fractalABSize) + if space == "ACC": + return TileBufConfig(blayout="ColMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalCSize) + if space == "BIAS": + return TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=pto.TileConfig.fractalABSize) + if space == "SCALING": + return TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=pto.TileConfig.fractalABSize) + if space == "VEC": + return TileBufConfig() + raise ValueError(f"Unsupported memory_space '{memory_space}' for default tile config.") + + +def TileBufType(*, shape, dtype, memory_space, valid_shape=None, config=None): + space = pto.AddressSpaceAttr.get(getattr(pto.AddressSpace, memory_space)) + if valid_shape is None: + valid_shape = shape + if config is None: + config = _default_tile_config(memory_space, shape) + cfg = config.attr if isinstance(config, TileBufConfig) else config + return pto.TileBufType.get(shape, dtype, space, valid_shape, cfg) + + +def LeftScaleTileBufType(*, shape, dtype, valid_shape=None, config=None): + if config is None: + config = TileBufConfig( + blayout="RowMajor", + slayout="RowMajor", + s_fractal_size=pto.TileConfig.fractalMxSize, + ) + return TileBufType(shape=shape, dtype=dtype, memory_space="SCALING", valid_shape=valid_shape, config=config) + + +def RightScaleTileBufType(*, shape, dtype, valid_shape=None, config=None): + if config is None: + config = TileBufConfig( + blayout="ColMajor", + slayout="ColMajor", + 
s_fractal_size=pto.TileConfig.fractalMxSize, + ) + return TileBufType(shape=shape, dtype=dtype, memory_space="SCALING", valid_shape=valid_shape, config=config) + + +def const(value): + return Value(arith.ConstantOp(IndexType.get(), value).result) + + +def get_block_idx(): + return Value(pto.GetBlockIdxOp().result) + + +def get_subblock_idx(): + return Value(pto.GetSubBlockIdxOp().result) + + +def get_subblock_num(): + return Value(pto.GetSubBlockNumOp().result) + + +def get_block_num(): + return Value(pto.GetBlockNumOp().result) + + +def index_cast(value, index_type=IndexType): + if hasattr(index_type, "get"): + dst = index_type.get() + else: + dst = index_type + return Value(arith.IndexCastOp(dst, _unwrap(value)).result) + + +def as_tensor(tensor_type, *, ptr, shape, strides): + shape_vals = [_unwrap(v) for v in shape] + stride_vals = [_unwrap(v) for v in strides] + return pto.MakeTensorViewOp(tensor_type, _unwrap(ptr), shape_vals, stride_vals).result + + +def slice_view(subtensor_type, *, source, offsets, sizes): + offset_vals = [_unwrap(v) for v in offsets] + size_vals = [_unwrap(v) for v in sizes] + return pto.PartitionViewOp(subtensor_type, source, offsets=offset_vals, sizes=size_vals).result + + +@contextmanager +def vector_section(): + section = pto.SectionVectorOp() + block = section.body.blocks.append() + with InsertionPoint(block): + yield + + +@contextmanager +def cube_section(): + section = pto.SectionCubeOp() + block = section.body.blocks.append() + with InsertionPoint(block): + yield + + +def for_range(start, stop, step): + loop = scf.ForOp(_unwrap(start), _unwrap(stop), _unwrap(step)) + with InsertionPoint(loop.body): + yield Value(loop.induction_variable) + scf.YieldOp([]) + + +def alloc_tile(tile_type, *, valid_row=None, valid_col=None): + kwargs = {} + if valid_row is not None: + kwargs["valid_row"] = _unwrap(valid_row) + if valid_col is not None: + kwargs["valid_col"] = _unwrap(valid_col) + return pto.AllocTileOp(tile_type, **kwargs).result + + 
+def subset(source, offsets, sizes): + offset_vals = [_unwrap(v) for v in offsets] + return pto.subset(source, offset_vals, sizes) + + +def load(source, dest): + pto.TLoadOp(None, source, dest) + + +def mov(source, dest): + pto.TMovOp(None, source, dest) + + +def add(lhs, rhs, out): + pto.TAddOp(lhs, rhs, out) + + +def sub(lhs, rhs, out): + pto.TSubOp(lhs, rhs, out) + + +def div(lhs, rhs, out): + pto.TDivOp(lhs, rhs, out) + + +def mul(lhs, rhs, out): + pto.TMulOp(lhs, rhs, out) + + +def or_(lhs, rhs, out): + pto.TOrOp(lhs, rhs, out) + + +def gather(src, out, indices=None, *, mask_pattern=None): + if mask_pattern is not None: + mp = pto.MaskPatternAttr.get(getattr(pto.MaskPattern, mask_pattern)) + pto.TGatherOp(src, out, maskPattern=mp) + else: + pto.TGatherOp(src, out, indices=indices) + + +def exp(inp, out): + pto.TExpOp(inp, out) + + +def log(inp, out): + pto.TLogOp(inp, out) + + +def relu(inp, out): + pto.TReluOp(inp, out) + + +def abs(inp, out): + pto.TAbsOp(inp, out) + + +def sqrt(inp, out): + pto.TSqrtOp(inp, out) + + +def store(source, dest): + pto.TStoreOp(None, source, dest) + + +def matmul(lhs, rhs, out): + pto.TMatmulOp(None, lhs, rhs, out) + + +def matmul_bias(lhs, rhs, bias, out): + pto.TMatmulBiasOp(None, lhs, rhs, bias, out) + + +def matmul_acc(acc, lhs, rhs, out): + pto.TMatmulAccOp(None, acc, lhs, rhs, out) + + +def _emit_dps_op(op_name, *operands): + op_ctor = getattr(pto, op_name, None) + if op_ctor is not None: + return op_ctor(None, *operands) + generic_name = { + "TMatmulMxOp": "pto.tmatmul.mx", + "TMatmulMxAccOp": "pto.tmatmul.mx.acc", + "TMatmulMxBiasOp": "pto.tmatmul.mx.bias", + }[op_name] + return mlir_ir.Operation.create(generic_name, operands=list(operands)) + + +def matmul_mx(lhs, lhs_scale, rhs, rhs_scale, out): + _emit_dps_op("TMatmulMxOp", lhs, lhs_scale, rhs, rhs_scale, out) + + +def matmul_mx_acc(acc, lhs, lhs_scale, rhs, rhs_scale, out): + _emit_dps_op("TMatmulMxAccOp", acc, lhs, lhs_scale, rhs, rhs_scale, out) + + +def 
matmul_mx_bias(lhs, lhs_scale, rhs, rhs_scale, bias, out): + _emit_dps_op("TMatmulMxBiasOp", lhs, lhs_scale, rhs, rhs_scale, bias, out) + + +def ceil_div(a, b): + return Value(arith.CeilDivSIOp(_unwrap(a), _unwrap(b)).result) + + +def div_s(a, b): + return Value(arith.DivSIOp(_unwrap(a), _unwrap(b)).result) + + +def rem_s(a, b): + return Value(arith.RemSIOp(_unwrap(a), _unwrap(b)).result) + + +def min_u(a, b): + return Value(arith.MinUIOp(_unwrap(a), _unwrap(b)).result) + + +def eq(a, b): + return Value(arith.CmpIOp(arith.CmpIPredicate.eq, _unwrap(a), _unwrap(b)).result) + + +def lt(a, b): + return Value(arith.CmpIOp(arith.CmpIPredicate.slt, _unwrap(a), _unwrap(b)).result) + + +def gt(a, b): + return Value(arith.CmpIOp(arith.CmpIPredicate.sgt, _unwrap(a), _unwrap(b)).result) + + +def ge(a, b): + return Value(arith.CmpIOp(arith.CmpIPredicate.sge, _unwrap(a), _unwrap(b)).result) + + +def select(cond, true_val, false_val): + return Value(arith.SelectOp(_unwrap(cond), _unwrap(true_val), _unwrap(false_val)).result) + + +class _IfElseBranch: + def __init__(self, if_op): + self._if_op = if_op + @contextmanager + def else_context(self): + with InsertionPoint(self._if_op.else_block): + yield + scf.YieldOp([]) + +@contextmanager +def if_context(condition, has_else=False): + if has_else: + op = scf.IfOp(_unwrap(condition), [], hasElse=True) + branch = _IfElseBranch(op) + else: + op = scf.IfOp(_unwrap(condition)) + branch = None + + with InsertionPoint(op.then_block): + yield branch + scf.YieldOp([]) + + +def cond(condition, then_builder, else_builder): + op = scf.IfOp(_unwrap(condition), [], hasElse=True) + with InsertionPoint(op.then_block): + then_builder() + scf.YieldOp([]) + with InsertionPoint(op.else_block): + else_builder() + scf.YieldOp([]) + return op + +def _resolve_sync_op(sync_op): + if isinstance(sync_op, str): + normalized = sync_op.strip().upper() + if not normalized.startswith("T"): + normalized = f"T{normalized}" + try: + return getattr(pto, normalized) + 
except AttributeError as exc: + raise ValueError(f"Unsupported sync op type '{sync_op}'.") from exc + return sync_op + + +def _resolve_event_id(event_id): + if isinstance(event_id, int): + if event_id < 0 or event_id > 7: + raise ValueError(f"event_id must be in range [0, 7], got {event_id}.") + return getattr(pto, f"EVENT_ID{event_id}") + return event_id + + +def record_event(record_op, wait_op, event_id: int|Sequence[int]=0): + if not isinstance(event_id, int): + for eid in event_id: + pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(eid)) + else: + pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) + + + +def wait_event(record_op, wait_op, event_id: int|Sequence[int]=0): + if not isinstance(event_id, int): + for eid in event_id: + pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(eid)) + else: + pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) + + +def record_wait_pair(record_op, wait_op, event_id: int|Sequence[int]=0): + rec = _resolve_sync_op(record_op) + w = _resolve_sync_op(wait_op) + ev = _resolve_event_id(event_id) + pto.record_event(rec, w, ev) + pto.wait_event(rec, w, ev) + + +def barrier(sync_op): + pto.barrier(_resolve_sync_op(sync_op)) + + +def row_sum(src, tmp, dst): + pto.TRowSumOp(src = src, tmp = tmp, dst = dst) diff --git a/tests/frontend/test_caller_gen.py b/tests/frontend/test_caller_gen.py index 47e01a02..5a2e50ea 100644 --- a/tests/frontend/test_caller_gen.py +++ b/tests/frontend/test_caller_gen.py @@ -65,6 +65,33 @@ def mixed_kernel(data: "ptr_i8", count: "i64_type", idx: "index_dtype") -> None: ) +def test_generate_caller_cpp_maps_mxfp8_pointer_and_scalar_types(): + def mixed_mxfp8_kernel( + lhs: "ptr_e5m2", + lhs_scale: "ptr_e8m0", + alpha: "e4m3_type", + ) -> None: + return None + + wrapper = JitWrapper(mixed_mxfp8_kernel, meta_data=lambda: {}, block_dim=4) + 
wrapper._arg_types = [ + _FakeType("!pto.ptr"), + _FakeType("!pto.ptr"), + _FakeType("f8E4M3FN"), + ] + + caller_cpp = wrapper._generate_caller_cpp("generated.cpp") + + assert ( + 'extern "C" void call_kernel(uint32_t blockDim, void *stream, uint8_t *lhs, ' + "uint8_t *lhs_scale, uint8_t alpha)" + ) in caller_cpp + assert ( + "mixed_mxfp8_kernel<<>>((float8_e5m2_t *)lhs, " + "(float8_e8m0_t *)lhs_scale, alpha);" + ) in caller_cpp + + def test_generate_caller_cpp_for_dynamic_1d_add_signature(): def vec_add_1d_dynamic( arg0: "ptr_type", diff --git a/tests/frontend/test_mxfp8_frontend.py b/tests/frontend/test_mxfp8_frontend.py new file mode 100644 index 00000000..03b0a70e --- /dev/null +++ b/tests/frontend/test_mxfp8_frontend.py @@ -0,0 +1,55 @@ +import types + +import ptodsl.language as pto + + +class _StubType: + @staticmethod + def get(): + return object() + + +def test_mxfp8_family_uses_e5m2_data_and_e8m0_scale(monkeypatch): + stub_ir = types.SimpleNamespace( + Float8E5M2Type=_StubType, + Float8E8M0FNUType=_StubType, + Float8E4M3FNType=_StubType, + ) + monkeypatch.setattr(pto, "mlir_ir", stub_ir) + + mx = pto.mxfp8 + + assert mx.lhs is not None + assert mx.rhs is not None + assert mx.data is not None + assert mx.scale is not None + assert mx.acc is not None + assert mx.scale_k(64) == 2 + + +def test_float8_aliases_accept_common_mlir_ctor_names(monkeypatch): + stub_ir = types.SimpleNamespace( + Float8E4M3FNType=_StubType, + Float8E5M2Type=_StubType, + Float8E8M0FNUType=_StubType, + ) + monkeypatch.setattr(pto, "mlir_ir", stub_ir) + + assert pto.fp8_e4m3 is not None + assert pto.fp8_e5m2 is not None + assert pto.fp8_e8m0 is not None + + +def test_make_mxfp8_accepts_mixed_lhs_rhs_variants(monkeypatch): + stub_ir = types.SimpleNamespace( + Float8E4M3FNType=_StubType, + Float8E5M2Type=_StubType, + Float8E8M0FNUType=_StubType, + ) + monkeypatch.setattr(pto, "mlir_ir", stub_ir) + + mx = pto.make_mxfp8(lhs="e4m3", rhs="e5m2") + + assert mx.lhs is not None + assert mx.rhs 
is not None + assert mx.scale is not None From 49cefb3bf266db2596951132b9c89719e9f5b777 Mon Sep 17 00:00:00 2001 From: RuoyuZhou Date: Mon, 30 Mar 2026 21:19:00 +0800 Subject: [PATCH 52/53] Add A5 PTODSL library and micro coverage --- ptodsl/lib/__init__.py | 3 + ptodsl/lib/a5/README.md | 27 + ptodsl/lib/a5/TILE_MICRO_CHECKLIST.md | 43 + ptodsl/lib/a5/__init__.py | 23 + ptodsl/lib/a5/generated/a5_cube_matmul.pto | 47 + .../lib/a5/generated/a5_elementwise_add.pto | 51 + .../lib/a5/generated/a5_micro_vector_copy.pto | 11 + ptodsl/lib/a5/kernels.py | 224 ++ ptodsl/lib/a5/ops.py | 2094 +++++++++++++++++ ptodsl/lib/a5/tile_micro_coverage.py | 198 ++ pyproject.toml | 2 + scripts/generate_a5_pto.py | 84 + scripts/update_tile_micro_checklist.py | 21 + tests/regression/test_a5_lib_regression.py | 410 ++++ tests/regression/test_tile_micro_coverage.py | 39 + 15 files changed, 3277 insertions(+) create mode 100644 ptodsl/lib/__init__.py create mode 100644 ptodsl/lib/a5/README.md create mode 100644 ptodsl/lib/a5/TILE_MICRO_CHECKLIST.md create mode 100644 ptodsl/lib/a5/__init__.py create mode 100644 ptodsl/lib/a5/generated/a5_cube_matmul.pto create mode 100644 ptodsl/lib/a5/generated/a5_elementwise_add.pto create mode 100644 ptodsl/lib/a5/generated/a5_micro_vector_copy.pto create mode 100644 ptodsl/lib/a5/kernels.py create mode 100644 ptodsl/lib/a5/ops.py create mode 100644 ptodsl/lib/a5/tile_micro_coverage.py create mode 100644 scripts/generate_a5_pto.py create mode 100644 scripts/update_tile_micro_checklist.py create mode 100644 tests/regression/test_a5_lib_regression.py create mode 100644 tests/regression/test_tile_micro_coverage.py diff --git a/ptodsl/lib/__init__.py b/ptodsl/lib/__init__.py new file mode 100644 index 00000000..c8d47101 --- /dev/null +++ b/ptodsl/lib/__init__.py @@ -0,0 +1,3 @@ +from . 
import a5 + +__all__ = ["a5"] diff --git a/ptodsl/lib/a5/README.md b/ptodsl/lib/a5/README.md new file mode 100644 index 00000000..baa2c9ee --- /dev/null +++ b/ptodsl/lib/a5/README.md @@ -0,0 +1,27 @@ +# A5 Library Layer + +This directory contains a first PTODSL library-style translation layer for the +`pto-isa/include/pto/npu/a5` surface. + +The scope of this pass is: + +- Pythonic wrappers over PTO tile ops and selected micro instructions +- A5-flavored compatibility aliases such as `TLoad`, `TAdd`, `TMatmul`, and `TStore` +- Translated builder kernels that emit `.pto` through PTODSL +- A checked-in generation flow for reproducible `.pto` artifacts + +Entry points: + +- [`ops.py`](./ops.py): reusable A5-style helpers built on PTODSL and PTO dialect ops +- [`kernels.py`](./kernels.py): translated example kernels +- [`generated`](./generated): emitted `.pto` artifacts from `scripts/generate_a5_pto.py` + +Regenerate the current artifacts with: + +```bash +PYTHONPATH=/Users/zhoubot/github/.llvm-19.1.7/build-mlir-py312/tools/mlir/python_packages/mlir_core:/Users/zhoubot/github/pto-org/PTOAS/install-src312:/Users/zhoubot/github/pto-org/PTOAS/build-src312/python \ +/Users/zhoubot/github/.venv-ptoas-src312/bin/python scripts/generate_a5_pto.py +``` + +`--emit-cpp` is best-effort: the tile-based kernels lower through local `ptoas`, +while the direct micro-only kernel currently remains `.pto`-only in this environment. diff --git a/ptodsl/lib/a5/TILE_MICRO_CHECKLIST.md b/ptodsl/lib/a5/TILE_MICRO_CHECKLIST.md new file mode 100644 index 00000000..42f4175f --- /dev/null +++ b/ptodsl/lib/a5/TILE_MICRO_CHECKLIST.md @@ -0,0 +1,43 @@ +# Tile Micro Coverage + +- Total public tile ops: `32` +- Implemented: `26` +- Partial: `1` +- Pending: `0` +- Blocked: `4` +- Not applicable: `1` + +| tile op | status | helper | note | +| --- | --- | --- | --- | +| `mov` | `implemented` | `mov_micro` | UB stage + vlds/vsts copy loop. 
| +| `add` | `implemented` | `add_micro` | UB stage + constexpr-specialized TBinOp-style vlds/vadd/vsts lowering. | +| `sub` | `implemented` | `sub_micro` | UB stage + constexpr-specialized TBinOp-style vlds/vsub/vsts lowering. | +| `div` | `implemented` | `div_micro` | UB stage + constexpr-specialized TBinOp-style vlds/vdiv/vsts lowering. | +| `mul` | `implemented` | `mul_micro` | UB stage + constexpr-specialized TBinOp-style vlds/vmul/vsts lowering. | +| `or_` | `implemented` | `or_micro` | UB stage + constexpr-specialized TBinOp-style vlds/vor/vsts lowering. | +| `gather` | `partial` | `gather_micro` | Indexed gather is implemented via vgather2 for same-width source/index pairs; mask-pattern gather still needs unsupported vsqz-style micro support. | +| `exp` | `implemented` | `exp_micro` | UB stage + vlds/vexp/vsts loop. | +| `log` | `implemented` | `log_micro` | UB stage + vlds/vln/vsts loop. | +| `relu` | `implemented` | `relu_micro` | UB stage + vlds/vrelu/vsts loop. | +| `abs` | `implemented` | `abs_micro` | UB stage + vlds/vabs/vsts loop. | +| `sqrt` | `implemented` | `sqrt_micro` | UB stage + vlds/vsqrt/vsts loop. | +| `rsqrt` | `implemented` | `rsqrt_micro` | UB stage + vsqrt/vrec micro sequence. | +| `reciprocal` | `implemented` | `reciprocal_micro` | UB stage + vlds/vrec/vsts loop. | +| `matmul` | `blocked` | `-` | Cube/L0 path is not a pure vector-micro rewrite target. | +| `matmul_bias` | `blocked` | `-` | Cube/L0 path is not a pure vector-micro rewrite target. | +| `matmul_acc` | `blocked` | `-` | Cube/L0 path is not a pure vector-micro rewrite target. | +| `extract` | `blocked` | `-` | Layout/L0 extraction op, not a vector-micro compute rewrite. | +| `row_sum` | `implemented` | `row_sum_micro` | Static-shape row reduction via vcadd + point-store. | +| `row_min` | `implemented` | `row_min_micro` | Static-shape row reduction via vcmin + point-store. | +| `row_max` | `implemented` | `row_max_micro` | Static-shape row reduction via vcmax + point-store. 
| +| `row_expand` | `implemented` | `row_expand_micro` | Static-shape canonical broadcast via vldas/vldus/vdup/vsts. | +| `row_expand_sub` | `implemented` | `row_expand_sub_micro` | Static-shape canonical broadcast via vldas/vldus/vdup/vsub/vsts. | +| `row_expand_div` | `implemented` | `row_expand_div_micro` | Static-shape canonical broadcast via vldas/vldus/vdup/vdiv/vsts. | +| `row_expand_mul` | `implemented` | `row_expand_mul_micro` | Static-shape canonical broadcast via vldas/vldus/vdup/vmul/vsts. | +| `col_sum` | `implemented` | `col_sum_micro` | Static-shape TColReduceOps-style column reduction via vadd. | +| `col_min` | `implemented` | `col_min_micro` | Static-shape TColReduceOps-style column reduction via vmin. | +| `col_max` | `implemented` | `col_max_micro` | Static-shape TColReduceOps-style column reduction via vmax. | +| `col_expand` | `implemented` | `col_expand_micro` | Static-shape canonical broadcast via vlds/vsts replication. | +| `mrgsort` | `implemented` | `mrgsort_micro` | Single-list row-major merge sort via vmrgsort4. | +| `sort32` | `implemented` | `sort32_micro` | Static-shape block sort via vbitsort. | +| `subset` | `not_applicable` | `-` | View helper only, not a tile compute op. | diff --git a/ptodsl/lib/a5/__init__.py b/ptodsl/lib/a5/__init__.py new file mode 100644 index 00000000..6bc0da50 --- /dev/null +++ b/ptodsl/lib/a5/__init__.py @@ -0,0 +1,23 @@ +from . 
import ops +from .kernels import ( + KERNEL_BUILDERS, + build_cube_matmul, + build_elementwise_add, + build_micro_vector_copy, + build_mxfp8_matmul, + build_templated_elementwise_add, +) +from .ops import * +from .tile_micro_coverage import TILE_MICRO_COVERAGE, coverage_markdown, coverage_summary + +__all__ = list(ops.__all__) + [ + "KERNEL_BUILDERS", + "TILE_MICRO_COVERAGE", + "build_cube_matmul", + "build_elementwise_add", + "build_micro_vector_copy", + "build_mxfp8_matmul", + "build_templated_elementwise_add", + "coverage_markdown", + "coverage_summary", +] diff --git a/ptodsl/lib/a5/generated/a5_cube_matmul.pto b/ptodsl/lib/a5/generated/a5_cube_matmul.pto new file mode 100644 index 00000000..ae16b9d4 --- /dev/null +++ b/ptodsl/lib/a5/generated/a5_cube_matmul.pto @@ -0,0 +1,47 @@ +module { + func.func @a5_cube_matmul(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1 = arith.constant 1 : index + %0 = pto.make_tensor_view %arg0, shape = [%c16, %c32], strides = [%c32, %c1] : !pto.tensor_view + %c32_0 = arith.constant 32 : index + %c16_1 = arith.constant 16 : index + %c1_2 = arith.constant 1 : index + %1 = pto.make_tensor_view %arg1, shape = [%c32_0, %c16_1], strides = [%c16_1, %c1_2] : !pto.tensor_view + %c16_3 = arith.constant 16 : index + %c16_4 = arith.constant 16 : index + %c1_5 = arith.constant 1 : index + %2 = pto.make_tensor_view %arg2, shape = [%c16_3, %c16_4], strides = [%c16_4, %c1_5] : !pto.tensor_view + pto.section.cube { + %c0_6 = arith.constant 0 : index + %c0_7 = arith.constant 0 : index + %c16_8 = arith.constant 16 : index + %c32_9 = arith.constant 32 : index + %3 = pto.partition_view %0, offsets = [%c0_6, %c0_7], sizes = [%c16_8, %c32_9] : !pto.tensor_view -> !pto.partition_tensor_view<16x32xf16> + %4 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%3 : !pto.partition_tensor_view<16x32xf16>) outs(%4 : !pto.tile_buf) + %c0_10 = 
arith.constant 0 : index + %c0_11 = arith.constant 0 : index + %c32_12 = arith.constant 32 : index + %c16_13 = arith.constant 16 : index + %5 = pto.partition_view %1, offsets = [%c0_10, %c0_11], sizes = [%c32_12, %c16_13] : !pto.tensor_view -> !pto.partition_tensor_view<32x16xf16> + %6 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%5 : !pto.partition_tensor_view<32x16xf16>) outs(%6 : !pto.tile_buf) + %7 = pto.alloc_tile : !pto.tile_buf + %8 = pto.alloc_tile : !pto.tile_buf + %9 = pto.alloc_tile : !pto.tile_buf + pto.textract ins(%4, %c0, %c0 : !pto.tile_buf, index, index) outs(%7 : !pto.tile_buf) + pto.tmov ins(%6 : !pto.tile_buf) outs(%8 : !pto.tile_buf) + pto.tmatmul ins(%7, %8 : !pto.tile_buf, !pto.tile_buf) outs(%9 : !pto.tile_buf) + %c0_14 = arith.constant 0 : index + %c0_15 = arith.constant 0 : index + %c16_16 = arith.constant 16 : index + %c16_17 = arith.constant 16 : index + %10 = pto.partition_view %2, offsets = [%c0_14, %c0_15], sizes = [%c16_16, %c16_17] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32> + pto.tstore ins(%9 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x16xf32>) + } + return + } +} + diff --git a/ptodsl/lib/a5/generated/a5_elementwise_add.pto b/ptodsl/lib/a5/generated/a5_elementwise_add.pto new file mode 100644 index 00000000..1363c6d2 --- /dev/null +++ b/ptodsl/lib/a5/generated/a5_elementwise_add.pto @@ -0,0 +1,51 @@ +module { + func.func @a5_elementwise_add(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) { + %c1 = arith.constant 1 : index + %0 = pto.make_tensor_view %arg0, shape = [%arg3, %arg4], strides = [%arg4, %c1] : !pto.tensor_view + %c1_0 = arith.constant 1 : index + %1 = pto.make_tensor_view %arg1, shape = [%arg3, %arg4], strides = [%arg4, %c1_0] : !pto.tensor_view + %c1_1 = arith.constant 1 : index + %2 = pto.make_tensor_view %arg2, shape = [%arg3, %arg4], strides = [%arg4, %c1_1] : !pto.tensor_view + %c0 = arith.constant 0 : index + %c0_2 = arith.constant 0 : index + %c32 
= arith.constant 32 : index + %c32_3 = arith.constant 32 : index + %3 = pto.partition_view %0, offsets = [%c0, %c0_2], sizes = [%c32, %c32_3] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32> + %c0_4 = arith.constant 0 : index + %c0_5 = arith.constant 0 : index + %c32_6 = arith.constant 32 : index + %c32_7 = arith.constant 32 : index + %4 = pto.partition_view %1, offsets = [%c0_4, %c0_5], sizes = [%c32_6, %c32_7] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32> + %c0_8 = arith.constant 0 : index + %c0_9 = arith.constant 0 : index + %c32_10 = arith.constant 32 : index + %c32_11 = arith.constant 32 : index + %5 = pto.partition_view %2, offsets = [%c0_8, %c0_9], sizes = [%c32_10, %c32_11] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32> + pto.section.vector { + %c0_i64 = arith.constant 0 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c8192_i64 = arith.constant 8192 : i64 + %6 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %7 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf + %8 = pto.alloc_tile addr = %c8192_i64 : !pto.tile_buf + pto.tload ins(%3 : !pto.partition_tensor_view<32x32xf32>) outs(%6 : !pto.tile_buf) + pto.tload ins(%4 : !pto.partition_tensor_view<32x32xf32>) outs(%7 : !pto.tile_buf) + %9 = pto.castptr %c0_i64 : i64 -> !pto.ptr + %10 = pto.castptr %c4096_i64 : i64 -> !pto.ptr + %11 = pto.castptr %c8192_i64 : i64 -> !pto.ptr + %12 = pto.pset_b32 "PAT_ALL" : !pto.mask + %c0_12 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %c64 = arith.constant 64 : index + scf.for %arg5 = %c0_12 to %c1024 step %c64 { + %13 = pto.vlds %9[%arg5] : !pto.ptr -> !pto.vreg<64xf32> + %14 = pto.vlds %10[%arg5] : !pto.ptr -> !pto.vreg<64xf32> + %15 = pto.vadd %13, %14, %12 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %15, %11[%arg5], %12 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + pto.tstore ins(%8 : !pto.tile_buf) outs(%5 : !pto.partition_tensor_view<32x32xf32>) + } + return + } +} + 
diff --git a/ptodsl/lib/a5/generated/a5_micro_vector_copy.pto b/ptodsl/lib/a5/generated/a5_micro_vector_copy.pto new file mode 100644 index 00000000..a5c5b940 --- /dev/null +++ b/ptodsl/lib/a5/generated/a5_micro_vector_copy.pto @@ -0,0 +1,11 @@ +module { + func.func @a5_micro_vector_copy(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index) { + pto.section.vector { + %0 = pto.vlds %arg0[%arg2] : !pto.ptr -> !pto.vreg<64xf32> + %1 = pto.pset_b32 "PAT_ALL" : !pto.mask + pto.vsts %0, %arg1[%arg2], %1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + return + } +} + diff --git a/ptodsl/lib/a5/kernels.py b/ptodsl/lib/a5/kernels.py new file mode 100644 index 00000000..70f3f301 --- /dev/null +++ b/ptodsl/lib/a5/kernels.py @@ -0,0 +1,224 @@ +from mlir.dialects import pto as _raw_pto +from mlir.ir import IndexType + +from ... import Constexpr, pto, scalar as s, to_ir_module +from ...language import make_mxfp8 +from . import ops + + +def build_elementwise_add(*, rows=32, cols=32, tile_rows=32, tile_cols=32, dtype=None): + dtype = pto.float32 if dtype is None else dtype + + def meta_data(): + return { + "ptr_t": pto.ptr(dtype), + "index_t": IndexType.get(), + } + + @to_ir_module(meta_data=meta_data) + def a5_elementwise_add( + src0: "ptr_t", + src1: "ptr_t", + dst: "ptr_t", + n_rows: "index_t", + n_cols: "index_t", + ) -> None: + lhs = pto.make_tensor(src0, shape=[n_rows, n_cols], dtype=dtype) + rhs = pto.make_tensor(src1, shape=[n_rows, n_cols], dtype=dtype) + out = pto.make_tensor(dst, shape=[n_rows, n_cols], dtype=dtype) + + lhs_tile = lhs.slice([0, 0], [tile_rows, tile_cols]) + rhs_tile = rhs.slice([0, 0], [tile_rows, tile_cols]) + out_tile = out.slice([0, 0], [tile_rows, tile_cols]) + + with pto.vector_section(): + ops.add_micro( + lhs_tile, + rhs_tile, + out_tile, + dtype=dtype, + shape=[tile_rows, tile_cols], + ) + + return a5_elementwise_add + + +def build_templated_elementwise_add(*, dtype=None): + dtype = pto.float32 if dtype is None else dtype + + def meta_data(ROWS=32, 
COLS=32): + return { + "ptr_t": pto.ptr(dtype), + "shape": [ROWS, COLS], + } + + @to_ir_module(meta_data=meta_data) + def a5_templated_elementwise_add( + src0: "ptr_t", + src1: "ptr_t", + dst: "ptr_t", + ROWS: Constexpr[int] = 32, + COLS: Constexpr[int] = 32, + VF_IMPL: Constexpr[str] = ops.VF_IMPL_DEFAULT, + ) -> None: + lhs = pto.make_tensor(src0, shape=shape, dtype=dtype) + rhs = pto.make_tensor(src1, shape=shape, dtype=dtype) + out = pto.make_tensor(dst, shape=shape, dtype=dtype) + + with pto.vector_section(): + ops.add_micro( + lhs.slice([0, 0], shape), + rhs.slice([0, 0], shape), + out.slice([0, 0], shape), + dtype=dtype, + shape=shape, + impl=VF_IMPL, + ) + + return a5_templated_elementwise_add + + +def build_micro_vector_copy(*, lanes=64, dtype=None): + dtype = pto.float32 if dtype is None else dtype + + def meta_data(): + return { + "ptr_t": pto.ptr(dtype, space="VEC"), + "index_t": IndexType.get(), + } + + @to_ir_module(meta_data=meta_data) + def a5_micro_vector_copy(src: "ptr_t", dst: "ptr_t", offset: "index_t") -> None: + with pto.vector_section(): + ops.vector_copy(src, dst, offset, lanes=lanes, dtype=dtype) + + return a5_micro_vector_copy + + +def build_mxfp8_matmul(*, m=16, k=64, n=32, lhs_variant="e5m2", rhs_variant="e5m2"): + mx = make_mxfp8(lhs=lhs_variant, rhs=rhs_variant) + scale_k = mx.scale_k(k) + + def meta_data(): + return { + "ptr_lhs": pto.ptr(mx.lhs), + "ptr_rhs": pto.ptr(mx.rhs), + "ptr_scale": pto.ptr(mx.scale), + "ptr_bias": pto.ptr(mx.acc), + "ptr_out": pto.ptr(mx.acc), + } + + @to_ir_module(meta_data=meta_data) + def a5_mxfp8_matmul( + lhs_ptr: "ptr_lhs", + lhs_scale_ptr: "ptr_scale", + rhs_ptr: "ptr_rhs", + rhs_scale_ptr: "ptr_scale", + bias_ptr: "ptr_bias", + out_ptr: "ptr_out", + ) -> None: + lhs = pto.make_tensor(lhs_ptr, shape=[m, k], dtype=mx.lhs) + rhs = pto.make_tensor(rhs_ptr, shape=[k, n], dtype=mx.rhs) + lhs_scale = pto.make_tensor(lhs_scale_ptr, shape=[m, scale_k], dtype=mx.scale) + rhs_scale = 
pto.make_tensor(rhs_scale_ptr, shape=[scale_k, n], dtype=mx.scale) + bias = pto.make_tensor(bias_ptr, shape=[1, n], dtype=mx.acc) + out = pto.make_tensor(out_ptr, shape=[m, n], dtype=mx.acc) + + with pto.cube_section(): + lhs_tile = ops.load_tile( + lhs.slice([0, 0], [m, k]), dtype=mx.lhs, shape=[m, k], space="LEFT" + ) + rhs_tile = ops.load_tile( + rhs.slice([0, 0], [k, n]), dtype=mx.rhs, shape=[k, n], space="RIGHT" + ) + lhs_scale_tile = ops.load_tile( + lhs_scale.slice([0, 0], [m, scale_k]), + dtype=mx.scale, + shape=[m, scale_k], + space="SCALING", + config=pto.TileBufConfig( + blayout="RowMajor", + slayout="RowMajor", + s_fractal_size=_raw_pto.TileConfig.fractalMxSize, + ), + ) + rhs_scale_tile = ops.load_tile( + rhs_scale.slice([0, 0], [scale_k, n]), + dtype=mx.scale, + shape=[scale_k, n], + space="SCALING", + config=pto.TileBufConfig( + blayout="ColMajor", + slayout="ColMajor", + s_fractal_size=_raw_pto.TileConfig.fractalMxSize, + ), + ) + bias_tile = ops.load_tile( + bias.slice([0, 0], [1, n]), dtype=mx.acc, shape=[1, n], space="BIAS" + ) + acc_tile = pto.make_tile_buffer(mx.acc, [m, n], space="ACC").alloc() + ops.matmul_mx_bias( + lhs_tile, + lhs_scale_tile, + rhs_tile, + rhs_scale_tile, + bias_tile, + acc_tile, + ) + ops.store_tile(acc_tile, out.slice([0, 0], [m, n])) + + return a5_mxfp8_matmul + + +def build_cube_matmul(*, m=16, k=32, n=16, lhs_dtype=None, rhs_dtype=None, acc_dtype=None): + lhs_dtype = pto.float16 if lhs_dtype is None else lhs_dtype + rhs_dtype = pto.float16 if rhs_dtype is None else rhs_dtype + acc_dtype = pto.float32 if acc_dtype is None else acc_dtype + + def meta_data(): + return { + "ptr_lhs": pto.ptr(lhs_dtype), + "ptr_rhs": pto.ptr(rhs_dtype), + "ptr_out": pto.ptr(acc_dtype), + } + + @to_ir_module(meta_data=meta_data) + def a5_cube_matmul(lhs_ptr: "ptr_lhs", rhs_ptr: "ptr_rhs", out_ptr: "ptr_out") -> None: + c0 = s.const(0) + lhs = pto.make_tensor(lhs_ptr, shape=[m, k], dtype=lhs_dtype) + rhs = pto.make_tensor(rhs_ptr, shape=[k, 
n], dtype=rhs_dtype) + out = pto.make_tensor(out_ptr, shape=[m, n], dtype=acc_dtype) + + with pto.cube_section(): + lhs_mat = ops.load_tile( + lhs.slice([0, 0], [m, k]), dtype=lhs_dtype, shape=[m, k], space="MAT" + ) + rhs_mat = ops.load_tile( + rhs.slice([0, 0], [k, n]), dtype=rhs_dtype, shape=[k, n], space="MAT" + ) + lhs_tile = pto.make_tile_buffer(lhs_dtype, [m, k], space="LEFT").alloc() + rhs_tile = pto.make_tile_buffer(rhs_dtype, [k, n], space="RIGHT").alloc() + acc_tile = pto.make_tile_buffer(acc_dtype, [m, n], space="ACC").alloc() + ops.extract(lhs_mat, c0, c0, lhs_tile) + ops.move_tile(rhs_mat, rhs_tile) + ops.matmul(lhs_tile, rhs_tile, acc_tile) + ops.store_tile(acc_tile, out.slice([0, 0], [m, n])) + + return a5_cube_matmul + + +KERNEL_BUILDERS = { + "a5_elementwise_add": build_elementwise_add, + "a5_micro_vector_copy": build_micro_vector_copy, + "a5_cube_matmul": build_cube_matmul, +} + + +__all__ = [ + "KERNEL_BUILDERS", + "build_cube_matmul", + "build_elementwise_add", + "build_micro_vector_copy", + "build_mxfp8_matmul", + "build_templated_elementwise_add", +] diff --git a/ptodsl/lib/a5/ops.py b/ptodsl/lib/a5/ops.py new file mode 100644 index 00000000..689bd7da --- /dev/null +++ b/ptodsl/lib/a5/ops.py @@ -0,0 +1,2094 @@ +import builtins +import re + +from mlir.dialects import arith as _arith +from mlir.dialects import pto as _pto +from mlir.ir import IntegerAttr, IntegerType + +from ... import pto as _dsl_pto +from ... import scalar as _scalar +from ... 
import const_expr, range_constexpr +from ...api.scalar import _unwrap + + +VF_IMPL_DEFAULT = "default" +VF_IMPL_1D_NO_POST_UPDATE = "1d_no_post_update" +VF_IMPL_1D_POST_UPDATE = "1d_post_update" +VF_IMPL_2D_NO_POST_UPDATE = "2d_no_post_update" +VF_IMPL_2D_POST_UPDATE = "2d_post_update" + + +_DTYPE_ALIAS_GROUPS = { + "f32": {"f32", "float32"}, + "f16": {"f16", "float16", "half"}, + "bf16": {"bf16", "bfloat16"}, + "i32": {"i32", "int32"}, + "u32": {"u32", "uint32"}, + "i16": {"i16", "int16"}, + "u16": {"u16", "uint16"}, + "i8": {"i8", "int8"}, + "u8": {"u8", "uint8"}, +} + + +def _call(op, *args, **kwargs): + return op( + *(_unwrap(arg) for arg in args), + **{name: _unwrap(value) for name, value in kwargs.items()}, + ) + + +def _cmp_mode_attr(mode): + if mode is None: + return None + if isinstance(mode, str): + return _pto.CmpModeAttr.get(getattr(_pto.CmpMode, mode.upper())) + return mode + + +def _const_i64(value): + i64 = IntegerType.get_signless(64) + return _arith.ConstantOp(i64, IntegerAttr.get(i64, value)).result + + +def _const_i32(value): + i32 = IntegerType.get_signless(32) + return _arith.ConstantOp(i32, IntegerAttr.get(i32, value)).result + + +def _const_float(dtype, value): + return _arith.ConstantOp(_scalar.resolve_type(dtype), value).result + + +def _dtype_token(dtype): + text = str(_scalar.resolve_type(dtype)).lower() + for canonical, aliases in _DTYPE_ALIAS_GROUPS.items(): + if any(alias in text for alias in aliases): + return canonical + raise ValueError(f"Unsupported dtype token for '{dtype}'.") + + +def _dtype_byte_width(dtype): + text = str(dtype) + if "float32" in text or "f32" in text or "int32" in text or "i32" in text or "uint32" in text or "u32" in text: + return 4 + if "float16" in text or "f16" in text or "bfloat16" in text or "bf16" in text or "int16" in text or "i16" in text or "u16" in text: + return 2 + if "i8" in text or "u8" in text: + return 1 + raise ValueError(f"Unsupported dtype byte width for '{dtype}'.") + + +def 
_extract_static_tensor_shape(value): + raw = _unwrap(value) + type_obj = getattr(raw, "type", None) + if type_obj is None: + return None + text = str(type_obj) + match = re.search( + r"!pto\.(?:partition_)?tensor_view<(?P<payload>[^>]+)>|!pto\.tile_buf<[^,]+,\s*(?P<tile_payload>[^>]+)>", + text, + ) + if not match: + return None + payload = match.group("payload") or match.group("tile_payload") + dims = re.findall(r"(\?|\d+)x", payload) + if not dims: + return None + shape = [] + for dim in dims: + if dim == "?": + return None + shape.append(int(dim)) + return shape + + +def _extract_tensor_dtype_token(value): + raw = _unwrap(value) + type_obj = getattr(raw, "type", None) + if type_obj is None: + return None + text = str(type_obj).lower() + for canonical, aliases in _DTYPE_ALIAS_GROUPS.items(): + if any(alias in text for alias in aliases): + return canonical + return None + + +def _require_supported_dtype(dtype, *, allowed, message): + try: + token = _dtype_token(dtype) + except ValueError as exc: + raise ValueError(message) from exc + if token not in allowed: + raise ValueError(message) + return token + + +def _require_view_shape(view, expected_shape, *, context, message): + actual_shape = _extract_static_tensor_shape(view) + if actual_shape is None: + return + if list(actual_shape) != list(expected_shape): + raise ValueError(f"{message} Expected {expected_shape}, got {actual_shape}.") + + +def _require_view_dtype(view, dtype, *, message): + actual_token = _extract_tensor_dtype_token(view) + if actual_token is None: + return + if actual_token != _dtype_token(dtype): + raise ValueError(message) + + +def _micro_lane_count(dtype): + return 256 // _dtype_byte_width(dtype) + + +def _resolve_lanes(dtype, lanes): + if lanes is None: + return _micro_lane_count(dtype) + return lanes + + +def _full_mask(dtype): + width = _dtype_byte_width(dtype) + if width == 4: + return _dsl_pto.pset_b32(_dsl_pto.MaskType(), "PAT_ALL") + if width == 2: + return _dsl_pto.pset_b16(_dsl_pto.MaskType(), "PAT_ALL") +
if width == 1: + return _dsl_pto.pset_b8(_dsl_pto.MaskType(), "PAT_ALL") + raise ValueError(f"Unsupported dtype mask width for '{dtype}'.") + + +def _tail_mask(dtype, active_lanes): + i32 = IntegerType.get_signless(32) + width = _dtype_byte_width(dtype) + active = _const_i32(active_lanes) + if width == 4: + mask, _ = _dsl_pto.plt_b32(_dsl_pto.MaskType(), i32, active) + return mask + if width == 2: + mask, _ = _dsl_pto.plt_b16(_dsl_pto.MaskType(), i32, active) + return mask + if width == 1: + mask, _ = _dsl_pto.plt_b8(_dsl_pto.MaskType(), i32, active) + return mask + raise ValueError(f"Unsupported dtype tail mask width for '{dtype}'.") + + +def _mask_for_chunk(dtype, active_lanes): + lanes = _micro_lane_count(dtype) + if active_lanes == lanes: + return _full_mask(dtype) + return _tail_mask(dtype, active_lanes) + + +def _onept_dist(dtype): + width = _dtype_byte_width(dtype) + if width == 4: + return "ONEPT_B32" + if width == 2: + return "ONEPT_B16" + if width == 1: + return "ONEPT_B8" + raise ValueError(f"Unsupported dtype point-store width for '{dtype}'.") + + +def _normalize_vf_impl_kind(impl): + if impl is None: + return VF_IMPL_DEFAULT + + normalized = str(impl).strip().lower() + aliases = { + "default": VF_IMPL_DEFAULT, + "vfimpl_default": VF_IMPL_DEFAULT, + "1d_no_post_update": VF_IMPL_1D_NO_POST_UPDATE, + "vfimpl_1d_no_post_update": VF_IMPL_1D_NO_POST_UPDATE, + "1d_post_update": VF_IMPL_1D_POST_UPDATE, + "vfimpl_1d_post_update": VF_IMPL_1D_POST_UPDATE, + "2d_no_post_update": VF_IMPL_2D_NO_POST_UPDATE, + "vfimpl_2d_no_post_update": VF_IMPL_2D_NO_POST_UPDATE, + "2d_post_update": VF_IMPL_2D_POST_UPDATE, + "vfimpl_2d_post_update": VF_IMPL_2D_POST_UPDATE, + } + if normalized not in aliases: + supported = ", ".join(sorted(aliases)) + raise ValueError(f"Unsupported VF impl kind '{impl}'. 
Expected one of: {supported}.") + return aliases[normalized] + + +def _alloc_like_view(view, *, dtype, shape, space, valid_shape=None, config=None): + return _dsl_pto.make_tile_buffer( + dtype, + shape, + space=space, + valid_shape=valid_shape, + config=config, + ).alloc() + + +def load_tile( + view, + tile_buffer=None, + *, + dtype=None, + shape=None, + space="VEC", + valid_shape=None, + config=None, +): + if tile_buffer is None: + if dtype is None or shape is None: + raise ValueError( + "`load_tile(...)` requires either `tile_buffer=` or both `dtype=` and `shape=`." + ) + tile_buffer = _alloc_like_view( + view, + dtype=dtype, + shape=shape, + space=space, + valid_shape=valid_shape, + config=config, + ) + _dsl_pto.load(view, tile_buffer) + return tile_buffer + + +def store_tile(tile_buffer, view): + _dsl_pto.store(tile_buffer, view) + return view + + +def move_tile(source, dest): + _call(_pto.TMovOp, None, source, dest) + return dest + + +def add(lhs, rhs, out): + _call(_pto.TAddOp, lhs, rhs, out) + return out + + +def add_micro( + lhs_view, + rhs_view, + out_view, + *, + dtype, + shape, + lanes=None, + base_addr=0, + impl=VF_IMPL_DEFAULT, +): + return _binary_micro( + lhs_view, + rhs_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vadd", + impl=impl, + ) + + +def sub_micro( + lhs_view, + rhs_view, + out_view, + *, + dtype, + shape, + lanes=None, + base_addr=0, + impl=VF_IMPL_DEFAULT, +): + return _binary_micro( + lhs_view, + rhs_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vsub", + impl=impl, + ) + + +def mul_micro( + lhs_view, + rhs_view, + out_view, + *, + dtype, + shape, + lanes=None, + base_addr=0, + impl=VF_IMPL_DEFAULT, +): + return _binary_micro( + lhs_view, + rhs_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vmul", + impl=impl, + ) + + +def div_micro( + lhs_view, + rhs_view, + out_view, + *, + dtype, + 
shape, + lanes=None, + base_addr=0, + impl=VF_IMPL_DEFAULT, +): + return _binary_micro( + lhs_view, + rhs_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vdiv", + impl=impl, + ) + + +def or_micro( + lhs_view, + rhs_view, + out_view, + *, + dtype, + shape, + lanes=None, + base_addr=0, + impl=VF_IMPL_DEFAULT, +): + return _binary_micro( + lhs_view, + rhs_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vor", + impl=impl, + ) + + +def mov_micro(src_view, out_view, *, dtype, shape, lanes=None, base_addr=0): + return _unary_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name=None, + ) + + +def exp_micro(src_view, out_view, *, dtype, shape, lanes=None, base_addr=0): + return _unary_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vexp", + ) + + +def log_micro(src_view, out_view, *, dtype, shape, lanes=None, base_addr=0): + return _unary_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vln", + ) + + +def relu_micro(src_view, out_view, *, dtype, shape, lanes=None, base_addr=0): + return _unary_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vrelu", + ) + + +def abs_micro(src_view, out_view, *, dtype, shape, lanes=None, base_addr=0): + return _unary_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vabs", + ) + + +def sqrt_micro(src_view, out_view, *, dtype, shape, lanes=None, base_addr=0): + return _unary_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vsqrt", + ) + + +def rsqrt_micro(src_view, out_view, *, dtype, shape, lanes=None, base_addr=0): + return _rsqrt_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, 
+ lanes=lanes, + base_addr=base_addr, + ) + + +def reciprocal_micro(src_view, out_view, *, dtype, shape, lanes=None, base_addr=0): + return _unary_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + lanes=lanes, + base_addr=base_addr, + op_name="vrec", + ) + + +def gather_micro( + src_view, + indices_view, + out_view, + *, + dtype, + index_dtype, + shape, + base_addr=0, +): + return _gather_micro( + src_view, + indices_view, + out_view, + dtype=dtype, + index_dtype=index_dtype, + shape=shape, + base_addr=base_addr, + ) + + +def col_expand_micro(src_view, out_view, *, dtype, shape, base_addr=0): + rows, cols = _check_col_expand_operands( + src_view, out_view, dtype=dtype, shape=shape, context="TCOLEXPAND" + ) + lanes = _micro_lane_count(dtype) + vreg_type = _dsl_pto.VRegType(lanes, dtype) + buf_bytes = rows * cols * _dtype_byte_width(dtype) + + src_addr = _const_i64(base_addr) + out_addr = _const_i64(base_addr + buf_bytes) + + src_tile = _dsl_pto.make_tile_buffer( + dtype, shape, space="VEC", valid_shape=[1, cols] + ).alloc(addr=src_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr) + + _dsl_pto.load(src_view, src_tile) + + src_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), src_addr) + out_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), out_addr) + + for col in range(0, cols, lanes): + active = builtins.min(lanes, cols - col) + mask = _mask_for_chunk(dtype, active) + col_offset = _scalar.const(col) + vec = _dsl_pto.vlds(vreg_type, src_ptr, col_offset) + for row in range(rows): + dst_offset = _scalar.const(row * cols + col) + _dsl_pto.vsts(vec, out_ptr, dst_offset, mask) + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def row_expand_micro(src_view, out_view, *, dtype, shape, base_addr=0): + rows, cols = _check_row_expand_operands( + src_view, out_view, dtype=dtype, shape=shape, context="TROWEXPAND" + ) + lanes = _micro_lane_count(dtype) + vreg_type = _dsl_pto.VRegType(lanes, dtype) 
+ buf_bytes = rows * cols * _dtype_byte_width(dtype) + + src_addr = _const_i64(base_addr) + out_addr = _const_i64(base_addr + buf_bytes) + + src_tile = _dsl_pto.make_tile_buffer( + dtype, shape, space="VEC", valid_shape=[rows, 1] + ).alloc(addr=src_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr) + + _dsl_pto.load(src_view, src_tile) + + src_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), src_addr) + out_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), out_addr) + + for row in range(rows): + scalar_offset = _scalar.const(row * cols) + align = _dsl_pto.vldas(_dsl_pto.AlignType(), src_ptr, scalar_offset) + scalar_vec, _, _ = _dsl_pto.vldus( + vreg_type, + _dsl_pto.AlignType(), + _dsl_pto.ptr(dtype, space="VEC"), + src_ptr, + scalar_offset, + align, + ) + broadcast = _dsl_pto.vdup(vreg_type, scalar_vec, position="POS_LOWEST") + for col in range(0, cols, lanes): + active = builtins.min(lanes, cols - col) + mask = _mask_for_chunk(dtype, active) + dst_offset = _scalar.const(row * cols + col) + _dsl_pto.vsts(broadcast, out_ptr, dst_offset, mask) + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def row_expand_sub_micro(base_view, expand_view, out_view, *, dtype, shape, base_addr=0): + return _row_expand_binary_micro( + base_view, + expand_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + op_name="vsub", + ) + + +def row_expand_mul_micro(base_view, expand_view, out_view, *, dtype, shape, base_addr=0): + return _row_expand_binary_micro( + base_view, + expand_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + op_name="vmul", + ) + + +def row_expand_div_micro(base_view, expand_view, out_view, *, dtype, shape, base_addr=0): + return _row_expand_binary_micro( + base_view, + expand_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + op_name="vdiv", + ) + + +def row_sum_micro(src_view, out_view, *, dtype, shape, base_addr=0): + return 
_row_reduce_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + reduce_op_name="vcadd", + combine_op_name="vadd", + init_value=0.0, + ) + + +def row_max_micro(src_view, out_view, *, dtype, shape, base_addr=0): + return _row_reduce_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + reduce_op_name="vcmax", + combine_op_name="vmax", + init_value=float("-inf"), + ) + + +def row_min_micro(src_view, out_view, *, dtype, shape, base_addr=0): + return _row_reduce_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + reduce_op_name="vcmin", + combine_op_name="vmin", + init_value=float("inf"), + ) + + +def col_sum_micro(src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL_DEFAULT): + return _col_reduce_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + reduce_op_name="vadd", + impl=impl, + ) + + +def col_max_micro(src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL_DEFAULT): + return _col_reduce_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + reduce_op_name="vmax", + impl=impl, + ) + + +def col_min_micro(src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL_DEFAULT): + return _col_reduce_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + reduce_op_name="vmin", + impl=impl, + ) + + +def mrgsort_micro(src_view, out_view, *, dtype, shape, block_len, base_addr=0): + return _mrgsort_micro( + src_view, + out_view, + dtype=dtype, + shape=shape, + block_len=block_len, + base_addr=base_addr, + ) + + +def sort32_micro(src_view, idx_view, out_view, *, dtype, shape, base_addr=0): + return _sort32_micro( + src_view, + idx_view, + out_view, + dtype=dtype, + shape=shape, + base_addr=base_addr, + ) + + +def _require_static_matrix_shape(shape, *, context): + if len(shape) != 2 or any(not isinstance(dim, int) for dim in shape): + raise ValueError(f"{context} 
currently requires a static rank-2 integer shape.") + rows, cols = shape + if rows <= 0 or cols <= 0: + raise ValueError(f"{context} requires positive row/column sizes.") + return rows, cols + + +def _check_tbinop_operands(lhs_view, rhs_view, out_view, *, dtype, shape, context): + rows, cols = _require_static_matrix_shape(shape, context=context) + _require_supported_dtype( + dtype, + allowed={"f32", "f16", "bf16", "i32", "u32", "i16", "u16", "i8", "u8"}, + message=f"Fix: {context} has invalid data type.", + ) + for view, label in ((lhs_view, "src0"), (rhs_view, "src1"), (out_view, "dst")): + _require_view_shape( + view, + [rows, cols], + context=context, + message=f"Fix: {context} input tile {label} valid shape mismatch with output tile dst shape.", + ) + _require_view_dtype( + view, + dtype, + message=f"Fix: {context} input tile src0, src1 and dst tile data type mismatch.", + ) + return rows, cols + + +def _check_row_expand_operands(src_view, out_view, *, dtype, shape, context): + rows, cols = _require_static_matrix_shape(shape, context=context) + _require_supported_dtype( + dtype, + allowed={"f32", "f16", "bf16", "i32", "u32", "i16", "u16", "i8", "u8"}, + message=f"Fix: {context} data type must be b8/b16/b32", + ) + _require_view_shape( + src_view, + [rows, 1], + context=context, + message=f"Fix: {context} source valid shape must be [rows, 1].", + ) + _require_view_shape( + out_view, + [rows, cols], + context=context, + message=f"Fix: {context} output valid shape mismatch.", + ) + _require_view_dtype( + src_view, + dtype, + message=f"Fix: {context} input data type must be consistent with the output data type.", + ) + _require_view_dtype( + out_view, + dtype, + message=f"Fix: {context} input data type must be consistent with the output data type.", + ) + return rows, cols + + +def _check_col_expand_operands(src_view, out_view, *, dtype, shape, context): + rows, cols = _require_static_matrix_shape(shape, context=context) + _require_supported_dtype( + dtype, + 
allowed={"f32", "f16", "bf16", "i32", "u32", "i16", "u16", "i8", "u8"}, + message=f"Fix: {context} data type must be b8/b16/b32", + ) + _require_view_shape( + src_view, + [1, cols], + context=context, + message=f"Fix: {context} input valid col must be consistent with output valid col.", + ) + _require_view_shape( + out_view, + [rows, cols], + context=context, + message=f"Fix: {context} output valid shape mismatch.", + ) + _require_view_dtype( + src_view, + dtype, + message=f"Fix: {context} input data type must be consistent with the output data type.", + ) + _require_view_dtype( + out_view, + dtype, + message=f"Fix: {context} input data type must be consistent with the output data type.", + ) + return rows, cols + + +def _check_row_reduce_operands(src_view, out_view, *, dtype, shape, context): + rows, cols = _require_static_matrix_shape(shape, context=context) + _require_supported_dtype( + dtype, + allowed={"f32", "f16", "i32", "i16"}, + message=( + "Row reduction only supports 'half', 'float', 'int32', or 'int16' data types. " + "Fix: Define TileDataIn with DType = half, float, int32, or int16." 
+ ), + ) + _require_view_shape( + src_view, + [rows, cols], + context=context, + message="Fix: Ensure src valid shape matches [rows, cols].", + ) + _require_view_shape( + out_view, + [rows, 1], + context=context, + message="Fix: Pass dstValidRow = srcValidRows and use a single-column output tile.", + ) + _require_view_dtype( + src_view, + dtype, + message="Fix: Ensure TileDataOut uses the same DType as TileDataIn.", + ) + _require_view_dtype( + out_view, + dtype, + message="Fix: Ensure TileDataOut uses the same DType as TileDataIn.", + ) + return rows, cols + + +def _check_col_reduce_operands(src_view, out_view, *, dtype, shape, context): + rows, cols = _require_static_matrix_shape(shape, context=context) + _require_supported_dtype( + dtype, + allowed={"f32", "f16", "bf16", "i32", "u32", "i16", "u16", "i8", "u8"}, + message=f"Fix: {context} input data type is not supported by this instruction.", + ) + _require_view_shape( + src_view, + [rows, cols], + context=context, + message=f"Fix: {context} input shape mismatch.", + ) + _require_view_shape( + out_view, + [1, cols], + context=context, + message=f"Fix: {context} input valid row must be consistent with the output valid row.", + ) + _require_view_dtype( + src_view, + dtype, + message=f"Fix: {context} input data type must be consistent with the output data type.", + ) + _require_view_dtype( + out_view, + dtype, + message=f"Fix: {context} input data type must be consistent with the output data type.", + ) + return rows, cols + + +def _check_gather_operands(src_view, indices_view, out_view, *, dtype, index_dtype, shape): + rows, cols = _require_static_matrix_shape(shape, context="TGATHER") + dtype_token = _require_supported_dtype( + dtype, + allowed={"f32", "f16", "i32", "u32", "i16", "u16"}, + message="Fix: TGATHER Src data type must be int16_t/uint16_t/int32_t/uint32_t/half/float.", + ) + index_token = _require_supported_dtype( + index_dtype, + allowed={"i32", "u32", "i16", "u16"}, + message="Fix: TGATHER expect 
b16/b32", + ) + if _dtype_byte_width(dtype) != _dtype_byte_width(index_dtype): + raise ValueError( + "Fix: TGATHER micro lowering currently supports same-width source/index pairs only." + ) + for view, expected_shape, label in ( + (src_view, [rows, cols], "src"), + (indices_view, [rows, cols], "indices"), + (out_view, [rows, cols], "dst"), + ): + _require_view_shape( + view, + expected_shape, + context="TGATHER", + message=f"Fix: TGATHER {label} shape mismatch.", + ) + _require_view_dtype( + src_view, + dtype, + message="Fix: TGATHER expect same type size for dst and src", + ) + _require_view_dtype( + out_view, + dtype, + message="Fix: TGATHER expect same type size for dst and src", + ) + _require_view_dtype( + indices_view, + index_dtype, + message="Fix: TGATHER expect b16/b32", + ) + return rows, cols, dtype_token, index_token + + +def _check_mrgsort_operands(src_view, out_view, *, dtype, shape, block_len): + rows, cols = _require_static_matrix_shape(shape, context="TMRGSORT") + _require_supported_dtype( + dtype, + allowed={"f32", "f16"}, + message="TMrgsort: Unsupported data type! 
Supported types is half/float", + ) + if rows != 1: + raise ValueError("TMrgsort: the row of Destination and Source tile must be 1.") + if block_len <= 0 or cols % (block_len * 4) != 0: + raise ValueError("TMrgsort: src columns must be divisible by blockLen * 4.") + _require_view_shape( + src_view, + [rows, cols], + context="TMRGSORT", + message="TMrgsort: source tile shape mismatch.", + ) + _require_view_shape( + out_view, + [rows, cols], + context="TMRGSORT", + message="TMrgsort: destination tile shape mismatch.", + ) + _require_view_dtype( + src_view, + dtype, + message="TMrgsort: Destination and Source tile data types must be the same.", + ) + _require_view_dtype( + out_view, + dtype, + message="TMrgsort: Destination and Source tile data types must be the same.", + ) + return rows, cols + + +def _check_sort32_operands(src_view, idx_view, out_view, *, dtype, shape): + rows, cols = _require_static_matrix_shape(shape, context="TSORT32") + _require_supported_dtype( + dtype, + allowed={"f32", "f16"}, + message="Dst and src must be float or half.", + ) + out_cols = cols * (2 if _dtype_token(dtype) == "f32" else 4) + for view, expected_shape, label in ( + (src_view, [rows, cols], "src"), + (idx_view, [rows, cols], "idx"), + (out_view, [rows, out_cols], "dst"), + ): + _require_view_shape( + view, + expected_shape, + context="TSORT32", + message=f"TSORT32 {label} shape mismatch.", + ) + _require_view_dtype( + src_view, + dtype, + message="Dst and src mube be same.", + ) + _require_view_dtype( + out_view, + dtype, + message="Dst and src mube be same.", + ) + _require_view_dtype( + idx_view, + _dsl_pto.uint32, + message="Idx must be uint32_t.", + ) + if cols % 32 != 0: + raise ValueError("TSORT32 micro lowering currently requires column count divisible by 32.") + return rows, cols, out_cols + + +def _row_expand_binary_micro(base_view, expand_view, out_view, *, dtype, shape, base_addr, op_name): + rows, cols = _check_row_expand_operands( + expand_view, out_view, 
dtype=dtype, shape=shape, context=f"TROWEXPAND_{op_name[1:].upper()}" + ) + _require_view_shape( + base_view, + [rows, cols], + context=op_name, + message=f"Fix: TROWEXPAND_{op_name[1:].upper()} base input valid shape mismatch with output tile dst shape.", + ) + _require_view_dtype( + base_view, + dtype, + message=f"Fix: TROWEXPAND_{op_name[1:].upper()} input data type must be consistent with the output data type.", + ) + lanes = _micro_lane_count(dtype) + vreg_type = _dsl_pto.VRegType(lanes, dtype) + buf_bytes = rows * cols * _dtype_byte_width(dtype) + + base_addr_value = _const_i64(base_addr) + expand_addr_value = _const_i64(base_addr + buf_bytes) + out_addr_value = _const_i64(base_addr + buf_bytes * 2) + + base_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=base_addr_value) + expand_tile = _dsl_pto.make_tile_buffer( + dtype, shape, space="VEC", valid_shape=[rows, 1] + ).alloc(addr=expand_addr_value) + out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr_value) + + _dsl_pto.load(base_view, base_tile) + _dsl_pto.load(expand_view, expand_tile) + + base_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), base_addr_value) + expand_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), expand_addr_value) + out_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), out_addr_value) + micro_op = getattr(_dsl_pto, op_name) + + for row in range(rows): + scalar_offset = _scalar.const(row * cols) + align = _dsl_pto.vldas(_dsl_pto.AlignType(), expand_ptr, scalar_offset) + scalar_vec, _, _ = _dsl_pto.vldus( + vreg_type, + _dsl_pto.AlignType(), + _dsl_pto.ptr(dtype, space="VEC"), + expand_ptr, + scalar_offset, + align, + ) + broadcast = _dsl_pto.vdup(vreg_type, scalar_vec, position="POS_LOWEST") + for col in range(0, cols, lanes): + active = builtins.min(lanes, cols - col) + mask = _mask_for_chunk(dtype, active) + row_offset = _scalar.const(row * cols + col) + base_vec = _dsl_pto.vlds(vreg_type, base_ptr, 
row_offset) + out_vec = micro_op(vreg_type, base_vec, broadcast, mask) + _dsl_pto.vsts(out_vec, out_ptr, row_offset, mask) + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def _row_reduce_micro( + src_view, + out_view, + *, + dtype, + shape, + base_addr, + reduce_op_name, + combine_op_name, + init_value, +): + rows, cols = _check_row_reduce_operands( + src_view, out_view, dtype=dtype, shape=shape, context="TROWREDUCE" + ) + width = _dtype_byte_width(dtype) + if width not in {2, 4}: + raise ValueError(f"{reduce_op_name} currently supports only float16/float32.") + + lanes = _micro_lane_count(dtype) + vreg_type = _dsl_pto.VRegType(lanes, dtype) + buf_bytes = rows * cols * width + + src_addr = _const_i64(base_addr) + out_addr = _const_i64(base_addr + buf_bytes) + + src_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=src_addr) + out_tile = _dsl_pto.make_tile_buffer( + dtype, shape, space="VEC", valid_shape=[rows, 1] + ).alloc(addr=out_addr) + + _dsl_pto.load(src_view, src_tile) + + src_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), src_addr) + out_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), out_addr) + reduce_op = getattr(_dsl_pto, reduce_op_name) + combine_op = getattr(_dsl_pto, combine_op_name) + full_mask = _full_mask(dtype) + point_mask = _tail_mask(dtype, 1) + init_scalar = _const_float(dtype, init_value) + + for row in range(rows): + accum = _dsl_pto.vbr(vreg_type, init_scalar) + for col in range(0, cols, lanes): + active = builtins.min(lanes, cols - col) + mask = _mask_for_chunk(dtype, active) + offset = _scalar.const(row * cols + col) + vec = _dsl_pto.vlds(vreg_type, src_ptr, offset) + reduced = reduce_op(vreg_type, vec, mask) + accum = combine_op(vreg_type, accum, reduced, full_mask) + out_offset = _scalar.const(row * cols) + _dsl_pto.vsts(accum, out_ptr, out_offset, point_mask, dist=_onept_dist(dtype)) + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def _col_reduce_micro( + src_view, + 
out_view, + *, + dtype, + shape, + base_addr, + reduce_op_name, + impl, +): + rows, cols = _check_col_reduce_operands( + src_view, out_view, dtype=dtype, shape=shape, context="TCOLREDUCE" + ) + lanes = _micro_lane_count(dtype) + buf_bytes = rows * cols * _dtype_byte_width(dtype) + + src_addr = _const_i64(base_addr) + out_addr = _const_i64(base_addr + buf_bytes) + + src_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=src_addr) + out_tile = _dsl_pto.make_tile_buffer( + dtype, [1, cols], space="VEC", valid_shape=[1, cols] + ).alloc(addr=out_addr) + + _dsl_pto.load(src_view, src_tile) + + ptr_type = _dsl_pto.ptr(dtype, space="VEC") + vreg_type = _dsl_pto.VRegType(lanes, dtype) + src_ptr = _dsl_pto.castptr(ptr_type, src_addr) + out_ptr = _dsl_pto.castptr(ptr_type, out_addr) + reduce_op = getattr(_dsl_pto, reduce_op_name) + impl_kind = _normalize_vf_impl_kind(impl) + if const_expr(impl_kind == VF_IMPL_DEFAULT): + impl_kind = VF_IMPL_1D_POST_UPDATE + + if const_expr(impl_kind in {VF_IMPL_1D_NO_POST_UPDATE, VF_IMPL_2D_NO_POST_UPDATE}): + _col_reduce_micro_no_post_update( + src_ptr, + out_ptr, + dtype=dtype, + rows=rows, + cols=cols, + lanes=lanes, + vreg_type=vreg_type, + reduce_op=reduce_op, + ) + elif const_expr(impl_kind in {VF_IMPL_1D_POST_UPDATE, VF_IMPL_2D_POST_UPDATE}): + _col_reduce_micro_post_update( + src_ptr, + out_ptr, + ptr_type=ptr_type, + dtype=dtype, + rows=rows, + cols=cols, + lanes=lanes, + vreg_type=vreg_type, + reduce_op=reduce_op, + ) + else: + raise ValueError(f"Unexpected normalized VF impl kind '{impl_kind}'.") + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def _col_reduce_micro_no_post_update( + src_ptr, out_ptr, *, dtype, rows, cols, lanes, vreg_type, reduce_op +): + loop_pairs = (rows - 1) // 2 + remain = (rows - 1) % 2 + for col in range_constexpr(0, cols, lanes): + active = builtins.min(lanes, cols - col) + mask = _mask_for_chunk(dtype, active) + accum = _dsl_pto.vlds(vreg_type, src_ptr, _scalar.const(col)) 
+ for pair in range_constexpr(loop_pairs): + row0 = 2 * pair + 1 + row1 = 2 * pair + 2 + src0 = _dsl_pto.vlds(vreg_type, src_ptr, _scalar.const(col + row0 * cols)) + src1 = _dsl_pto.vlds(vreg_type, src_ptr, _scalar.const(col + row1 * cols)) + tmp = reduce_op(vreg_type, src0, src1, mask) + accum = reduce_op(vreg_type, accum, tmp, mask) + if const_expr(remain): + tail_row = 2 * loop_pairs + 1 + src_tail = _dsl_pto.vlds(vreg_type, src_ptr, _scalar.const(col + tail_row * cols)) + accum = reduce_op(vreg_type, accum, src_tail, mask) + _dsl_pto.vsts(accum, out_ptr, _scalar.const(col), mask) + + +def _col_reduce_micro_post_update( + src_ptr, out_ptr, *, ptr_type, dtype, rows, cols, lanes, vreg_type, reduce_op +): + src_cursor = src_ptr + out_cursor = out_ptr + loop_pairs = (rows - 1) // 2 + remain = (rows - 1) % 2 + lane_step = _scalar.const(lanes) + pair_stride = _scalar.const(cols * 2) + for col in range_constexpr(0, cols, lanes): + active = builtins.min(lanes, cols - col) + mask = _mask_for_chunk(dtype, active) + chunk_base = src_cursor + accum, src_cursor = _dsl_pto.vlds_post(vreg_type, ptr_type, src_cursor, lane_step) + row0_ptr = _dsl_pto.addptr(chunk_base, _scalar.const(cols)) + row1_ptr = _dsl_pto.addptr(chunk_base, _scalar.const(cols * 2)) + for _ in range_constexpr(loop_pairs): + src0, row0_ptr = _dsl_pto.vlds_post(vreg_type, ptr_type, row0_ptr, pair_stride) + src1, row1_ptr = _dsl_pto.vlds_post(vreg_type, ptr_type, row1_ptr, pair_stride) + tmp = reduce_op(vreg_type, src0, src1, mask) + accum = reduce_op(vreg_type, accum, tmp, mask) + if const_expr(remain): + src_tail = _dsl_pto.vlds(vreg_type, row0_ptr, _scalar.const(0)) + accum = reduce_op(vreg_type, accum, src_tail, mask) + out_cursor = _dsl_pto.vsts_post(ptr_type, accum, out_cursor, lane_step, mask) + + +def _gather_micro( + src_view, + indices_view, + out_view, + *, + dtype, + index_dtype, + shape, + base_addr, +): + rows, cols, _, _ = _check_gather_operands( + src_view, indices_view, out_view, dtype=dtype, 
index_dtype=index_dtype, shape=shape + ) + src_bytes = rows * cols * _dtype_byte_width(dtype) + idx_bytes = rows * cols * _dtype_byte_width(index_dtype) + + src_addr = _const_i64(base_addr) + idx_addr = _const_i64(base_addr + src_bytes) + out_addr = _const_i64(base_addr + src_bytes + idx_bytes) + + src_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=src_addr) + idx_tile = _dsl_pto.make_tile_buffer(index_dtype, shape, space="VEC").alloc(addr=idx_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr) + + _dsl_pto.load(src_view, src_tile) + _dsl_pto.load(indices_view, idx_tile) + + src_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), src_addr) + idx_ptr = _dsl_pto.castptr(_dsl_pto.ptr(index_dtype, space="VEC"), idx_addr) + out_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), out_addr) + lanes = _micro_lane_count(dtype) + vreg_type = _dsl_pto.VRegType(lanes, dtype) + index_vreg_type = _dsl_pto.VRegType(_micro_lane_count(index_dtype), index_dtype) + + for row in range_constexpr(rows): + row_base = row * cols + for col in range_constexpr(0, cols, lanes): + active = builtins.min(lanes, cols - col) + offset = _scalar.const(row_base + col) + mask = _mask_for_chunk(dtype, active) + idx_vec = _dsl_pto.vlds(index_vreg_type, idx_ptr, offset) + out_vec = _dsl_pto.vgather2(vreg_type, src_ptr, idx_vec, _scalar.const(active)) + _dsl_pto.vsts(out_vec, out_ptr, offset, mask) + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def _mrgsort_micro(src_view, out_view, *, dtype, shape, block_len, base_addr): + _, cols = _check_mrgsort_operands(src_view, out_view, dtype=dtype, shape=shape, block_len=block_len) + src_addr = _const_i64(base_addr) + out_addr = _const_i64(base_addr + cols * _dtype_byte_width(dtype)) + + src_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=src_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr) + 
_dsl_pto.load(src_view, src_tile) + + ptr_type = _dsl_pto.ptr(dtype, space="VEC") + src_ptr = _dsl_pto.castptr(ptr_type, src_addr) + out_ptr = _dsl_pto.castptr(ptr_type, out_addr) + + src1_ptr = _dsl_pto.addptr(src_ptr, _scalar.const(block_len)) + src2_ptr = _dsl_pto.addptr(src_ptr, _scalar.const(block_len * 2)) + src3_ptr = _dsl_pto.addptr(src_ptr, _scalar.const(block_len * 3)) + + num_structures = (block_len * _dtype_byte_width(dtype)) >> 3 + count_value = ( + num_structures + | (num_structures << 16) + | (num_structures << 32) + | (num_structures << 48) + ) + repeat_times = cols // (block_len * 4) + config_value = repeat_times | (0b1111 << 8) + + _dsl_pto.vmrgsort4( + out_ptr, + src_ptr, + src1_ptr, + src2_ptr, + src3_ptr, + _const_i64(count_value), + _const_i64(config_value), + ) + _dsl_pto.store(out_tile, out_view) + return out_view + + +def _sort32_micro(src_view, idx_view, out_view, *, dtype, shape, base_addr): + rows, cols, out_cols = _check_sort32_operands(src_view, idx_view, out_view, dtype=dtype, shape=shape) + src_bytes = rows * cols * _dtype_byte_width(dtype) + idx_bytes = rows * cols * 4 + + src_addr = _const_i64(base_addr) + idx_addr = _const_i64(base_addr + src_bytes) + out_addr = _const_i64(base_addr + src_bytes + idx_bytes) + + src_tile = _dsl_pto.make_tile_buffer(dtype, [rows, cols], space="VEC").alloc(addr=src_addr) + idx_tile = _dsl_pto.make_tile_buffer(_dsl_pto.uint32, [rows, cols], space="VEC").alloc(addr=idx_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, [rows, out_cols], space="VEC").alloc(addr=out_addr) + + _dsl_pto.load(src_view, src_tile) + _dsl_pto.load(idx_view, idx_tile) + + src_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), src_addr) + idx_ptr = _dsl_pto.castptr(_dsl_pto.ptr(_dsl_pto.uint32, space="VEC"), idx_addr) + out_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), out_addr) + repeat_times = _scalar.const(cols // 32) + + for row in range_constexpr(rows): + src_row = _dsl_pto.addptr(src_ptr, 
_scalar.const(row * cols)) + idx_row = _dsl_pto.addptr(idx_ptr, _scalar.const(row * cols)) + out_row = _dsl_pto.addptr(out_ptr, _scalar.const(row * out_cols)) + _dsl_pto.vbitsort(out_row, src_row, idx_row, repeat_times) + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def _binary_micro(lhs_view, rhs_view, out_view, *, dtype, shape, lanes, base_addr, op_name, impl): + rows, cols = _check_tbinop_operands( + lhs_view, rhs_view, out_view, dtype=dtype, shape=shape, context=op_name.upper().replace("V", "T", 1) + ) + lanes = _resolve_lanes(dtype, lanes) + element_count = rows * cols + buf_bytes = element_count * _dtype_byte_width(dtype) + lhs_addr = _const_i64(base_addr) + rhs_addr = _const_i64(base_addr + buf_bytes) + out_addr = _const_i64(base_addr + buf_bytes * 2) + + lhs_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=lhs_addr) + rhs_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=rhs_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr) + + _dsl_pto.load(lhs_view, lhs_tile) + _dsl_pto.load(rhs_view, rhs_tile) + + ptr_type = _dsl_pto.ptr(dtype, space="VEC") + vreg_type = _dsl_pto.VRegType(lanes, dtype) + lhs_ptr = _dsl_pto.castptr(ptr_type, lhs_addr) + rhs_ptr = _dsl_pto.castptr(ptr_type, rhs_addr) + out_ptr = _dsl_pto.castptr(ptr_type, out_addr) + micro_op = getattr(_dsl_pto, op_name) + impl_kind = _normalize_vf_impl_kind(impl) + is_contiguous = rows == 1 or cols == element_count + if const_expr(impl_kind == VF_IMPL_DEFAULT): + impl_kind = VF_IMPL_1D_POST_UPDATE if is_contiguous else VF_IMPL_2D_NO_POST_UPDATE + + if const_expr(impl_kind == VF_IMPL_1D_NO_POST_UPDATE): + _binary_micro_1d_no_post_update( + lhs_ptr, + rhs_ptr, + out_ptr, + dtype=dtype, + lanes=lanes, + element_count=element_count, + vreg_type=vreg_type, + micro_op=micro_op, + ) + elif const_expr(impl_kind == VF_IMPL_1D_POST_UPDATE): + _binary_micro_1d_post_update( + lhs_ptr, + rhs_ptr, + out_ptr, + 
ptr_type=ptr_type, + dtype=dtype, + lanes=lanes, + element_count=element_count, + vreg_type=vreg_type, + micro_op=micro_op, + ) + elif const_expr(impl_kind == VF_IMPL_2D_NO_POST_UPDATE): + _binary_micro_2d_no_post_update( + lhs_ptr, + rhs_ptr, + out_ptr, + dtype=dtype, + rows=rows, + cols=cols, + lanes=lanes, + vreg_type=vreg_type, + micro_op=micro_op, + ) + elif const_expr(impl_kind == VF_IMPL_2D_POST_UPDATE): + _binary_micro_2d_post_update( + lhs_ptr, + rhs_ptr, + out_ptr, + dtype=dtype, + rows=rows, + cols=cols, + lanes=lanes, + vreg_type=vreg_type, + micro_op=micro_op, + ) + else: + raise ValueError(f"Unexpected normalized VF impl kind '{impl_kind}'.") + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def _binary_micro_1d_no_post_update( + lhs_ptr, rhs_ptr, out_ptr, *, dtype, lanes, element_count, vreg_type, micro_op +): + for offset in range_constexpr(0, element_count, lanes): + active = builtins.min(lanes, element_count - offset) + mask = _mask_for_chunk(dtype, active) + index = _scalar.const(offset) + lhs_vec = _dsl_pto.vlds(vreg_type, lhs_ptr, index) + rhs_vec = _dsl_pto.vlds(vreg_type, rhs_ptr, index) + out_vec = micro_op(vreg_type, lhs_vec, rhs_vec, mask) + _dsl_pto.vsts(out_vec, out_ptr, index, mask) + + +def _binary_micro_1d_post_update( + lhs_ptr, rhs_ptr, out_ptr, *, ptr_type, dtype, lanes, element_count, vreg_type, micro_op +): + lhs_cursor = lhs_ptr + rhs_cursor = rhs_ptr + out_cursor = out_ptr + lane_step = _scalar.const(lanes) + for offset in range_constexpr(0, element_count, lanes): + active = builtins.min(lanes, element_count - offset) + mask = _mask_for_chunk(dtype, active) + lhs_vec, lhs_cursor = _dsl_pto.vlds_post(vreg_type, ptr_type, lhs_cursor, lane_step) + rhs_vec, rhs_cursor = _dsl_pto.vlds_post(vreg_type, ptr_type, rhs_cursor, lane_step) + out_vec = micro_op(vreg_type, lhs_vec, rhs_vec, mask) + out_cursor = _dsl_pto.vsts_post(ptr_type, out_vec, out_cursor, lane_step, mask) + + +def _binary_micro_2d_no_post_update( + lhs_ptr, 
rhs_ptr, out_ptr, *, dtype, rows, cols, lanes, vreg_type, micro_op +): + for row in range_constexpr(rows): + row_base = row * cols + for col in range_constexpr(0, cols, lanes): + active = builtins.min(lanes, cols - col) + mask = _mask_for_chunk(dtype, active) + index = _scalar.const(row_base + col) + lhs_vec = _dsl_pto.vlds(vreg_type, lhs_ptr, index) + rhs_vec = _dsl_pto.vlds(vreg_type, rhs_ptr, index) + out_vec = micro_op(vreg_type, lhs_vec, rhs_vec, mask) + _dsl_pto.vsts(out_vec, out_ptr, index, mask) + + +def _binary_micro_2d_post_update( + lhs_ptr, rhs_ptr, out_ptr, *, dtype, rows, cols, lanes, vreg_type, micro_op +): + _binary_micro_2d_no_post_update( + lhs_ptr, + rhs_ptr, + out_ptr, + dtype=dtype, + rows=rows, + cols=cols, + lanes=lanes, + vreg_type=vreg_type, + micro_op=micro_op, + ) + + +def _rsqrt_micro(src_view, out_view, *, dtype, shape, lanes, base_addr): + if any(not isinstance(dim, int) for dim in shape): + raise ValueError("micro tile lowering currently requires a static integer shape.") + + lanes = _resolve_lanes(dtype, lanes) + element_count = 1 + for dim in shape: + element_count *= dim + + buf_bytes = element_count * _dtype_byte_width(dtype) + src_addr = _const_i64(base_addr) + out_addr = _const_i64(base_addr + buf_bytes) + + src_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=src_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr) + + _dsl_pto.load(src_view, src_tile) + + vreg_type = _dsl_pto.VRegType(lanes, dtype) + src_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), src_addr) + out_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), out_addr) + + for offset in range_constexpr(0, element_count, lanes): + active = builtins.min(lanes, element_count - offset) + mask = _mask_for_chunk(dtype, active) + index = _scalar.const(offset) + src_vec = _dsl_pto.vlds(vreg_type, src_ptr, index) + sqrt_vec = _dsl_pto.vsqrt(vreg_type, src_vec, mask) + out_vec = _dsl_pto.vrec(vreg_type, 
sqrt_vec, mask) + _dsl_pto.vsts(out_vec, out_ptr, index, mask) + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def _unary_micro(src_view, out_view, *, dtype, shape, lanes, base_addr, op_name): + if any(not isinstance(dim, int) for dim in shape): + raise ValueError("micro tile lowering currently requires a static integer shape.") + + lanes = _resolve_lanes(dtype, lanes) + element_count = 1 + for dim in shape: + element_count *= dim + + buf_bytes = element_count * _dtype_byte_width(dtype) + src_addr = _const_i64(base_addr) + out_addr = _const_i64(base_addr + buf_bytes) + + src_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=src_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr) + + _dsl_pto.load(src_view, src_tile) + + src_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), src_addr) + out_ptr = _dsl_pto.castptr(_dsl_pto.ptr(dtype, space="VEC"), out_addr) + micro_op = getattr(_dsl_pto, op_name) if op_name is not None else None + + for offset in range_constexpr(0, element_count, lanes): + active = builtins.min(lanes, element_count - offset) + mask = _mask_for_chunk(dtype, active) + index = _scalar.const(offset) + src_vec = _dsl_pto.vlds(_dsl_pto.VRegType(lanes, dtype), src_ptr, index) + out_vec = src_vec if micro_op is None else micro_op( + _dsl_pto.VRegType(lanes, dtype), src_vec, mask + ) + _dsl_pto.vsts(out_vec, out_ptr, index, mask) + + _dsl_pto.store(out_tile, out_view) + return out_view + + +def adds(src, scalar, out): + _call(_pto.TAddSOp, src, scalar, out) + return out + + +def sub(lhs, rhs, out): + _call(_pto.TSubOp, lhs, rhs, out) + return out + + +def subs(src, scalar, out): + _call(_pto.TSubSOp, src, scalar, out) + return out + + +def mul(lhs, rhs, out): + _call(_pto.TMulOp, lhs, rhs, out) + return out + + +def muls(src, scalar, out): + _call(_pto.TMulSOp, src, scalar, out) + return out + + +def div(lhs, rhs, out): + _call(_pto.TDivOp, lhs, rhs, out) + return out + + +def 
divs(src, scalar, out): + _call(_pto.TDivSOp, src, scalar, out) + return out + + +def max(lhs, rhs, out): + _call(_pto.TMaxOp, lhs, rhs, out) + return out + + +def maxs(src, scalar, out): + _call(_pto.TMaxSOp, src, scalar, out) + return out + + +def min(lhs, rhs, out): + _call(_pto.TMinOp, lhs, rhs, out) + return out + + +def mins(src, scalar, out): + _call(_pto.TMinSOp, src, scalar, out) + return out + + +def and_(lhs, rhs, out): + _call(_pto.TAndOp, lhs, rhs, out) + return out + + +def or_(lhs, rhs, out): + _call(_pto.TOrOp, lhs, rhs, out) + return out + + +def xor(lhs, rhs, out): + _call(_pto.TXorOp, lhs, rhs, out) + return out + + +def shl(lhs, rhs, out): + _call(_pto.TShlOp, lhs, rhs, out) + return out + + +def shls(src, scalar, out): + _call(_pto.TShlSOp, src, scalar, out) + return out + + +def shr(lhs, rhs, out): + _call(_pto.TShrOp, lhs, rhs, out) + return out + + +def shrs(src, scalar, out): + _call(_pto.TShrSOp, src, scalar, out) + return out + + +def compare(src0, src1, out, *, mode): + _call(_pto.TCmpOp, src0, src1, out, cmpMode=_cmp_mode_attr(mode)) + return out + + +def exp(src, out): + _call(_pto.TExpOp, src, out) + return out + + +def log(src, out): + _call(_pto.TLogOp, src, out) + return out + + +def relu(src, out): + _call(_pto.TReluOp, src, out) + return out + + +def abs(src, out): + _call(_pto.TAbsOp, src, out) + return out + + +def sqrt(src, out): + _call(_pto.TSqrtOp, src, out) + return out + + +def rsqrt(src, out): + _call(_pto.TRsqrtOp, src, out) + return out + + +def reciprocal(src, out): + _call(_pto.TRecipOp, src, out) + return out + + +def lrelu(src, slope, out): + _call(_pto.TLReluOp, src, slope, out) + return out + + +def gather(src, out, *, indices=None, mask_pattern=None): + kwargs = {} + if indices is not None: + kwargs["indices"] = indices + if mask_pattern is not None: + kwargs["maskPattern"] = _pto.MaskPatternAttr.get( + getattr(_pto.MaskPattern, mask_pattern) + ) + _call(_pto.TGatherOp, src, out, **kwargs) + return out + + +def 
scatter(src, indices, out): + _call(_pto.TScatterOp, src, indices, out) + return out + + +def select(mask, src0, src1, tmp, out): + _call(_pto.TSelOp, mask, src0, src1, tmp, out) + return out + + +def concat(src0, src1, out): + _call(_pto.TConcatOp, src0, src1, out) + return out + + +def extract(source, index_row, index_col, out): + _call(_pto.TExtractOp, source, index_row, index_col, out) + return out + + +def insert(source, index_row, index_col, out): + _call(_pto.TInsertOp, source, index_row, index_col, out) + return out + + +def row_sum(src, tmp, dst): + _call(_pto.TRowSumOp, src=src, tmp=tmp, dst=dst) + return dst + + +def row_min(src, tmp, dst): + _call(_pto.TRowMinOp, src=src, tmp=tmp, dst=dst) + return dst + + +def row_max(src, tmp, dst): + _call(_pto.TRowMaxOp, src=src, tmp=tmp, dst=dst) + return dst + + +def col_sum(src, tmp, dst, *, is_binary=True): + _call(_pto.TColSumOp, src=src, tmp=tmp, dst=dst, isBinary=is_binary) + return dst + + +def col_min(src, dst): + _call(_pto.TColMinOp, src=src, dst=dst) + return dst + + +def col_max(src, dst): + _call(_pto.TColMaxOp, src=src, dst=dst) + return dst + + +def row_expand(src, dst): + _call(_pto.TRowExpandOp, src=src, dst=dst) + return dst + + +def row_expand_sub(src0, src1, dst): + _call(_pto.TRowExpandSubOp, src0=src0, src1=src1, dst=dst) + return dst + + +def row_expand_mul(src0, src1, dst): + _call(_pto.TRowExpandMulOp, src0=src0, src1=src1, dst=dst) + return dst + + +def row_expand_div(src0, src1, dst): + _call(_pto.TRowExpandDivOp, src0=src0, src1=src1, dst=dst) + return dst + + +def col_expand(src, dst): + _call(_pto.TColExpandOp, src=src, dst=dst) + return dst + + +def col_expand_mul(src0, src1, dst): + _call(_pto.TColExpandMulOp, src0=src0, src1=src1, dst=dst) + return dst + + +def col_expand_max(src0, src1, dst): + _call(_pto.TColExpandMaxOp, src0=src0, src1=src1, dst=dst) + return dst + + +def col_expand_min(src0, src1, dst): + _call(_pto.TColExpandMinOp, src0=src0, src1=src1, dst=dst) + return dst + 
+ +def trans(src, dst): + _call(_pto.TTransOp, src, dst) + return dst + + +def mrgsort(src, dst, block_len): + _call(_pto.TMrgSortOp, srcs=[src], dsts=[dst], blockLen=block_len) + return dst + + +def sort32(src, dst, idx): + _call(_pto.TSort32Op, src, dst, idx) + return dst + + +def matmul(lhs, rhs, out): + _call(_pto.TMatmulOp, None, lhs, rhs, out) + return out + + +def matmul_acc(acc, lhs, rhs, out): + _call(_pto.TMatmulAccOp, None, acc, lhs, rhs, out) + return out + + +def matmul_bias(lhs, rhs, bias, out): + _call(_pto.TMatmulBiasOp, None, lhs, rhs, bias, out) + return out + + +def matmul_mx(lhs, lhs_scale, rhs, rhs_scale, out): + _call(_pto.TMatmulMxOp, None, lhs, lhs_scale, rhs, rhs_scale, out) + return out + + +def matmul_mx_acc(acc, lhs, lhs_scale, rhs, rhs_scale, out): + _call(_pto.TMatmulMxAccOp, None, acc, lhs, lhs_scale, rhs, rhs_scale, out) + return out + + +def matmul_mx_bias(lhs, lhs_scale, rhs, rhs_scale, bias, out): + _call(_pto.TMatmulMxBiasOp, None, lhs, lhs_scale, rhs, rhs_scale, bias, out) + return out + + +def full_mask_b32(): + return _dsl_pto.pset_b32(_dsl_pto.MaskType(), "PAT_ALL") + + +def vload(ptr, offset, *, lanes=64, dtype=None): + dtype = _dsl_pto.float32 if dtype is None else dtype + return _dsl_pto.vlds(_dsl_pto.VRegType(lanes, dtype), ptr, offset) + + +def vstore(vector, ptr, offset, *, mask=None): + if mask is None: + mask = full_mask_b32() + _dsl_pto.vsts(vector, ptr, offset, mask) + return ptr + + +def vector_copy(src_ptr, dst_ptr, offset, *, lanes=64, dtype=None): + vec = vload(src_ptr, offset, lanes=lanes, dtype=dtype) + vstore(vec, dst_ptr, offset) + return vec + + +TLoad = load_tile +TStore = store_tile +TMov = move_tile +TAdd = add +TAddS = adds +TSub = sub +TSubS = subs +TMul = mul +TMulS = muls +TDiv = div +TDivS = divs +TMax = max +TMaxS = maxs +TMin = min +TMinS = mins +TAnd = and_ +TOr = or_ +TXor = xor +TShl = shl +TShlS = shls +TShr = shr +TShrS = shrs +TCmp = compare +TExp = exp +TLog = log +TRelu = relu +TAbs = abs 
+TSqrt = sqrt +TRsqrt = rsqrt +TRecip = reciprocal +TLRelu = lrelu +TGather = gather +TScatter = scatter +TSel = select +TConcat = concat +TExtract = extract +TInsert = insert +TRowSum = row_sum +TRowMin = row_min +TRowMax = row_max +TColSum = col_sum +TColMin = col_min +TColMax = col_max +TRowExpand = row_expand +TRowExpandSub = row_expand_sub +TRowExpandMul = row_expand_mul +TRowExpandDiv = row_expand_div +TColExpand = col_expand +TColExpandMul = col_expand_mul +TColExpandMax = col_expand_max +TColExpandMin = col_expand_min +TTrans = trans +TMrgSort = mrgsort +TSort32 = sort32 +TMatmul = matmul +TMatmulAcc = matmul_acc +TMatmulBias = matmul_bias +TMatmulMx = matmul_mx +TMatmulMxAcc = matmul_mx_acc +TMatmulMxBias = matmul_mx_bias + + +__all__ = [ + "VF_IMPL_DEFAULT", + "VF_IMPL_1D_NO_POST_UPDATE", + "VF_IMPL_1D_POST_UPDATE", + "VF_IMPL_2D_NO_POST_UPDATE", + "VF_IMPL_2D_POST_UPDATE", + "TAbs", + "TAdd", + "TAddS", + "TAnd", + "TColExpand", + "TColExpandMax", + "TColExpandMin", + "TColExpandMul", + "TColMax", + "TColMin", + "TColSum", + "TConcat", + "TCmp", + "TDiv", + "TDivS", + "TExp", + "TExtract", + "TGather", + "TInsert", + "TLRelu", + "TLoad", + "TLog", + "TMatmul", + "TMatmulAcc", + "TMatmulBias", + "TMatmulMx", + "TMatmulMxAcc", + "TMatmulMxBias", + "TMax", + "TMaxS", + "TMin", + "TMinS", + "TMov", + "TMrgSort", + "TMul", + "TMulS", + "TOr", + "TRecip", + "TRelu", + "TRowExpand", + "TRowExpandDiv", + "TRowExpandMul", + "TRowExpandSub", + "TRowMax", + "TRowMin", + "TRowSum", + "TRsqrt", + "TScatter", + "TSel", + "TShl", + "TShlS", + "TShr", + "TShrS", + "TSort32", + "TSqrt", + "TStore", + "TSub", + "TSubS", + "TTrans", + "TXor", + "add", + "add_micro", + "abs_micro", + "adds", + "and_", + "col_expand", + "col_expand_micro", + "col_expand_max", + "col_expand_min", + "col_expand_mul", + "col_max", + "col_max_micro", + "col_min", + "col_min_micro", + "col_sum", + "col_sum_micro", + "compare", + "concat", + "div", + "divs", + "exp", + "exp_micro", + "extract", + 
"full_mask_b32", + "gather", + "gather_micro", + "insert", + "load_tile", + "log", + "log_micro", + "lrelu", + "matmul", + "matmul_acc", + "matmul_bias", + "matmul_mx", + "matmul_mx_acc", + "matmul_mx_bias", + "max", + "maxs", + "min", + "mins", + "move_tile", + "mov_micro", + "mrgsort", + "mrgsort_micro", + "mul", + "muls", + "or_", + "reciprocal", + "reciprocal_micro", + "relu", + "relu_micro", + "row_expand", + "row_expand_div_micro", + "row_expand_micro", + "row_expand_mul_micro", + "row_expand_div", + "row_expand_sub_micro", + "row_expand_mul", + "row_expand_sub", + "row_max", + "row_max_micro", + "row_min", + "row_min_micro", + "row_sum", + "row_sum_micro", + "rsqrt", + "rsqrt_micro", + "scatter", + "select", + "shl", + "shls", + "shr", + "shrs", + "sort32", + "sort32_micro", + "sqrt", + "sqrt_micro", + "store_tile", + "sub", + "sub_micro", + "subs", + "div_micro", + "mul_micro", + "or_micro", + "trans", + "vector_copy", + "vload", + "vstore", + "xor", +] diff --git a/ptodsl/lib/a5/tile_micro_coverage.py b/ptodsl/lib/a5/tile_micro_coverage.py new file mode 100644 index 00000000..33ea434f --- /dev/null +++ b/ptodsl/lib/a5/tile_micro_coverage.py @@ -0,0 +1,198 @@ +from ptodsl import tile + + +TILE_MICRO_COVERAGE = { + "mov": { + "status": "implemented", + "helper": "mov_micro", + "note": "UB stage + vlds/vsts copy loop.", + }, + "add": { + "status": "implemented", + "helper": "add_micro", + "note": "UB stage + constexpr-specialized TBinOp-style vlds/vadd/vsts lowering.", + }, + "sub": { + "status": "implemented", + "helper": "sub_micro", + "note": "UB stage + constexpr-specialized TBinOp-style vlds/vsub/vsts lowering.", + }, + "div": { + "status": "implemented", + "helper": "div_micro", + "note": "UB stage + constexpr-specialized TBinOp-style vlds/vdiv/vsts lowering.", + }, + "mul": { + "status": "implemented", + "helper": "mul_micro", + "note": "UB stage + constexpr-specialized TBinOp-style vlds/vmul/vsts lowering.", + }, + "or_": { + "status": "implemented", 
+ "helper": "or_micro", + "note": "UB stage + constexpr-specialized TBinOp-style vlds/vor/vsts lowering.", + }, + "gather": { + "status": "partial", + "helper": "gather_micro", + "note": "Indexed gather is implemented via vgather2 for same-width source/index pairs; mask-pattern gather still needs unsupported vsqz-style micro support.", + }, + "exp": { + "status": "implemented", + "helper": "exp_micro", + "note": "UB stage + vlds/vexp/vsts loop.", + }, + "log": { + "status": "implemented", + "helper": "log_micro", + "note": "UB stage + vlds/vln/vsts loop.", + }, + "relu": { + "status": "implemented", + "helper": "relu_micro", + "note": "UB stage + vlds/vrelu/vsts loop.", + }, + "abs": { + "status": "implemented", + "helper": "abs_micro", + "note": "UB stage + vlds/vabs/vsts loop.", + }, + "sqrt": { + "status": "implemented", + "helper": "sqrt_micro", + "note": "UB stage + vlds/vsqrt/vsts loop.", + }, + "rsqrt": { + "status": "implemented", + "helper": "rsqrt_micro", + "note": "UB stage + vsqrt/vrec micro sequence.", + }, + "reciprocal": { + "status": "implemented", + "helper": "reciprocal_micro", + "note": "UB stage + vlds/vrec/vsts loop.", + }, + "matmul": { + "status": "blocked", + "helper": None, + "note": "Cube/L0 path is not a pure vector-micro rewrite target.", + }, + "matmul_bias": { + "status": "blocked", + "helper": None, + "note": "Cube/L0 path is not a pure vector-micro rewrite target.", + }, + "matmul_acc": { + "status": "blocked", + "helper": None, + "note": "Cube/L0 path is not a pure vector-micro rewrite target.", + }, + "extract": { + "status": "blocked", + "helper": None, + "note": "Layout/L0 extraction op, not a vector-micro compute rewrite.", + }, + "row_sum": { + "status": "implemented", + "helper": "row_sum_micro", + "note": "Static-shape row reduction via vcadd + point-store.", + }, + "row_min": { + "status": "implemented", + "helper": "row_min_micro", + "note": "Static-shape row reduction via vcmin + point-store.", + }, + "row_max": { + 
"status": "implemented", + "helper": "row_max_micro", + "note": "Static-shape row reduction via vcmax + point-store.", + }, + "row_expand": { + "status": "implemented", + "helper": "row_expand_micro", + "note": "Static-shape canonical broadcast via vldas/vldus/vdup/vsts.", + }, + "row_expand_sub": { + "status": "implemented", + "helper": "row_expand_sub_micro", + "note": "Static-shape canonical broadcast via vldas/vldus/vdup/vsub/vsts.", + }, + "row_expand_div": { + "status": "implemented", + "helper": "row_expand_div_micro", + "note": "Static-shape canonical broadcast via vldas/vldus/vdup/vdiv/vsts.", + }, + "row_expand_mul": { + "status": "implemented", + "helper": "row_expand_mul_micro", + "note": "Static-shape canonical broadcast via vldas/vldus/vdup/vmul/vsts.", + }, + "col_sum": { + "status": "implemented", + "helper": "col_sum_micro", + "note": "Static-shape TColReduceOps-style column reduction via vadd.", + }, + "col_min": { + "status": "implemented", + "helper": "col_min_micro", + "note": "Static-shape TColReduceOps-style column reduction via vmin.", + }, + "col_max": { + "status": "implemented", + "helper": "col_max_micro", + "note": "Static-shape TColReduceOps-style column reduction via vmax.", + }, + "col_expand": { + "status": "implemented", + "helper": "col_expand_micro", + "note": "Static-shape canonical broadcast via vlds/vsts replication.", + }, + "mrgsort": { + "status": "implemented", + "helper": "mrgsort_micro", + "note": "Single-list row-major merge sort via vmrgsort4.", + }, + "sort32": { + "status": "implemented", + "helper": "sort32_micro", + "note": "Static-shape block sort via vbitsort.", + }, + "subset": { + "status": "not_applicable", + "helper": None, + "note": "View helper only, not a tile compute op.", + }, +} + + +def coverage_summary(): + counts = {} + for entry in TILE_MICRO_COVERAGE.values(): + status = entry["status"] + counts[status] = counts.get(status, 0) + 1 + return counts + + +def coverage_markdown(): + counts = 
coverage_summary() + lines = [ + "# Tile Micro Coverage", + "", + f"- Total public tile ops: `{len(tile.__all__)}`", + f"- Implemented: `{counts.get('implemented', 0)}`", + f"- Partial: `{counts.get('partial', 0)}`", + f"- Pending: `{counts.get('pending', 0)}`", + f"- Blocked: `{counts.get('blocked', 0)}`", + f"- Not applicable: `{counts.get('not_applicable', 0)}`", + "", + "| tile op | status | helper | note |", + "| --- | --- | --- | --- |", + ] + for name in tile.__all__: + entry = TILE_MICRO_COVERAGE[name] + helper = entry["helper"] or "-" + lines.append(f"| `{name}` | `{entry['status']}` | `{helper}` | {entry['note']} |") + return "\n".join(lines) + "\n" + + +__all__ = ["TILE_MICRO_COVERAGE", "coverage_markdown", "coverage_summary"] diff --git a/pyproject.toml b/pyproject.toml index b5ef41cd..de2df06b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,8 @@ packages = [ "ptodsl", "ptodsl.api", "ptodsl.compiler", + "ptodsl.lib", + "ptodsl.lib.a5", "ptodsl.utils", ] diff --git a/scripts/generate_a5_pto.py b/scripts/generate_a5_pto.py new file mode 100644 index 00000000..a7d97d6c --- /dev/null +++ b/scripts/generate_a5_pto.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import argparse +import pathlib +import subprocess +import sys + + +_ROOT = pathlib.Path(__file__).resolve().parents[1] +if str(_ROOT) not in sys.path: + sys.path.insert(0, str(_ROOT)) + +from ptodsl.lib import a5 + + +_DEFAULT_OUTPUT_DIR = _ROOT / "ptodsl" / "lib" / "a5" / "generated" +_DEFAULT_PTOAS = _ROOT.parent / "PTOAS" / "build-src312" / "tools" / "ptoas" / "ptoas" + + +def emit_kernels(*, output_dir, ptoas_bin=None, emit_cpp=False): + output_dir.mkdir(parents=True, exist_ok=True) + generated = [] + for kernel_name, builder in a5.KERNEL_BUILDERS.items(): + module = builder() + pto_path = output_dir / f"{kernel_name}.pto" + pto_path.write_text(f"{module}\n", encoding="utf-8") + generated.append(pto_path) + + if emit_cpp: + if ptoas_bin is None: + raise ValueError("`emit_cpp=True` 
requires `ptoas_bin`.") + cpp_path = output_dir / f"{kernel_name}.cpp" + try: + subprocess.run( + [str(ptoas_bin), str(pto_path), "-o", str(cpp_path)], + check=True, + cwd=str(output_dir), + ) + except subprocess.CalledProcessError as exc: + print( + f"warning: failed to lower {pto_path.name} to C++ with ptoas: {exc}", + file=sys.stderr, + ) + return generated + + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Generate PTODSL A5 translation artifacts as `.pto` files." + ) + parser.add_argument( + "--output-dir", + type=pathlib.Path, + default=_DEFAULT_OUTPUT_DIR, + help=f"Directory to write generated artifacts. Default: {_DEFAULT_OUTPUT_DIR}", + ) + parser.add_argument( + "--ptoas", + type=pathlib.Path, + default=_DEFAULT_PTOAS, + help=f"ptoas binary to use when `--emit-cpp` is set. Default: {_DEFAULT_PTOAS}", + ) + parser.add_argument( + "--emit-cpp", + action="store_true", + help="Also run ptoas and write `.cpp` files next to the generated `.pto` files.", + ) + return parser.parse_args() + + +def main(): + args = _parse_args() + ptoas_bin = args.ptoas if args.emit_cpp else None + generated = emit_kernels( + output_dir=args.output_dir, + ptoas_bin=ptoas_bin, + emit_cpp=args.emit_cpp, + ) + for path in generated: + print(path) + + +if __name__ == "__main__": + main() diff --git a/scripts/update_tile_micro_checklist.py b/scripts/update_tile_micro_checklist.py new file mode 100644 index 00000000..8eaaee9f --- /dev/null +++ b/scripts/update_tile_micro_checklist.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +from pathlib import Path +import sys + + +_REPO_ROOT = Path(__file__).resolve().parents[1] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from ptodsl.lib.a5.tile_micro_coverage import coverage_markdown + + +def main(): + target = _REPO_ROOT / "ptodsl" / "lib" / "a5" / "TILE_MICRO_CHECKLIST.md" + target.write_text(coverage_markdown(), encoding="utf-8") + print(target) + + +if __name__ == "__main__": + main() 
diff --git a/tests/regression/test_a5_lib_regression.py b/tests/regression/test_a5_lib_regression.py new file mode 100644 index 00000000..6dfab991 --- /dev/null +++ b/tests/regression/test_a5_lib_regression.py @@ -0,0 +1,410 @@ +import pytest +from mlir.ir import IndexType + +from ptodsl import pto, to_ir_module +from ptodsl.lib import a5 +from scripts.generate_a5_pto import emit_kernels + + +def test_a5_elementwise_add_kernel_emits_tile_flow(): + text = str(a5.build_elementwise_add()) + + assert "func.func @a5_elementwise_add" in text + assert "pto.make_tensor_view" in text + assert "pto.tload" in text + assert "pto.vlds" in text + assert "pto.vadd" in text + assert "pto.vsts" in text + assert "pto.tadd" not in text + assert "pto.tstore" in text + + +def test_a5_templated_elementwise_add_specializes_constexpr_impl(): + specializer = a5.build_templated_elementwise_add() + text = str( + specializer( + ROWS=8, + COLS=64, + VF_IMPL=a5.VF_IMPL_1D_POST_UPDATE, + ) + ) + + assert "func.func @a5_templated_elementwise_add(%arg0" in text + assert "ROWS" not in text + assert "COLS" not in text + assert "VF_IMPL" not in text + assert "scf.if" not in text + assert "pto.vlds_post" in text + assert "pto.vsts_post" in text + assert "pto.tadd" not in text + + +def test_a5_micro_vector_copy_emits_micro_ops(): + text = str(a5.build_micro_vector_copy()) + + assert "func.func @a5_micro_vector_copy" in text + assert "pto.pset_b32" in text + assert "pto.vlds" in text + assert "pto.vsts" in text + + +def test_a5_col_expand_micro_emits_broadcast_micro_ops(): + def meta_data(): + return { + "ptr_t": pto.ptr(pto.float32), + "index_t": IndexType.get(), + } + + @to_ir_module(meta_data=meta_data) + def a5_col_expand_micro(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[1, 32], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[32, 32], dtype=pto.float32) + with pto.vector_section(): + a5.col_expand_micro( + src_view.slice([0, 0], [1, 32]), + 
dst_view.slice([0, 0], [32, 32]), + dtype=pto.float32, + shape=[32, 32], + ) + + text = str(a5_col_expand_micro) + + assert "func.func @a5_col_expand_micro" in text + assert "pto.vlds" in text + assert "pto.vsts" in text + assert "pto.tcolexpand" not in text + + +def test_a5_gather_micro_emits_indexed_gather_micro_ops(): + def meta_data(): + return { + "ptr_src": pto.ptr(pto.float32), + "ptr_idx": pto.ptr(pto.uint32), + } + + @to_ir_module(meta_data=meta_data) + def a5_gather_micro(src: "ptr_src", idx: "ptr_idx", dst: "ptr_src") -> None: + src_view = pto.make_tensor(src, shape=[1, 64], dtype=pto.float32) + idx_view = pto.make_tensor(idx, shape=[1, 64], dtype=pto.uint32) + dst_view = pto.make_tensor(dst, shape=[1, 64], dtype=pto.float32) + with pto.vector_section(): + a5.gather_micro( + src_view.slice([0, 0], [1, 64]), + idx_view.slice([0, 0], [1, 64]), + dst_view.slice([0, 0], [1, 64]), + dtype=pto.float32, + index_dtype=pto.uint32, + shape=[1, 64], + ) + + text = str(a5_gather_micro) + + assert "func.func @a5_gather_micro" in text + assert "pto.vgather2" in text + assert "pto.vsts" in text + assert "pto.tgather" not in text + + +def test_a5_row_expand_micro_emits_broadcast_micro_ops(): + def meta_data(): + return { + "ptr_t": pto.ptr(pto.float32), + "index_t": IndexType.get(), + } + + @to_ir_module(meta_data=meta_data) + def a5_row_expand_micro(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[32, 1], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[32, 32], dtype=pto.float32) + with pto.vector_section(): + a5.row_expand_micro( + src_view.slice([0, 0], [32, 1]), + dst_view.slice([0, 0], [32, 32]), + dtype=pto.float32, + shape=[32, 32], + ) + + text = str(a5_row_expand_micro) + + assert "func.func @a5_row_expand_micro" in text + assert "pto.vldas" in text + assert "pto.vldus" in text + assert "pto.vdup" in text + assert "pto.vsts" in text + assert "pto.trowexpand" not in text + + +def 
test_a5_row_expand_mul_micro_emits_broadcast_compute_micro_ops(): + def meta_data(): + return { + "ptr_t": pto.ptr(pto.float32), + "index_t": IndexType.get(), + } + + @to_ir_module(meta_data=meta_data) + def a5_row_expand_mul_micro(base: "ptr_t", scale: "ptr_t", dst: "ptr_t") -> None: + base_view = pto.make_tensor(base, shape=[32, 32], dtype=pto.float32) + scale_view = pto.make_tensor(scale, shape=[32, 1], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[32, 32], dtype=pto.float32) + with pto.vector_section(): + a5.row_expand_mul_micro( + base_view.slice([0, 0], [32, 32]), + scale_view.slice([0, 0], [32, 1]), + dst_view.slice([0, 0], [32, 32]), + dtype=pto.float32, + shape=[32, 32], + ) + + text = str(a5_row_expand_mul_micro) + + assert "func.func @a5_row_expand_mul_micro" in text + assert "pto.vldas" in text + assert "pto.vldus" in text + assert "pto.vdup" in text + assert "pto.vmul" in text + assert "pto.vsts" in text + assert "pto.trowexpandmul" not in text + + +def test_a5_rsqrt_micro_emits_vsqrt_then_vrec(): + def meta_data(): + return { + "ptr_t": pto.ptr(pto.float32), + } + + @to_ir_module(meta_data=meta_data) + def a5_rsqrt_micro(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[1, 64], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[1, 64], dtype=pto.float32) + with pto.vector_section(): + a5.rsqrt_micro( + src_view.slice([0, 0], [1, 64]), + dst_view.slice([0, 0], [1, 64]), + dtype=pto.float32, + shape=[1, 64], + ) + + text = str(a5_rsqrt_micro) + + assert "func.func @a5_rsqrt_micro" in text + assert "pto.vsqrt" in text + assert "pto.vrec" in text + assert "pto.trsqrt" not in text + + +@pytest.mark.parametrize( + ("helper_name", "reduce_op", "combine_op", "tile_op"), + [ + ("row_sum_micro", "pto.vcadd", "pto.vadd", "pto.trowsum"), + ("row_max_micro", "pto.vcmax", "pto.vmax", "pto.trowmax"), + ("row_min_micro", "pto.vcmin", "pto.vmin", "pto.trowmin"), + ], +) +def 
test_a5_row_reduce_micro_emits_reduction_micro_ops( + helper_name, reduce_op, combine_op, tile_op +): + def meta_data(): + return { + "ptr_t": pto.ptr(pto.float32), + "index_t": IndexType.get(), + } + + helper = getattr(a5, helper_name) + + @to_ir_module(meta_data=meta_data) + def a5_row_reduce_micro(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[32, 32], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[32, 1], dtype=pto.float32) + with pto.vector_section(): + helper( + src_view.slice([0, 0], [32, 32]), + dst_view.slice([0, 0], [32, 1]), + dtype=pto.float32, + shape=[32, 32], + ) + + text = str(a5_row_reduce_micro) + + assert reduce_op in text + assert combine_op in text + assert 'dist = "ONEPT_B32"' in text + assert tile_op not in text + + +@pytest.mark.parametrize( + ("helper_name", "reduce_op", "tile_op", "impl"), + [ + ("col_sum_micro", "pto.vadd", "pto.tcolsum", a5.VF_IMPL_1D_POST_UPDATE), + ("col_max_micro", "pto.vmax", "pto.tcolmax", a5.VF_IMPL_1D_NO_POST_UPDATE), + ("col_min_micro", "pto.vmin", "pto.tcolmin", a5.VF_IMPL_1D_POST_UPDATE), + ], +) +def test_a5_col_reduce_micro_emits_template_lowering(helper_name, reduce_op, tile_op, impl): + def meta_data(): + return { + "ptr_t": pto.ptr(pto.float32), + } + + helper = getattr(a5, helper_name) + + @to_ir_module(meta_data=meta_data) + def a5_col_reduce_micro(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[32, 32], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[1, 32], dtype=pto.float32) + with pto.vector_section(): + helper( + src_view.slice([0, 0], [32, 32]), + dst_view.slice([0, 0], [1, 32]), + dtype=pto.float32, + shape=[32, 32], + impl=impl, + ) + + text = str(a5_col_reduce_micro) + + assert reduce_op in text + assert tile_op not in text + if impl == a5.VF_IMPL_1D_POST_UPDATE: + assert "pto.vlds_post" in text + assert "pto.vsts_post" in text + + +def test_a5_sort32_micro_emits_vbitsort(): + def meta_data(): + return { + 
"ptr_src": pto.ptr(pto.float32), + "ptr_idx": pto.ptr(pto.uint32), + } + + @to_ir_module(meta_data=meta_data) + def a5_sort32_micro(src: "ptr_src", idx: "ptr_idx", dst: "ptr_src") -> None: + src_view = pto.make_tensor(src, shape=[1, 64], dtype=pto.float32) + idx_view = pto.make_tensor(idx, shape=[1, 64], dtype=pto.uint32) + dst_view = pto.make_tensor(dst, shape=[1, 128], dtype=pto.float32) + with pto.vector_section(): + a5.sort32_micro( + src_view.slice([0, 0], [1, 64]), + idx_view.slice([0, 0], [1, 64]), + dst_view.slice([0, 0], [1, 128]), + dtype=pto.float32, + shape=[1, 64], + ) + + text = str(a5_sort32_micro) + + assert "func.func @a5_sort32_micro" in text + assert "pto.vbitsort" in text + assert "pto.tsort32" not in text + + +def test_a5_mrgsort_micro_emits_vmrgsort4(): + def meta_data(): + return {"ptr_t": pto.ptr(pto.float32)} + + @to_ir_module(meta_data=meta_data) + def a5_mrgsort_micro(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[1, 256], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[1, 256], dtype=pto.float32) + with pto.vector_section(): + a5.mrgsort_micro( + src_view.slice([0, 0], [1, 256]), + dst_view.slice([0, 0], [1, 256]), + dtype=pto.float32, + shape=[1, 256], + block_len=64, + ) + + text = str(a5_mrgsort_micro) + + assert "func.func @a5_mrgsort_micro" in text + assert "pto.vmrgsort4" in text + assert "pto.tmrgsort" not in text + + +def test_a5_generation_script_emits_pto_files(tmp_path): + generated = emit_kernels(output_dir=tmp_path) + + generated_names = sorted(path.name for path in generated) + assert generated_names == [ + "a5_cube_matmul.pto", + "a5_elementwise_add.pto", + "a5_micro_vector_copy.pto", + ] + + for path in generated: + text = path.read_text(encoding="utf-8") + assert "func.func @" in text + + +def test_a5_add_micro_rejects_view_dtype_mismatch(): + def meta_data(): + return {"ptr_t": pto.ptr(pto.float16)} + + with pytest.raises(ValueError, match="TADD input tile src0, src1 and dst 
tile data type mismatch"): + @to_ir_module(meta_data=meta_data) + def invalid_add(src0: "ptr_t", src1: "ptr_t", dst: "ptr_t") -> None: + lhs = pto.make_tensor(src0, shape=[32, 32], dtype=pto.float16) + rhs = pto.make_tensor(src1, shape=[32, 32], dtype=pto.float16) + out = pto.make_tensor(dst, shape=[32, 32], dtype=pto.float16) + with pto.vector_section(): + a5.add_micro( + lhs.slice([0, 0], [32, 32]), + rhs.slice([0, 0], [32, 32]), + out.slice([0, 0], [32, 32]), + dtype=pto.float32, + shape=[32, 32], + ) + + +def test_a5_row_expand_micro_rejects_non_column_source(): + def meta_data(): + return {"ptr_t": pto.ptr(pto.float32)} + + with pytest.raises(ValueError, match="TROWEXPAND source valid shape must be \\[rows, 1\\]"): + @to_ir_module(meta_data=meta_data) + def invalid_row_expand(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[1, 32], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[32, 32], dtype=pto.float32) + with pto.vector_section(): + a5.row_expand_micro( + src_view.slice([0, 0], [1, 32]), + dst_view.slice([0, 0], [32, 32]), + dtype=pto.float32, + shape=[32, 32], + ) + + +def test_a5_row_reduce_micro_rejects_non_single_column_output(): + def meta_data(): + return {"ptr_t": pto.ptr(pto.float32)} + + with pytest.raises(ValueError, match="use a single-column output tile"): + @to_ir_module(meta_data=meta_data) + def invalid_row_reduce(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[32, 32], dtype=pto.float32) + dst_view = pto.make_tensor(dst, shape=[1, 32], dtype=pto.float32) + with pto.vector_section(): + a5.row_sum_micro( + src_view.slice([0, 0], [32, 32]), + dst_view.slice([0, 0], [1, 32]), + dtype=pto.float32, + shape=[32, 32], + ) + + +def test_a5_col_reduce_micro_rejects_unsupported_dtype(): + def meta_data(): + return {"ptr_t": pto.ptr(pto.bool)} + + with pytest.raises(ValueError, match="TCOLREDUCE input data type is not supported"): + @to_ir_module(meta_data=meta_data) + def 
invalid_col_reduce(src: "ptr_t", dst: "ptr_t") -> None: + src_view = pto.make_tensor(src, shape=[32, 32], dtype=pto.bool) + dst_view = pto.make_tensor(dst, shape=[1, 32], dtype=pto.bool) + with pto.vector_section(): + a5.col_sum_micro( + src_view.slice([0, 0], [32, 32]), + dst_view.slice([0, 0], [1, 32]), + dtype=pto.bool, + shape=[32, 32], + ) diff --git a/tests/regression/test_tile_micro_coverage.py b/tests/regression/test_tile_micro_coverage.py new file mode 100644 index 00000000..5f17ded9 --- /dev/null +++ b/tests/regression/test_tile_micro_coverage.py @@ -0,0 +1,39 @@ +from pathlib import Path + +from ptodsl import tile +from ptodsl.lib import a5 +from ptodsl.lib.a5.tile_micro_coverage import ( + TILE_MICRO_COVERAGE, + coverage_markdown, + coverage_summary, +) + + +def test_tile_micro_coverage_checklist_covers_every_tile_api_symbol(): + assert set(TILE_MICRO_COVERAGE) == set(tile.__all__) + + +def test_implemented_tile_micro_helpers_exist(): + for name, entry in TILE_MICRO_COVERAGE.items(): + helper = entry["helper"] + if entry["status"] == "implemented": + assert helper is not None + assert getattr(a5, helper) is not None + + +def test_tile_micro_coverage_markdown_mentions_all_tile_ops(): + text = coverage_markdown() + for name in tile.__all__: + assert f"`{name}`" in text + + +def test_tile_micro_coverage_summary_matches_public_surface(): + counts = coverage_summary() + assert sum(counts.values()) == len(tile.__all__) + assert counts["implemented"] > 0 + assert counts["blocked"] > 0 + + +def test_checked_in_tile_micro_checklist_is_in_sync(): + checklist = Path(__file__).resolve().parents[2] / "ptodsl" / "lib" / "a5" / "TILE_MICRO_CHECKLIST.md" + assert checklist.read_text(encoding="utf-8") == coverage_markdown() From 52523feefe7022a39eb3938e3ca79487db61be64 Mon Sep 17 00:00:00 2001 From: RuoyuZhou Date: Tue, 31 Mar 2026 09:21:09 +0800 Subject: [PATCH 53/53] Fix CI and constexpr exports for PTODSL A5 PR --- .../aot/matmul_mxfp8/matmul_mxfp8_builder.py | 47 
++++- .../aot/matmul_mxfp8/mxfp8_ppt_example.py | 17 +- ptodsl/__init__.py | 14 +- ptodsl/compiler/ir.py | 15 +- ptodsl/constexpr.py | 37 ++++ ptodsl/language.py | 138 ++++++++++---- ptodsl/lib/a5/__init__.py | 6 +- ptodsl/lib/a5/generated/a5_cube_matmul.pto | 1 - .../lib/a5/generated/a5_elementwise_add.pto | 1 - .../lib/a5/generated/a5_micro_vector_copy.pto | 1 - ptodsl/lib/a5/kernels.py | 8 +- ptodsl/lib/a5/ops.py | 173 ++++++++++++++---- ptodsl/lib/a5/tile_micro_coverage.py | 5 +- tests/frontend/test_mxfp8_frontend.py | 3 + tests/regression/test_a5_lib_regression.py | 16 +- tests/regression/test_tile_micro_coverage.py | 8 +- 16 files changed, 392 insertions(+), 98 deletions(-) create mode 100644 ptodsl/constexpr.py diff --git a/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py b/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py index 5cf713fa..68990645 100644 --- a/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py +++ b/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py @@ -52,15 +52,40 @@ def matmul_mxfp8( tv_a = pto.as_tensor(lhs_tensor, ptr=a_ptr, shape=[cM, cK], strides=[cK, c1]) tv_b = pto.as_tensor(rhs_tensor, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1]) - tv_scale_a = pto.as_tensor(lhs_scale_tensor, ptr=a_scale_ptr, shape=[cM, cScaleK], strides=[cScaleK, c1]) - tv_scale_b = pto.as_tensor(rhs_scale_tensor, ptr=b_scale_ptr, shape=[cScaleK, cN], strides=[cN, c1]) - tv_bias = pto.as_tensor(bias_tensor, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1]) - - sv_a = pto.slice_view(lhs_tile_view, source=tv_a, offsets=[c0, c0], sizes=[cM, cK]) - sv_b = pto.slice_view(rhs_tile_view, source=tv_b, offsets=[c0, c0], sizes=[cK, cN]) - sv_scale_a = pto.slice_view(lhs_scale_tile_view, source=tv_scale_a, offsets=[c0, c0], sizes=[cM, cScaleK]) - sv_scale_b = pto.slice_view(rhs_scale_tile_view, source=tv_scale_b, offsets=[c0, c0], sizes=[cScaleK, cN]) - sv_bias = pto.slice_view(bias_tile_view, source=tv_bias, offsets=[c0, c0], sizes=[c1, cN]) + tv_scale_a = pto.as_tensor( + 
lhs_scale_tensor, + ptr=a_scale_ptr, + shape=[cM, cScaleK], + strides=[cScaleK, c1], + ) + tv_scale_b = pto.as_tensor( + rhs_scale_tensor, ptr=b_scale_ptr, shape=[cScaleK, cN], strides=[cN, c1] + ) + tv_bias = pto.as_tensor( + bias_tensor, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1] + ) + + sv_a = pto.slice_view( + lhs_tile_view, source=tv_a, offsets=[c0, c0], sizes=[cM, cK] + ) + sv_b = pto.slice_view( + rhs_tile_view, source=tv_b, offsets=[c0, c0], sizes=[cK, cN] + ) + sv_scale_a = pto.slice_view( + lhs_scale_tile_view, + source=tv_scale_a, + offsets=[c0, c0], + sizes=[cM, cScaleK], + ) + sv_scale_b = pto.slice_view( + rhs_scale_tile_view, + source=tv_scale_b, + offsets=[c0, c0], + sizes=[cScaleK, cN], + ) + sv_bias = pto.slice_view( + bias_tile_view, source=tv_bias, offsets=[c0, c0], sizes=[c1, cN] + ) with pto.cube_section(): a_tile = pto.alloc_tile(lhs_tile) @@ -75,7 +100,9 @@ def matmul_mxfp8( pto.load(sv_scale_a, a_scale_tile) pto.load(sv_scale_b, b_scale_tile) pto.load(sv_bias, bias_tile_buf) - pto.matmul_mx_bias(a_tile, a_scale_tile, b_tile, b_scale_tile, bias_tile_buf, acc_tile_buf) + pto.matmul_mx_bias( + a_tile, a_scale_tile, b_tile, b_scale_tile, bias_tile_buf, acc_tile_buf + ) return matmul_mxfp8 diff --git a/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py b/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py index b7988c9e..603542b0 100644 --- a/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py +++ b/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py @@ -1,7 +1,6 @@ from ptodsl import to_ir_module import ptodsl.language as pto - M, K, N = 16, 64, 32 @@ -52,13 +51,21 @@ def matmul_mxfp8_core( tv_a = pto.as_tensor(a_tensor, ptr=a, shape=[cM, cK], strides=[cK, c1]) tv_b = pto.as_tensor(b_tensor, ptr=b, shape=[cK, cN], strides=[cN, c1]) - tv_scale_a = pto.as_tensor(scale_a_tensor, ptr=scale_a, shape=[cM, cScaleK], strides=[cScaleK, c1]) - tv_scale_b = pto.as_tensor(scale_b_tensor, ptr=scale_b, shape=[cScaleK, cN], strides=[cN, c1]) + tv_scale_a = pto.as_tensor( + 
scale_a_tensor, ptr=scale_a, shape=[cM, cScaleK], strides=[cScaleK, c1] + ) + tv_scale_b = pto.as_tensor( + scale_b_tensor, ptr=scale_b, shape=[cScaleK, cN], strides=[cN, c1] + ) sv_a = pto.slice_view(a_view, source=tv_a, offsets=[c0, c0], sizes=[cM, cK]) sv_b = pto.slice_view(b_view, source=tv_b, offsets=[c0, c0], sizes=[cK, cN]) - sv_scale_a = pto.slice_view(scale_a_view, source=tv_scale_a, offsets=[c0, c0], sizes=[cM, cScaleK]) - sv_scale_b = pto.slice_view(scale_b_view, source=tv_scale_b, offsets=[c0, c0], sizes=[cScaleK, cN]) + sv_scale_a = pto.slice_view( + scale_a_view, source=tv_scale_a, offsets=[c0, c0], sizes=[cM, cScaleK] + ) + sv_scale_b = pto.slice_view( + scale_b_view, source=tv_scale_b, offsets=[c0, c0], sizes=[cScaleK, cN] + ) with pto.cube_section(): ta = pto.alloc_tile(a_tile) diff --git a/ptodsl/__init__.py b/ptodsl/__init__.py index 55333e65..5ed02b28 100644 --- a/ptodsl/__init__.py +++ b/ptodsl/__init__.py @@ -2,5 +2,17 @@ from .bench import do_bench from .compiler.ir import to_ir_module from .compiler.jit import JitWrapper, jit +from .constexpr import Constexpr, const_expr, range_constexpr -__all__ = ["JitWrapper", "do_bench", "jit", "pto", "scalar", "tile", "to_ir_module"] +__all__ = [ + "Constexpr", + "JitWrapper", + "const_expr", + "do_bench", + "jit", + "pto", + "range_constexpr", + "scalar", + "tile", + "to_ir_module", +] diff --git a/ptodsl/compiler/ir.py b/ptodsl/compiler/ir.py index b32730ef..488d8638 100644 --- a/ptodsl/compiler/ir.py +++ b/ptodsl/compiler/ir.py @@ -4,6 +4,7 @@ from mlir.ir import Context, InsertionPoint, Location, Module from ..api.scalar import wrap_value +from ..constexpr import is_constexpr_annotation def _resolve_meta(meta_fn): @@ -19,6 +20,8 @@ def _resolve_arg_types(signature, meta_map): arg_types = [] for param in signature.parameters.values(): annot = param.annotation + if is_constexpr_annotation(annot): + continue if isinstance(annot, str): if annot not in meta_map: raise ValueError(f"Unknown annotation 
'{annot}'.") @@ -89,7 +92,17 @@ def decorator(fn): entry = ir_func.add_entry_block() with InsertionPoint(entry): - wrapped_args = [wrap_value(arg) for arg in entry.arguments] + wrapped_args = [] + entry_arg_iter = iter(entry.arguments) + for param in sig.parameters.values(): + if is_constexpr_annotation(param.annotation): + if param.default is inspect._empty: + raise ValueError( + f"Constexpr argument '{param.name}' requires a default value." + ) + wrapped_args.append(param.default) + else: + wrapped_args.append(wrap_value(next(entry_arg_iter))) injected = set(meta_map.keys()) old_globals = _inject_globals(fn, meta_map) try: diff --git a/ptodsl/constexpr.py b/ptodsl/constexpr.py new file mode 100644 index 00000000..a1475b9a --- /dev/null +++ b/ptodsl/constexpr.py @@ -0,0 +1,37 @@ +import builtins + + +class ConstexprAnnotation: + __ptodsl_constexpr__ = True + + def __init__(self, inner_type): + self.inner_type = inner_type + + def __repr__(self): + return f"Constexpr[{self.inner_type!r}]" + + +class Constexpr: + def __class_getitem__(cls, inner_type): + return ConstexprAnnotation(inner_type) + + +def is_constexpr_annotation(annotation): + return getattr(annotation, "__ptodsl_constexpr__", False) + + +def const_expr(value): + return value + + +def range_constexpr(*args): + return builtins.range(*args) + + +__all__ = [ + "Constexpr", + "ConstexprAnnotation", + "const_expr", + "is_constexpr_annotation", + "range_constexpr", +] diff --git a/ptodsl/language.py b/ptodsl/language.py index 7f45ac42..de97b725 100644 --- a/ptodsl/language.py +++ b/ptodsl/language.py @@ -4,7 +4,7 @@ from mlir import ir as mlir_ir from mlir.dialects import arith, pto, scf -from mlir.ir import F16Type, F32Type, IndexType, InsertionPoint, IntegerType +from mlir.ir import IndexType, InsertionPoint, IntegerType def _unwrap(value): @@ -69,7 +69,7 @@ def __le__(self, other): def __ge__(self, other): return Value._cmp(self, other, arith.CmpIPredicate.sge) - + def __eq__(self, other): return 
Value._cmp(self, other, arith.CmpIPredicate.eq) @@ -100,7 +100,9 @@ def data(self): def scale_k(self, k): if k % self.scale_factor != 0: - raise ValueError(f"k={k} must be divisible by scale_factor={self.scale_factor} for MXFP8.") + raise ValueError( + f"k={k} must be divisible by scale_factor={self.scale_factor} for MXFP8." + ) return k // self.scale_factor @@ -122,9 +124,13 @@ def make_mxfp8(*, lhs="e5m2", rhs="e5m2", acc=None, scale_factor=32): "e5m2": __getattr__("fp8_e5m2"), } if lhs not in variants: - raise ValueError(f"Unsupported lhs variant '{lhs}'. Expected one of: {', '.join(sorted(variants))}.") + raise ValueError( + f"Unsupported lhs variant '{lhs}'. Expected one of: {', '.join(sorted(variants))}." + ) if rhs not in variants: - raise ValueError(f"Unsupported rhs variant '{rhs}'. Expected one of: {', '.join(sorted(variants))}.") + raise ValueError( + f"Unsupported rhs variant '{rhs}'. Expected one of: {', '.join(sorted(variants))}." + ) return MXFP8DType( lhs=variants[lhs], rhs=variants[rhs], @@ -139,9 +145,9 @@ def __getattr__(name): if name == "bool": return IntegerType.get_signless(1) if name == "float32": - return F32Type.get() + return _get_mlir_float_type(name, "F32Type", "Float32Type") if name == "float16": - return F16Type.get() + return _get_mlir_float_type(name, "F16Type", "Float16Type") if name == "bfloat16": return _get_mlir_float_type(name, "BF16Type") if name in ("fp8_e4m3", "float8_e4m3"): @@ -176,7 +182,9 @@ def SubTensorType(*, shape, dtype): class TileBufConfig: - def __init__(self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null"): + def __init__( + self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pad="Null" + ): # TODO: expose and validate a broader set of tile buffer knobs if PTO adds # more layout/padding/fractal settings that should be configurable here. 
self._bl = pto.BLayoutAttr.get(getattr(pto.BLayout, blayout)) @@ -186,7 +194,9 @@ def __init__(self, blayout="RowMajor", slayout="NoneBox", s_fractal_size=512, pa @property def attr(self): - return pto.TileBufConfigAttr.get(self._bl, self._sl, self._s_fractal_size, self._pd) + return pto.TileBufConfigAttr.get( + self._bl, self._sl, self._s_fractal_size, self._pd + ) def _default_tile_config(memory_space, shape): @@ -194,21 +204,51 @@ def _default_tile_config(memory_space, shape): # Defaults mirror the explicit configs used by the verbose matmul builder. if space == "MAT": if len(shape) >= 1 and shape[0] == 1: - return TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=pto.TileConfig.fractalABSize) - return TileBufConfig(blayout="ColMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalABSize) + return TileBufConfig( + blayout="RowMajor", + slayout="NoneBox", + s_fractal_size=pto.TileConfig.fractalABSize, + ) + return TileBufConfig( + blayout="ColMajor", + slayout="RowMajor", + s_fractal_size=pto.TileConfig.fractalABSize, + ) if space == "LEFT": - return TileBufConfig(blayout="RowMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalABSize) + return TileBufConfig( + blayout="RowMajor", + slayout="RowMajor", + s_fractal_size=pto.TileConfig.fractalABSize, + ) if space == "RIGHT": - return TileBufConfig(blayout="RowMajor", slayout="ColMajor", s_fractal_size=pto.TileConfig.fractalABSize) + return TileBufConfig( + blayout="RowMajor", + slayout="ColMajor", + s_fractal_size=pto.TileConfig.fractalABSize, + ) if space == "ACC": - return TileBufConfig(blayout="ColMajor", slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalCSize) + return TileBufConfig( + blayout="ColMajor", + slayout="RowMajor", + s_fractal_size=pto.TileConfig.fractalCSize, + ) if space == "BIAS": - return TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=pto.TileConfig.fractalABSize) + return TileBufConfig( + blayout="RowMajor", + 
slayout="NoneBox", + s_fractal_size=pto.TileConfig.fractalABSize, + ) if space == "SCALING": - return TileBufConfig(blayout="RowMajor", slayout="NoneBox", s_fractal_size=pto.TileConfig.fractalABSize) + return TileBufConfig( + blayout="RowMajor", + slayout="NoneBox", + s_fractal_size=pto.TileConfig.fractalABSize, + ) if space == "VEC": return TileBufConfig() - raise ValueError(f"Unsupported memory_space '{memory_space}' for default tile config.") + raise ValueError( + f"Unsupported memory_space '{memory_space}' for default tile config." + ) def TileBufType(*, shape, dtype, memory_space, valid_shape=None, config=None): @@ -228,7 +268,13 @@ def LeftScaleTileBufType(*, shape, dtype, valid_shape=None, config=None): slayout="RowMajor", s_fractal_size=pto.TileConfig.fractalMxSize, ) - return TileBufType(shape=shape, dtype=dtype, memory_space="SCALING", valid_shape=valid_shape, config=config) + return TileBufType( + shape=shape, + dtype=dtype, + memory_space="SCALING", + valid_shape=valid_shape, + config=config, + ) def RightScaleTileBufType(*, shape, dtype, valid_shape=None, config=None): @@ -238,7 +284,13 @@ def RightScaleTileBufType(*, shape, dtype, valid_shape=None, config=None): slayout="ColMajor", s_fractal_size=pto.TileConfig.fractalMxSize, ) - return TileBufType(shape=shape, dtype=dtype, memory_space="SCALING", valid_shape=valid_shape, config=config) + return TileBufType( + shape=shape, + dtype=dtype, + memory_space="SCALING", + valid_shape=valid_shape, + config=config, + ) def const(value): @@ -272,13 +324,17 @@ def index_cast(value, index_type=IndexType): def as_tensor(tensor_type, *, ptr, shape, strides): shape_vals = [_unwrap(v) for v in shape] stride_vals = [_unwrap(v) for v in strides] - return pto.MakeTensorViewOp(tensor_type, _unwrap(ptr), shape_vals, stride_vals).result + return pto.MakeTensorViewOp( + tensor_type, _unwrap(ptr), shape_vals, stride_vals + ).result def slice_view(subtensor_type, *, source, offsets, sizes): offset_vals = [_unwrap(v) for v in 
offsets] size_vals = [_unwrap(v) for v in sizes] - return pto.PartitionViewOp(subtensor_type, source, offsets=offset_vals, sizes=size_vals).result + return pto.PartitionViewOp( + subtensor_type, source, offsets=offset_vals, sizes=size_vals + ).result @contextmanager @@ -447,18 +503,22 @@ def ge(a, b): def select(cond, true_val, false_val): - return Value(arith.SelectOp(_unwrap(cond), _unwrap(true_val), _unwrap(false_val)).result) + return Value( + arith.SelectOp(_unwrap(cond), _unwrap(true_val), _unwrap(false_val)).result + ) class _IfElseBranch: def __init__(self, if_op): self._if_op = if_op + @contextmanager def else_context(self): with InsertionPoint(self._if_op.else_block): yield scf.YieldOp([]) + @contextmanager def if_context(condition, has_else=False): if has_else: @@ -483,6 +543,7 @@ def cond(condition, then_builder, else_builder): scf.YieldOp([]) return op + def _resolve_sync_op(sync_op): if isinstance(sync_op, str): normalized = sync_op.strip().upper() @@ -503,24 +564,39 @@ def _resolve_event_id(event_id): return event_id -def record_event(record_op, wait_op, event_id: int|Sequence[int]=0): +def record_event(record_op, wait_op, event_id: int | Sequence[int] = 0): if not isinstance(event_id, int): for eid in event_id: - pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(eid)) + pto.record_event( + _resolve_sync_op(record_op), + _resolve_sync_op(wait_op), + _resolve_event_id(eid), + ) else: - pto.record_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) - + pto.record_event( + _resolve_sync_op(record_op), + _resolve_sync_op(wait_op), + _resolve_event_id(event_id), + ) -def wait_event(record_op, wait_op, event_id: int|Sequence[int]=0): +def wait_event(record_op, wait_op, event_id: int | Sequence[int] = 0): if not isinstance(event_id, int): for eid in event_id: - pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(eid)) + pto.wait_event( + 
_resolve_sync_op(record_op), + _resolve_sync_op(wait_op), + _resolve_event_id(eid), + ) else: - pto.wait_event(_resolve_sync_op(record_op), _resolve_sync_op(wait_op), _resolve_event_id(event_id)) + pto.wait_event( + _resolve_sync_op(record_op), + _resolve_sync_op(wait_op), + _resolve_event_id(event_id), + ) -def record_wait_pair(record_op, wait_op, event_id: int|Sequence[int]=0): +def record_wait_pair(record_op, wait_op, event_id: int | Sequence[int] = 0): rec = _resolve_sync_op(record_op) w = _resolve_sync_op(wait_op) ev = _resolve_event_id(event_id) @@ -533,4 +609,4 @@ def barrier(sync_op): def row_sum(src, tmp, dst): - pto.TRowSumOp(src = src, tmp = tmp, dst = dst) + pto.TRowSumOp(src=src, tmp=tmp, dst=dst) diff --git a/ptodsl/lib/a5/__init__.py b/ptodsl/lib/a5/__init__.py index 6bc0da50..61670f55 100644 --- a/ptodsl/lib/a5/__init__.py +++ b/ptodsl/lib/a5/__init__.py @@ -8,7 +8,11 @@ build_templated_elementwise_add, ) from .ops import * -from .tile_micro_coverage import TILE_MICRO_COVERAGE, coverage_markdown, coverage_summary +from .tile_micro_coverage import ( + TILE_MICRO_COVERAGE, + coverage_markdown, + coverage_summary, +) __all__ = list(ops.__all__) + [ "KERNEL_BUILDERS", diff --git a/ptodsl/lib/a5/generated/a5_cube_matmul.pto b/ptodsl/lib/a5/generated/a5_cube_matmul.pto index ae16b9d4..7f52f654 100644 --- a/ptodsl/lib/a5/generated/a5_cube_matmul.pto +++ b/ptodsl/lib/a5/generated/a5_cube_matmul.pto @@ -44,4 +44,3 @@ module { return } } - diff --git a/ptodsl/lib/a5/generated/a5_elementwise_add.pto b/ptodsl/lib/a5/generated/a5_elementwise_add.pto index 1363c6d2..598b5bfb 100644 --- a/ptodsl/lib/a5/generated/a5_elementwise_add.pto +++ b/ptodsl/lib/a5/generated/a5_elementwise_add.pto @@ -48,4 +48,3 @@ module { return } } - diff --git a/ptodsl/lib/a5/generated/a5_micro_vector_copy.pto b/ptodsl/lib/a5/generated/a5_micro_vector_copy.pto index a5c5b940..7cda605b 100644 --- a/ptodsl/lib/a5/generated/a5_micro_vector_copy.pto +++ 
b/ptodsl/lib/a5/generated/a5_micro_vector_copy.pto @@ -8,4 +8,3 @@ module { return } } - diff --git a/ptodsl/lib/a5/kernels.py b/ptodsl/lib/a5/kernels.py index 70f3f301..4bd7e531 100644 --- a/ptodsl/lib/a5/kernels.py +++ b/ptodsl/lib/a5/kernels.py @@ -170,7 +170,9 @@ def a5_mxfp8_matmul( return a5_mxfp8_matmul -def build_cube_matmul(*, m=16, k=32, n=16, lhs_dtype=None, rhs_dtype=None, acc_dtype=None): +def build_cube_matmul( + *, m=16, k=32, n=16, lhs_dtype=None, rhs_dtype=None, acc_dtype=None +): lhs_dtype = pto.float16 if lhs_dtype is None else lhs_dtype rhs_dtype = pto.float16 if rhs_dtype is None else rhs_dtype acc_dtype = pto.float32 if acc_dtype is None else acc_dtype @@ -183,7 +185,9 @@ def meta_data(): } @to_ir_module(meta_data=meta_data) - def a5_cube_matmul(lhs_ptr: "ptr_lhs", rhs_ptr: "ptr_rhs", out_ptr: "ptr_out") -> None: + def a5_cube_matmul( + lhs_ptr: "ptr_lhs", rhs_ptr: "ptr_rhs", out_ptr: "ptr_out" + ) -> None: c0 = s.const(0) lhs = pto.make_tensor(lhs_ptr, shape=[m, k], dtype=lhs_dtype) rhs = pto.make_tensor(rhs_ptr, shape=[k, n], dtype=rhs_dtype) diff --git a/ptodsl/lib/a5/ops.py b/ptodsl/lib/a5/ops.py index 689bd7da..996c6723 100644 --- a/ptodsl/lib/a5/ops.py +++ b/ptodsl/lib/a5/ops.py @@ -10,7 +10,6 @@ from ... 
import const_expr, range_constexpr from ...api.scalar import _unwrap - VF_IMPL_DEFAULT = "default" VF_IMPL_1D_NO_POST_UPDATE = "1d_no_post_update" VF_IMPL_1D_POST_UPDATE = "1d_post_update" @@ -70,9 +69,24 @@ def _dtype_token(dtype): def _dtype_byte_width(dtype): text = str(dtype) - if "float32" in text or "f32" in text or "int32" in text or "i32" in text or "uint32" in text or "u32" in text: + if ( + "float32" in text + or "f32" in text + or "int32" in text + or "i32" in text + or "uint32" in text + or "u32" in text + ): return 4 - if "float16" in text or "f16" in text or "bfloat16" in text or "bf16" in text or "int16" in text or "i16" in text or "u16" in text: + if ( + "float16" in text + or "f16" in text + or "bfloat16" in text + or "bf16" in text + or "int16" in text + or "i16" in text + or "u16" in text + ): return 2 if "i8" in text or "u8" in text: return 1 @@ -215,7 +229,9 @@ def _normalize_vf_impl_kind(impl): } if normalized not in aliases: supported = ", ".join(sorted(aliases)) - raise ValueError(f"Unsupported VF impl kind '{impl}'. Expected one of: {supported}.") + raise ValueError( + f"Unsupported VF impl kind '{impl}'. Expected one of: {supported}." 
+ ) return aliases[normalized] @@ -584,7 +600,9 @@ def row_expand_micro(src_view, out_view, *, dtype, shape, base_addr=0): return out_view -def row_expand_sub_micro(base_view, expand_view, out_view, *, dtype, shape, base_addr=0): +def row_expand_sub_micro( + base_view, expand_view, out_view, *, dtype, shape, base_addr=0 +): return _row_expand_binary_micro( base_view, expand_view, @@ -596,7 +614,9 @@ def row_expand_sub_micro(base_view, expand_view, out_view, *, dtype, shape, base ) -def row_expand_mul_micro(base_view, expand_view, out_view, *, dtype, shape, base_addr=0): +def row_expand_mul_micro( + base_view, expand_view, out_view, *, dtype, shape, base_addr=0 +): return _row_expand_binary_micro( base_view, expand_view, @@ -608,7 +628,9 @@ def row_expand_mul_micro(base_view, expand_view, out_view, *, dtype, shape, base ) -def row_expand_div_micro(base_view, expand_view, out_view, *, dtype, shape, base_addr=0): +def row_expand_div_micro( + base_view, expand_view, out_view, *, dtype, shape, base_addr=0 +): return _row_expand_binary_micro( base_view, expand_view, @@ -659,7 +681,9 @@ def row_min_micro(src_view, out_view, *, dtype, shape, base_addr=0): ) -def col_sum_micro(src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL_DEFAULT): +def col_sum_micro( + src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL_DEFAULT +): return _col_reduce_micro( src_view, out_view, @@ -671,7 +695,9 @@ def col_sum_micro(src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL ) -def col_max_micro(src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL_DEFAULT): +def col_max_micro( + src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL_DEFAULT +): return _col_reduce_micro( src_view, out_view, @@ -683,7 +709,9 @@ def col_max_micro(src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL ) -def col_min_micro(src_view, out_view, *, dtype, shape, base_addr=0, impl=VF_IMPL_DEFAULT): +def col_min_micro( + src_view, out_view, *, dtype, shape, 
base_addr=0, impl=VF_IMPL_DEFAULT +): return _col_reduce_micro( src_view, out_view, @@ -879,7 +907,9 @@ def _check_col_reduce_operands(src_view, out_view, *, dtype, shape, context): return rows, cols -def _check_gather_operands(src_view, indices_view, out_view, *, dtype, index_dtype, shape): +def _check_gather_operands( + src_view, indices_view, out_view, *, dtype, index_dtype, shape +): rows, cols = _require_static_matrix_shape(shape, context="TGATHER") dtype_token = _require_supported_dtype( dtype, @@ -995,13 +1025,21 @@ def _check_sort32_operands(src_view, idx_view, out_view, *, dtype, shape): message="Idx must be uint32_t.", ) if cols % 32 != 0: - raise ValueError("TSORT32 micro lowering currently requires column count divisible by 32.") + raise ValueError( + "TSORT32 micro lowering currently requires column count divisible by 32." + ) return rows, cols, out_cols -def _row_expand_binary_micro(base_view, expand_view, out_view, *, dtype, shape, base_addr, op_name): +def _row_expand_binary_micro( + base_view, expand_view, out_view, *, dtype, shape, base_addr, op_name +): rows, cols = _check_row_expand_operands( - expand_view, out_view, dtype=dtype, shape=shape, context=f"TROWEXPAND_{op_name[1:].upper()}" + expand_view, + out_view, + dtype=dtype, + shape=shape, + context=f"TROWEXPAND_{op_name[1:].upper()}", ) _require_view_shape( base_view, @@ -1022,11 +1060,15 @@ def _row_expand_binary_micro(base_view, expand_view, out_view, *, dtype, shape, expand_addr_value = _const_i64(base_addr + buf_bytes) out_addr_value = _const_i64(base_addr + buf_bytes * 2) - base_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=base_addr_value) + base_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc( + addr=base_addr_value + ) expand_tile = _dsl_pto.make_tile_buffer( dtype, shape, space="VEC", valid_shape=[rows, 1] ).alloc(addr=expand_addr_value) - out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr_value) + out_tile = 
_dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc( + addr=out_addr_value + ) _dsl_pto.load(base_view, base_tile) _dsl_pto.load(expand_view, expand_tile) @@ -1199,7 +1241,9 @@ def _col_reduce_micro_no_post_update( accum = reduce_op(vreg_type, accum, tmp, mask) if const_expr(remain): tail_row = 2 * loop_pairs + 1 - src_tail = _dsl_pto.vlds(vreg_type, src_ptr, _scalar.const(col + tail_row * cols)) + src_tail = _dsl_pto.vlds( + vreg_type, src_ptr, _scalar.const(col + tail_row * cols) + ) accum = reduce_op(vreg_type, accum, src_tail, mask) _dsl_pto.vsts(accum, out_ptr, _scalar.const(col), mask) @@ -1217,12 +1261,18 @@ def _col_reduce_micro_post_update( active = builtins.min(lanes, cols - col) mask = _mask_for_chunk(dtype, active) chunk_base = src_cursor - accum, src_cursor = _dsl_pto.vlds_post(vreg_type, ptr_type, src_cursor, lane_step) + accum, src_cursor = _dsl_pto.vlds_post( + vreg_type, ptr_type, src_cursor, lane_step + ) row0_ptr = _dsl_pto.addptr(chunk_base, _scalar.const(cols)) row1_ptr = _dsl_pto.addptr(chunk_base, _scalar.const(cols * 2)) for _ in range_constexpr(loop_pairs): - src0, row0_ptr = _dsl_pto.vlds_post(vreg_type, ptr_type, row0_ptr, pair_stride) - src1, row1_ptr = _dsl_pto.vlds_post(vreg_type, ptr_type, row1_ptr, pair_stride) + src0, row0_ptr = _dsl_pto.vlds_post( + vreg_type, ptr_type, row0_ptr, pair_stride + ) + src1, row1_ptr = _dsl_pto.vlds_post( + vreg_type, ptr_type, row1_ptr, pair_stride + ) tmp = reduce_op(vreg_type, src0, src1, mask) accum = reduce_op(vreg_type, accum, tmp, mask) if const_expr(remain): @@ -1242,7 +1292,12 @@ def _gather_micro( base_addr, ): rows, cols, _, _ = _check_gather_operands( - src_view, indices_view, out_view, dtype=dtype, index_dtype=index_dtype, shape=shape + src_view, + indices_view, + out_view, + dtype=dtype, + index_dtype=index_dtype, + shape=shape, ) src_bytes = rows * cols * _dtype_byte_width(dtype) idx_bytes = rows * cols * _dtype_byte_width(index_dtype) @@ -1252,7 +1307,9 @@ def _gather_micro( 
out_addr = _const_i64(base_addr + src_bytes + idx_bytes) src_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=src_addr) - idx_tile = _dsl_pto.make_tile_buffer(index_dtype, shape, space="VEC").alloc(addr=idx_addr) + idx_tile = _dsl_pto.make_tile_buffer(index_dtype, shape, space="VEC").alloc( + addr=idx_addr + ) out_tile = _dsl_pto.make_tile_buffer(dtype, shape, space="VEC").alloc(addr=out_addr) _dsl_pto.load(src_view, src_tile) @@ -1272,7 +1329,9 @@ def _gather_micro( offset = _scalar.const(row_base + col) mask = _mask_for_chunk(dtype, active) idx_vec = _dsl_pto.vlds(index_vreg_type, idx_ptr, offset) - out_vec = _dsl_pto.vgather2(vreg_type, src_ptr, idx_vec, _scalar.const(active)) + out_vec = _dsl_pto.vgather2( + vreg_type, src_ptr, idx_vec, _scalar.const(active) + ) _dsl_pto.vsts(out_vec, out_ptr, offset, mask) _dsl_pto.store(out_tile, out_view) @@ -1280,7 +1339,9 @@ def _gather_micro( def _mrgsort_micro(src_view, out_view, *, dtype, shape, block_len, base_addr): - _, cols = _check_mrgsort_operands(src_view, out_view, dtype=dtype, shape=shape, block_len=block_len) + _, cols = _check_mrgsort_operands( + src_view, out_view, dtype=dtype, shape=shape, block_len=block_len + ) src_addr = _const_i64(base_addr) out_addr = _const_i64(base_addr + cols * _dtype_byte_width(dtype)) @@ -1320,7 +1381,9 @@ def _mrgsort_micro(src_view, out_view, *, dtype, shape, block_len, base_addr): def _sort32_micro(src_view, idx_view, out_view, *, dtype, shape, base_addr): - rows, cols, out_cols = _check_sort32_operands(src_view, idx_view, out_view, dtype=dtype, shape=shape) + rows, cols, out_cols = _check_sort32_operands( + src_view, idx_view, out_view, dtype=dtype, shape=shape + ) src_bytes = rows * cols * _dtype_byte_width(dtype) idx_bytes = rows * cols * 4 @@ -1328,9 +1391,15 @@ def _sort32_micro(src_view, idx_view, out_view, *, dtype, shape, base_addr): idx_addr = _const_i64(base_addr + src_bytes) out_addr = _const_i64(base_addr + src_bytes + idx_bytes) - src_tile = 
_dsl_pto.make_tile_buffer(dtype, [rows, cols], space="VEC").alloc(addr=src_addr) - idx_tile = _dsl_pto.make_tile_buffer(_dsl_pto.uint32, [rows, cols], space="VEC").alloc(addr=idx_addr) - out_tile = _dsl_pto.make_tile_buffer(dtype, [rows, out_cols], space="VEC").alloc(addr=out_addr) + src_tile = _dsl_pto.make_tile_buffer(dtype, [rows, cols], space="VEC").alloc( + addr=src_addr + ) + idx_tile = _dsl_pto.make_tile_buffer( + _dsl_pto.uint32, [rows, cols], space="VEC" + ).alloc(addr=idx_addr) + out_tile = _dsl_pto.make_tile_buffer(dtype, [rows, out_cols], space="VEC").alloc( + addr=out_addr + ) _dsl_pto.load(src_view, src_tile) _dsl_pto.load(idx_view, idx_tile) @@ -1350,9 +1419,16 @@ def _sort32_micro(src_view, idx_view, out_view, *, dtype, shape, base_addr): return out_view -def _binary_micro(lhs_view, rhs_view, out_view, *, dtype, shape, lanes, base_addr, op_name, impl): +def _binary_micro( + lhs_view, rhs_view, out_view, *, dtype, shape, lanes, base_addr, op_name, impl +): rows, cols = _check_tbinop_operands( - lhs_view, rhs_view, out_view, dtype=dtype, shape=shape, context=op_name.upper().replace("V", "T", 1) + lhs_view, + rhs_view, + out_view, + dtype=dtype, + shape=shape, + context=op_name.upper().replace("V", "T", 1), ) lanes = _resolve_lanes(dtype, lanes) element_count = rows * cols @@ -1377,7 +1453,9 @@ def _binary_micro(lhs_view, rhs_view, out_view, *, dtype, shape, lanes, base_add impl_kind = _normalize_vf_impl_kind(impl) is_contiguous = rows == 1 or cols == element_count if const_expr(impl_kind == VF_IMPL_DEFAULT): - impl_kind = VF_IMPL_1D_POST_UPDATE if is_contiguous else VF_IMPL_2D_NO_POST_UPDATE + impl_kind = ( + VF_IMPL_1D_POST_UPDATE if is_contiguous else VF_IMPL_2D_NO_POST_UPDATE + ) if const_expr(impl_kind == VF_IMPL_1D_NO_POST_UPDATE): _binary_micro_1d_no_post_update( @@ -1447,7 +1525,16 @@ def _binary_micro_1d_no_post_update( def _binary_micro_1d_post_update( - lhs_ptr, rhs_ptr, out_ptr, *, ptr_type, dtype, lanes, element_count, vreg_type, micro_op 
+ lhs_ptr, + rhs_ptr, + out_ptr, + *, + ptr_type, + dtype, + lanes, + element_count, + vreg_type, + micro_op, ): lhs_cursor = lhs_ptr rhs_cursor = rhs_ptr @@ -1456,8 +1543,12 @@ def _binary_micro_1d_post_update( for offset in range_constexpr(0, element_count, lanes): active = builtins.min(lanes, element_count - offset) mask = _mask_for_chunk(dtype, active) - lhs_vec, lhs_cursor = _dsl_pto.vlds_post(vreg_type, ptr_type, lhs_cursor, lane_step) - rhs_vec, rhs_cursor = _dsl_pto.vlds_post(vreg_type, ptr_type, rhs_cursor, lane_step) + lhs_vec, lhs_cursor = _dsl_pto.vlds_post( + vreg_type, ptr_type, lhs_cursor, lane_step + ) + rhs_vec, rhs_cursor = _dsl_pto.vlds_post( + vreg_type, ptr_type, rhs_cursor, lane_step + ) out_vec = micro_op(vreg_type, lhs_vec, rhs_vec, mask) out_cursor = _dsl_pto.vsts_post(ptr_type, out_vec, out_cursor, lane_step, mask) @@ -1495,7 +1586,9 @@ def _binary_micro_2d_post_update( def _rsqrt_micro(src_view, out_view, *, dtype, shape, lanes, base_addr): if any(not isinstance(dim, int) for dim in shape): - raise ValueError("micro tile lowering currently requires a static integer shape.") + raise ValueError( + "micro tile lowering currently requires a static integer shape." + ) lanes = _resolve_lanes(dtype, lanes) element_count = 1 @@ -1530,7 +1623,9 @@ def _rsqrt_micro(src_view, out_view, *, dtype, shape, lanes, base_addr): def _unary_micro(src_view, out_view, *, dtype, shape, lanes, base_addr, op_name): if any(not isinstance(dim, int) for dim in shape): - raise ValueError("micro tile lowering currently requires a static integer shape.") + raise ValueError( + "micro tile lowering currently requires a static integer shape." 
+ ) lanes = _resolve_lanes(dtype, lanes) element_count = 1 @@ -1555,8 +1650,10 @@ def _unary_micro(src_view, out_view, *, dtype, shape, lanes, base_addr, op_name) mask = _mask_for_chunk(dtype, active) index = _scalar.const(offset) src_vec = _dsl_pto.vlds(_dsl_pto.VRegType(lanes, dtype), src_ptr, index) - out_vec = src_vec if micro_op is None else micro_op( - _dsl_pto.VRegType(lanes, dtype), src_vec, mask + out_vec = ( + src_vec + if micro_op is None + else micro_op(_dsl_pto.VRegType(lanes, dtype), src_vec, mask) ) _dsl_pto.vsts(out_vec, out_ptr, index, mask) diff --git a/ptodsl/lib/a5/tile_micro_coverage.py b/ptodsl/lib/a5/tile_micro_coverage.py index 33ea434f..c3b98677 100644 --- a/ptodsl/lib/a5/tile_micro_coverage.py +++ b/ptodsl/lib/a5/tile_micro_coverage.py @@ -1,6 +1,5 @@ from ptodsl import tile - TILE_MICRO_COVERAGE = { "mov": { "status": "implemented", @@ -191,7 +190,9 @@ def coverage_markdown(): for name in tile.__all__: entry = TILE_MICRO_COVERAGE[name] helper = entry["helper"] or "-" - lines.append(f"| `{name}` | `{entry['status']}` | `{helper}` | {entry['note']} |") + lines.append( + f"| `{name}` | `{entry['status']}` | `{helper}` | {entry['note']} |" + ) return "\n".join(lines) + "\n" diff --git a/tests/frontend/test_mxfp8_frontend.py b/tests/frontend/test_mxfp8_frontend.py index 03b0a70e..76aebf8c 100644 --- a/tests/frontend/test_mxfp8_frontend.py +++ b/tests/frontend/test_mxfp8_frontend.py @@ -11,6 +11,7 @@ def get(): def test_mxfp8_family_uses_e5m2_data_and_e8m0_scale(monkeypatch): stub_ir = types.SimpleNamespace( + F32Type=_StubType, Float8E5M2Type=_StubType, Float8E8M0FNUType=_StubType, Float8E4M3FNType=_StubType, @@ -29,6 +30,7 @@ def test_mxfp8_family_uses_e5m2_data_and_e8m0_scale(monkeypatch): def test_float8_aliases_accept_common_mlir_ctor_names(monkeypatch): stub_ir = types.SimpleNamespace( + F32Type=_StubType, Float8E4M3FNType=_StubType, Float8E5M2Type=_StubType, Float8E8M0FNUType=_StubType, @@ -42,6 +44,7 @@ def 
test_float8_aliases_accept_common_mlir_ctor_names(monkeypatch): def test_make_mxfp8_accepts_mixed_lhs_rhs_variants(monkeypatch): stub_ir = types.SimpleNamespace( + F32Type=_StubType, Float8E4M3FNType=_StubType, Float8E5M2Type=_StubType, Float8E8M0FNUType=_StubType, diff --git a/tests/regression/test_a5_lib_regression.py b/tests/regression/test_a5_lib_regression.py index 6dfab991..d9b23c7d 100644 --- a/tests/regression/test_a5_lib_regression.py +++ b/tests/regression/test_a5_lib_regression.py @@ -239,7 +239,9 @@ def a5_row_reduce_micro(src: "ptr_t", dst: "ptr_t") -> None: ("col_min_micro", "pto.vmin", "pto.tcolmin", a5.VF_IMPL_1D_POST_UPDATE), ], ) -def test_a5_col_reduce_micro_emits_template_lowering(helper_name, reduce_op, tile_op, impl): +def test_a5_col_reduce_micro_emits_template_lowering( + helper_name, reduce_op, tile_op, impl +): def meta_data(): return { "ptr_t": pto.ptr(pto.float32), @@ -340,7 +342,10 @@ def test_a5_add_micro_rejects_view_dtype_mismatch(): def meta_data(): return {"ptr_t": pto.ptr(pto.float16)} - with pytest.raises(ValueError, match="TADD input tile src0, src1 and dst tile data type mismatch"): + with pytest.raises( + ValueError, match="TADD input tile src0, src1 and dst tile data type mismatch" + ): + @to_ir_module(meta_data=meta_data) def invalid_add(src0: "ptr_t", src1: "ptr_t", dst: "ptr_t") -> None: lhs = pto.make_tensor(src0, shape=[32, 32], dtype=pto.float16) @@ -360,7 +365,10 @@ def test_a5_row_expand_micro_rejects_non_column_source(): def meta_data(): return {"ptr_t": pto.ptr(pto.float32)} - with pytest.raises(ValueError, match="TROWEXPAND source valid shape must be \\[rows, 1\\]"): + with pytest.raises( + ValueError, match="TROWEXPAND source valid shape must be \\[rows, 1\\]" + ): + @to_ir_module(meta_data=meta_data) def invalid_row_expand(src: "ptr_t", dst: "ptr_t") -> None: src_view = pto.make_tensor(src, shape=[1, 32], dtype=pto.float32) @@ -379,6 +387,7 @@ def meta_data(): return {"ptr_t": pto.ptr(pto.float32)} with 
pytest.raises(ValueError, match="use a single-column output tile"): + @to_ir_module(meta_data=meta_data) def invalid_row_reduce(src: "ptr_t", dst: "ptr_t") -> None: src_view = pto.make_tensor(src, shape=[32, 32], dtype=pto.float32) @@ -397,6 +406,7 @@ def meta_data(): return {"ptr_t": pto.ptr(pto.bool)} with pytest.raises(ValueError, match="TCOLREDUCE input data type is not supported"): + @to_ir_module(meta_data=meta_data) def invalid_col_reduce(src: "ptr_t", dst: "ptr_t") -> None: src_view = pto.make_tensor(src, shape=[32, 32], dtype=pto.bool) diff --git a/tests/regression/test_tile_micro_coverage.py b/tests/regression/test_tile_micro_coverage.py index 5f17ded9..81dbe74c 100644 --- a/tests/regression/test_tile_micro_coverage.py +++ b/tests/regression/test_tile_micro_coverage.py @@ -35,5 +35,11 @@ def test_tile_micro_coverage_summary_matches_public_surface(): def test_checked_in_tile_micro_checklist_is_in_sync(): - checklist = Path(__file__).resolve().parents[2] / "ptodsl" / "lib" / "a5" / "TILE_MICRO_CHECKLIST.md" + checklist = ( + Path(__file__).resolve().parents[2] + / "ptodsl" + / "lib" + / "a5" + / "TILE_MICRO_CHECKLIST.md" + ) assert checklist.read_text(encoding="utf-8") == coverage_markdown()