-
Notifications
You must be signed in to change notification settings - Fork 31
refactor: drop deprecated pl.auto_chunk / chunked_loop_optimizer #372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,3 +1,3 @@ | ||||||||||
| # Copyright (c) PyPTO Contributors. | ||||||||||
| # This program is free software, you can redistribute it and/or modify it under the terms and conditions of | ||||||||||
| # CANN Open Software License Agreement Version 2.0 (the "License"). | ||||||||||
|
|
@@ -11,7 +11,7 @@ | |||||||||
| output = matmul(attn_out, wo) + hidden_states | ||||||||||
|
|
||||||||||
| Stage 0 (matmul: attn_out x wo) and Stage 1 (residual add) can be: | ||||||||||
| - Fused: single pl.at block with chunked_loop_optimizer (mix mode) | ||||||||||
| - Fused: single pl.at block with auto_chunk (mix mode) | ||||||||||
| - Split: separate pl.at blocks for each stage (split mode) | ||||||||||
|
Comment on lines
+14
to
15
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Update mix-mode docs to match explicit chunk loops. Line 14 and Line 39 still mention `auto_chunk`. ✏️ Suggested doc fix:
- - Fused: single pl.at block with auto_chunk (mix mode)
+ - Fused: single pl.at block with explicit chunk loops (mix mode)
@@
- """Build fused matmul + elementwise program with auto_chunk."""
+ """Build fused matmul + elementwise program with explicit chunk loops."""
Also applies to: 39-39 🤖 Prompt for AI Agents |
||||||||||
|
|
||||||||||
| Input and hidden_states are BF16; wo is BF16; output is FP32. | ||||||||||
|
|
@@ -36,7 +36,7 @@ | |||||||||
| batch_tile: int = BATCH_TILE, | ||||||||||
| chunk: int = 4, | ||||||||||
| ): | ||||||||||
| """Build fused matmul + elementwise program with chunked_loop_optimizer.""" | ||||||||||
| """Build fused matmul + elementwise program with auto_chunk.""" | ||||||||||
| k_blocks = hidden // k_chunk | ||||||||||
| n_blocks = hidden // n_chunk | ||||||||||
|
|
||||||||||
|
|
@@ -50,26 +50,27 @@ | |||||||||
| wo: pl.Tensor[[hidden, hidden], pl.BF16], | ||||||||||
| resid: pl.Out[pl.Tensor[[batch, hidden], pl.FP32]], | ||||||||||
| ) -> pl.Tensor[[batch, hidden], pl.FP32]: | ||||||||||
| with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): | ||||||||||
| for nb in pl.parallel(0, n_blocks, chunk=chunk): | ||||||||||
| n0 = nb * n_chunk | ||||||||||
| # First K-tile: initialize accumulator via matmul | ||||||||||
| a_chunk_0 = pl.slice(attn_out, [batch_tile, k_chunk], [0, 0]) | ||||||||||
| w_chunk_0 = pl.slice(wo, [k_chunk, n_chunk], [0, n0]) | ||||||||||
| acc = pl.matmul(a_chunk_0, w_chunk_0, out_dtype=pl.FP32) | ||||||||||
|
|
||||||||||
| # Remaining K-tiles: accumulate via matmul_acc | ||||||||||
| for kb in pl.range(1, k_blocks): | ||||||||||
| k0 = kb * k_chunk | ||||||||||
| a_chunk = pl.slice(attn_out, [batch_tile, k_chunk], [0, k0]) | ||||||||||
| w_chunk = pl.slice(wo, [k_chunk, n_chunk], [k0, n0]) | ||||||||||
| acc = pl.matmul_acc(acc, a_chunk, w_chunk) | ||||||||||
|
|
||||||||||
| # Elementwise residual addition | ||||||||||
| hidden_chunk = pl.slice(hidden_states, [batch_tile, n_chunk], [0, n0]) | ||||||||||
| hidden_chunk_f32 = pl.cast(hidden_chunk, target_type=pl.FP32) | ||||||||||
| resid_sum = pl.add(acc, hidden_chunk_f32) | ||||||||||
| resid = pl.assemble(resid, resid_sum, [0, n0]) | ||||||||||
| for nb_chunk in pl.parallel(0, n_blocks, 1 * chunk): | ||||||||||
| with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): | ||||||||||
| for nb in pl.range(nb_chunk, nb_chunk + 1 * chunk, 1): | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The inner loop
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The inner loop upper bound `nb_chunk + 1 * chunk` can exceed `n_blocks` when `n_blocks` is not a multiple of `chunk`.
Suggested change
|
||||||||||
| n0 = nb * n_chunk | ||||||||||
| # First K-tile: initialize accumulator via matmul | ||||||||||
| a_chunk_0 = pl.slice(attn_out, [batch_tile, k_chunk], [0, 0]) | ||||||||||
| w_chunk_0 = pl.slice(wo, [k_chunk, n_chunk], [0, n0]) | ||||||||||
| acc = pl.matmul(a_chunk_0, w_chunk_0, out_dtype=pl.FP32) | ||||||||||
|
|
||||||||||
| # Remaining K-tiles: accumulate via matmul_acc | ||||||||||
| for kb in pl.range(1, k_blocks): | ||||||||||
| k0 = kb * k_chunk | ||||||||||
| a_chunk = pl.slice(attn_out, [batch_tile, k_chunk], [0, k0]) | ||||||||||
| w_chunk = pl.slice(wo, [k_chunk, n_chunk], [k0, n0]) | ||||||||||
| acc = pl.matmul_acc(acc, a_chunk, w_chunk) | ||||||||||
|
|
||||||||||
| # Elementwise residual addition | ||||||||||
| hidden_chunk = pl.slice(hidden_states, [batch_tile, n_chunk], [0, n0]) | ||||||||||
| hidden_chunk_f32 = pl.cast(hidden_chunk, target_type=pl.FP32) | ||||||||||
| resid_sum = pl.add(acc, hidden_chunk_f32) | ||||||||||
| resid = pl.assemble(resid, resid_sum, [0, n0]) | ||||||||||
|
|
||||||||||
| return resid | ||||||||||
|
|
||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,3 +1,3 @@ | ||||||||||
| # Copyright (c) PyPTO Contributors. | ||||||||||
| # This program is free software, you can redistribute it and/or modify it under the terms and conditions of | ||||||||||
| # CANN Open Software License Agreement Version 2.0 (the "License"). | ||||||||||
|
|
@@ -46,13 +46,15 @@ | |||||||||
| b: pl.Tensor[[k, n], pl.FP32], | ||||||||||
| c: pl.Out[pl.Tensor[[m, n], pl.FP32]], | ||||||||||
| ) -> pl.Tensor[[m, n], pl.FP32]: | ||||||||||
| with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): | ||||||||||
| for mb in pl.parallel(0, m, m_tile, chunk=m_chunk): | ||||||||||
| for nb in pl.parallel(0, n, n_tile, chunk=n_chunk): | ||||||||||
| tile_a = pl.slice(a, [m_tile, k], [mb, 0]) | ||||||||||
| tile_b = pl.slice(b, [k, n_tile], [0, nb]) | ||||||||||
| tile_c = pl.matmul(tile_a, tile_b) | ||||||||||
| c = pl.assemble(c, tile_c, [mb, nb]) | ||||||||||
| for mb_chunk in pl.parallel(0, m, m_tile * m_chunk): | ||||||||||
| for nb_chunk in pl.parallel(0, n, n_tile * n_chunk): | ||||||||||
| with pl.at(level=pl.Level.CORE_GROUP): | ||||||||||
| for mb in pl.range(mb_chunk, mb_chunk + m_tile * m_chunk, m_tile): | ||||||||||
| for nb in pl.range(nb_chunk, nb_chunk + n_tile * n_chunk, n_tile): | ||||||||||
|
Comment on lines
+52
to
+53
Contributor
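There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These inner loops can exceed the dimensions `m` and `n` when they are not multiples of `m_tile * m_chunk` and `n_tile * n_chunk`, respectively.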
Suggested change
|
||||||||||
| tile_a = pl.slice(a, [m_tile, k], [mb, 0]) | ||||||||||
| tile_b = pl.slice(b, [k, n_tile], [0, nb]) | ||||||||||
| tile_c = pl.matmul(tile_a, tile_b) | ||||||||||
| c = pl.assemble(c, tile_c, [mb, nb]) | ||||||||||
|
|
||||||||||
| return c | ||||||||||
|
|
||||||||||
|
|
||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment is being updated to use `auto_chunk`, but `auto_chunk` is deprecated and its usage is being removed from the code in this PR. Since the implementation has migrated to explicit chunk loops, the documentation should reflect this change to avoid confusion.