Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 20 additions & 27 deletions examples/hello_world.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,44 +11,37 @@
The simplest PyPTO program: element-wise tensor addition.

Concepts introduced:
- @pl.program / @pl.function decorators
- InCore function: load tiles from global memory, compute, store back
- Orchestration function: calls InCore kernels on full tensors
- pl.Out[] marks output tensor parameters
- @pl.jit decorator: function specializes on torch tensor shape/dtype, compiles, caches
- pl.incore() context: a single on-chip compute scope (load tiles, compute, store back)
- pl.Out[] marks output tensor parameters (in-place mutation)
- Tensor (global memory) vs Tile (on-chip register) types

Run: python examples/hello_world.py
Next: examples/kernels/01_elementwise.py
"""

import pypto.language as pl
import torch
from pypto.runtime import RunConfig


# NOTE(review): two variants of the same element-wise add are interleaved
# below: a @pl.program class (InCore tile_add plus an Orchestration wrapper)
# and a standalone @pl.jit tile_add. The surrounding page reports additions
# and deletions, so this appears to be a diff rendered without +/- markers;
# confirm which lines belong to the final file.
@pl.program
class HelloWorldProgram:
@pl.function(type=pl.FunctionType.InCore)
def tile_add(
self,
a: pl.Tensor[[128, 128], pl.FP32],
b: pl.Tensor[[128, 128], pl.FP32],
c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
) -> pl.Tensor[[128, 128], pl.FP32]:
tile_a: pl.Tile[[128, 128], pl.FP32] = pl.load(a, [0, 0], [128, 128])
# @pl.jit variant: inside pl.incore(), load a [128, 128] tile from `a` and
# from `b` (the [0, 0] argument is presumably the start offset; confirm),
# add the two tiles with pl.add, and store the sum into `c`.
@pl.jit
def tile_add(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
with pl.incore():
tile_a = pl.load(a, [0, 0], [128, 128])
tile_b = pl.load(b, [0, 0], [128, 128])
tile_c = pl.add(tile_a, tile_b)
out_c = pl.store(tile_c, [0, 0], c)
return out_c

# Orchestration wrapper: forwards a, b and out_c to self.tile_add and returns
# whatever that call returns.
@pl.function(type=pl.FunctionType.Orchestration)
def orchestrator(
self,
a: pl.Tensor[[128, 128], pl.FP32],
b: pl.Tensor[[128, 128], pl.FP32],
out_c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
) -> pl.Tensor[[128, 128], pl.FP32]:
out_c_ret = self.tile_add(a, b, out_c)
return out_c_ret
# Variant that ignores pl.store's return value and returns the output
# parameter `c` itself.
pl.store(tile_c, [0, 0], c)
return c


if __name__ == "__main__":
print(HelloWorldProgram.as_python())
# Run the kernel on constant inputs (a is 2.0 everywhere, b is 3.0
# everywhere) with a zero-filled output buffer c, then check c against a + b.
a = torch.full((128, 128), 2.0, dtype=torch.float32)
b = torch.full((128, 128), 3.0, dtype=torch.float32)
c = torch.zeros((128, 128), dtype=torch.float32)
tile_add(a, b, c, config=RunConfig())
expected = a + b
# On failure the message reports the largest absolute element-wise difference.
assert torch.allclose(c, expected, rtol=1e-5, atol=1e-5), (
f"hello_world tile_add failed: max diff = {(c - expected).abs().max().item()}"
)
print("OK")
158 changes: 55 additions & 103 deletions examples/kernels/01_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,136 +10,88 @@
"""
Tile element-wise operations: add and multiply.

Programs:
TileAddProgram — c = a + b (128x128)
TileMulProgram — c = a * b (128x128)
Kernels:
tile_add_128 — c = a + b (128x128)
tile_mul_128 — c = a * b (128x128)
tile_add_64 — c = a + b (64x64)
tile_mul_64 — c = a * b (64x64)

Concepts introduced:
- pl.mul for element-wise multiplication
- Multiple programs in one file
- Multiple @pl.jit kernels in one file

Run: python examples/kernels/01_elementwise.py
Next: examples/kernels/02_fused_ops.py
"""

import pypto.language as pl
import torch
from pypto.runtime import RunConfig


# NOTE(review): this region interleaves a @pl.program class variant
# (TileAddProgram / TileMulProgram headers and an Orchestration wrapper) with
# the standalone @pl.jit tile_add_128. It appears to be a diff rendered
# without +/- markers; confirm which lines belong to the final file.
@pl.program
class TileAddProgram:
@pl.function(type=pl.FunctionType.InCore)
def tile_add(
self,
a: pl.Tensor[[128, 128], pl.FP32],
b: pl.Tensor[[128, 128], pl.FP32],
c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
) -> pl.Tensor[[128, 128], pl.FP32]:
# @pl.jit variant: inside pl.incore(), load a [128, 128] tile from `a` and
# from `b`, add them with pl.add, and store the sum into `c`.
@pl.jit
def tile_add_128(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
with pl.incore():
tile_a = pl.load(a, [0, 0], [128, 128])
tile_b = pl.load(b, [0, 0], [128, 128])
tile_c = pl.add(tile_a, tile_b)
out_c = pl.store(tile_c, [0, 0], c)
return out_c

# Orchestration wrapper: forwards a, b and out_c to self.tile_add and returns
# whatever that call returns.
@pl.function(type=pl.FunctionType.Orchestration)
def orchestrator(
self,
a: pl.Tensor[[128, 128], pl.FP32],
b: pl.Tensor[[128, 128], pl.FP32],
out_c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
) -> pl.Tensor[[128, 128], pl.FP32]:
out_c_ret = self.tile_add(a, b, out_c)
return out_c_ret


@pl.program
class TileMulProgram:
@pl.function(type=pl.FunctionType.InCore)
def tile_mul(
self,
a: pl.Tensor[[128, 128], pl.FP32],
b: pl.Tensor[[128, 128], pl.FP32],
c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
) -> pl.Tensor[[128, 128], pl.FP32]:
# Variant that ignores pl.store's return value and returns the output
# parameter `c` itself.
pl.store(tile_c, [0, 0], c)
return c


# NOTE(review): the @pl.jit tile_mul_128 below is interleaved with lines from
# a class-based variant (an Orchestration wrapper and the TileAdd64Program
# header). It appears to be a diff rendered without +/- markers; confirm which
# lines belong to the final file.
# Kernel: inside pl.incore(), load a [128, 128] tile from `a` and from `b`,
# multiply them element-wise with pl.mul, and store the product into `c`.
@pl.jit
def tile_mul_128(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
with pl.incore():
tile_a = pl.load(a, [0, 0], [128, 128])
tile_b = pl.load(b, [0, 0], [128, 128])
tile_c = pl.mul(tile_a, tile_b)
out_c = pl.store(tile_c, [0, 0], c)
return out_c

# Orchestration wrapper: forwards a, b and out_c to self.tile_mul and returns
# whatever that call returns.
@pl.function(type=pl.FunctionType.Orchestration)
def orchestrator(
self,
a: pl.Tensor[[128, 128], pl.FP32],
b: pl.Tensor[[128, 128], pl.FP32],
out_c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
) -> pl.Tensor[[128, 128], pl.FP32]:
out_c_ret = self.tile_mul(a, b, out_c)
return out_c_ret


@pl.program
class TileAdd64Program:
"""Element-wise addition on 64x64 tiles."""
# Variant that ignores pl.store's return value and returns the output
# parameter `c` itself.
pl.store(tile_c, [0, 0], c)
return c

# NOTE(review): this region interleaves a class-based InCore tile_add header
# and Orchestration wrapper with the standalone @pl.jit tile_add_64. It
# appears to be a diff rendered without +/- markers; confirm which lines
# belong to the final file.
@pl.function(type=pl.FunctionType.InCore)
def tile_add(
self,
a: pl.Tensor[[64, 64], pl.FP32],
b: pl.Tensor[[64, 64], pl.FP32],
c: pl.Out[pl.Tensor[[64, 64], pl.FP32]],
) -> pl.Tensor[[64, 64], pl.FP32]:

# @pl.jit variant: inside pl.incore(), load a [64, 64] tile from `a` and from
# `b`, add them with pl.add, and store the sum into `c`.
@pl.jit
def tile_add_64(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
"""Element-wise addition on 64x64 tiles."""
with pl.incore():
tile_a = pl.load(a, [0, 0], [64, 64])
tile_b = pl.load(b, [0, 0], [64, 64])
tile_c = pl.add(tile_a, tile_b)
out_c = pl.store(tile_c, [0, 0], c)
return out_c

# Orchestration wrapper: forwards a, b and out_c to self.tile_add and returns
# whatever that call returns.
@pl.function(type=pl.FunctionType.Orchestration)
def orchestrator(
self,
a: pl.Tensor[[64, 64], pl.FP32],
b: pl.Tensor[[64, 64], pl.FP32],
out_c: pl.Out[pl.Tensor[[64, 64], pl.FP32]],
) -> pl.Tensor[[64, 64], pl.FP32]:
out_c_ret = self.tile_add(a, b, out_c)
return out_c_ret


@pl.program
class TileMul64Program:
"""Element-wise multiplication on 64x64 tiles."""
# Variant that ignores pl.store's return value and returns the output
# parameter `c` itself.
pl.store(tile_c, [0, 0], c)
return c

# NOTE(review): this region interleaves a class-based InCore tile_mul header
# and Orchestration wrapper with the standalone @pl.jit tile_mul_64. It
# appears to be a diff rendered without +/- markers; confirm which lines
# belong to the final file.
@pl.function(type=pl.FunctionType.InCore)
def tile_mul(
self,
a: pl.Tensor[[64, 64], pl.FP32],
b: pl.Tensor[[64, 64], pl.FP32],
c: pl.Out[pl.Tensor[[64, 64], pl.FP32]],
) -> pl.Tensor[[64, 64], pl.FP32]:

# @pl.jit variant: inside pl.incore(), load a [64, 64] tile from `a` and from
# `b`, multiply them element-wise with pl.mul, and store the product into `c`.
@pl.jit
def tile_mul_64(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
"""Element-wise multiplication on 64x64 tiles."""
with pl.incore():
tile_a = pl.load(a, [0, 0], [64, 64])
tile_b = pl.load(b, [0, 0], [64, 64])
tile_c = pl.mul(tile_a, tile_b)
# Two store/return pairs follow: one returns pl.store's result, the other
# returns the output parameter `c` itself.
out_c = pl.store(tile_c, [0, 0], c)
return out_c
pl.store(tile_c, [0, 0], c)
return c


# Orchestration wrapper: forwards a, b and out_c to self.tile_mul and returns
# whatever that call returns.
@pl.function(type=pl.FunctionType.Orchestration)
def orchestrator(
self,
a: pl.Tensor[[64, 64], pl.FP32],
b: pl.Tensor[[64, 64], pl.FP32],
out_c: pl.Out[pl.Tensor[[64, 64], pl.FP32]],
) -> pl.Tensor[[64, 64], pl.FP32]:
out_c_ret = self.tile_mul(a, b, out_c)
return out_c_ret
# NOTE(review): two entry-point variants are interleaved below (one runs the
# four kernels on torch tensors and asserts the results, the other prints
# Program.as_python() and defines size-suffixed aliases). It appears to be a
# diff rendered without +/- markers; confirm which lines belong to the final
# file.
if __name__ == "__main__":
# A single RunConfig instance is passed to all four kernel calls below.
cfg = RunConfig()

# 128x128 case: a128 is 2.0 everywhere and b128 is 3.0 everywhere; c128 starts
# as zeros and is compared against a128 + b128 after the add kernel runs.
a128 = torch.full((128, 128), 2.0, dtype=torch.float32)
b128 = torch.full((128, 128), 3.0, dtype=torch.float32)
c128 = torch.zeros((128, 128), dtype=torch.float32)
tile_add_128(a128, b128, c128, config=cfg)
assert torch.allclose(c128, a128 + b128, rtol=1e-5, atol=1e-5)

# Aliases for backward compatibility with tests that use size-suffixed names
TileAdd128Program = TileAddProgram
TileMul128Program = TileMulProgram
# The output is re-created as zeros before the multiply kernel (presumably so
# a stale add result cannot satisfy the check; confirm intent).
c128 = torch.zeros((128, 128), dtype=torch.float32)
tile_mul_128(a128, b128, c128, config=cfg)
assert torch.allclose(c128, a128 * b128, rtol=1e-5, atol=1e-5)

# 64x64 case: same constant fill values, checked the same way for add.
a64 = torch.full((64, 64), 2.0, dtype=torch.float32)
b64 = torch.full((64, 64), 3.0, dtype=torch.float32)
c64 = torch.zeros((64, 64), dtype=torch.float32)
tile_add_64(a64, b64, c64, config=cfg)
assert torch.allclose(c64, a64 + b64, rtol=1e-5, atol=1e-5)

if __name__ == "__main__":
print("=== TileAddProgram ===")
print(TileAddProgram.as_python())
print("\n=== TileMulProgram ===")
print(TileMulProgram.as_python())
# Fresh zero-filled 64x64 output for the multiply check.
c64 = torch.zeros((64, 64), dtype=torch.float32)
tile_mul_64(a64, b64, c64, config=cfg)
assert torch.allclose(c64, a64 * b64, rtol=1e-5, atol=1e-5)

print("OK")
Loading
Loading