hw-native-sys · lyfne123 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/examples/hello_world.py b/examples/hello_world.py
@@ -11,44 +11,37 @@
 The simplest PyPTO program: element-wise tensor addition.
 
 Concepts introduced:
-  - @pl.program / @pl.function decorators
-  - InCore function: load tiles from global memory, compute, store back
-  - Orchestration function: calls InCore kernels on full tensors
-  - pl.Out[] marks output tensor parameters
+  - @pl.jit decorator: specializes on torch tensor shape/dtype, then compiles and caches
+  - pl.incore() context: a single on-chip compute scope (load tiles, compute, store back)
+  - pl.Out[] marks output tensor parameters (in-place mutation)
   - Tensor (global memory) vs Tile (on-chip register) types
 
 Run:  python examples/hello_world.py
 Next: examples/kernels/01_elementwise.py
 """
 
 import pypto.language as pl
+import torch
+from pypto.runtime import RunConfig
 
 
-@pl.program
-class HelloWorldProgram:
-    @pl.function(type=pl.FunctionType.InCore)
-    def tile_add(
-        self,
-        a: pl.Tensor[[128, 128], pl.FP32],
-        b: pl.Tensor[[128, 128], pl.FP32],
-        c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
-    ) -> pl.Tensor[[128, 128], pl.FP32]:
-        tile_a: pl.Tile[[128, 128], pl.FP32] = pl.load(a, [0, 0], [128, 128])
+@pl.jit
+def tile_add(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):  # c: output, written in place
+    with pl.incore():  # one on-chip compute scope: load tiles, compute, store back
+        tile_a = pl.load(a, [0, 0], [128, 128])  # 128x128 tile; [0, 0] is presumably the offset
         tile_b = pl.load(b, [0, 0], [128, 128])
         tile_c = pl.add(tile_a, tile_b)
-        out_c = pl.store(tile_c, [0, 0], c)
-        return out_c
-
-    @pl.function(type=pl.FunctionType.Orchestration)
-    def orchestrator(
-        self,
-        a: pl.Tensor[[128, 128], pl.FP32],
-        b: pl.Tensor[[128, 128], pl.FP32],
-        out_c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
-    ) -> pl.Tensor[[128, 128], pl.FP32]:
-        out_c_ret = self.tile_add(a, b, out_c)
-        return out_c_ret
+        pl.store(tile_c, [0, 0], c)
+    return c
 
 
 if __name__ == "__main__":
-    print(HelloWorldProgram.as_python())
+    a = torch.full((128, 128), 2.0, dtype=torch.float32)
+    b = torch.full((128, 128), 3.0, dtype=torch.float32)
+    c = torch.zeros((128, 128), dtype=torch.float32)
+    tile_add(a, b, c, config=RunConfig())  # the kernel writes the sum into c in place
+    expected = a + b
+    # torch.testing.assert_close still runs under `python -O` (a bare assert is
+    # stripped there) and reports the mismatch details itself on failure.
+    torch.testing.assert_close(c, expected, rtol=1e-5, atol=1e-5)
+    print("OK")
diff --git a/examples/kernels/01_elementwise.py b/examples/kernels/01_elementwise.py
@@ -10,136 +10,88 @@
 """
 Tile element-wise operations: add and multiply.
 
-Programs:
-  TileAddProgram — c = a + b  (128x128)
-  TileMulProgram — c = a * b  (128x128)
+Kernels:
+  tile_add_128 — c = a + b  (128x128)
+  tile_mul_128 — c = a * b  (128x128)
+  tile_add_64  — c = a + b  (64x64)
+  tile_mul_64  — c = a * b  (64x64)
 
 Concepts introduced:
   - pl.mul for element-wise multiplication
-  - Multiple programs in one file
+  - Multiple @pl.jit kernels in one file
 
 Run:  python examples/kernels/01_elementwise.py
 Next: examples/kernels/02_fused_ops.py
 """
 
 import pypto.language as pl
+import torch
+from pypto.runtime import RunConfig
 
 
-@pl.program
-class TileAddProgram:
-    @pl.function(type=pl.FunctionType.InCore)
-    def tile_add(
-        self,
-        a: pl.Tensor[[128, 128], pl.FP32],
-        b: pl.Tensor[[128, 128], pl.FP32],
-        c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
-    ) -> pl.Tensor[[128, 128], pl.FP32]:
+@pl.jit
+def tile_add_128(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
+    with pl.incore():  # element-wise addition on a 128x128 tile: c = a + b
         tile_a = pl.load(a, [0, 0], [128, 128])
         tile_b = pl.load(b, [0, 0], [128, 128])
         tile_c = pl.add(tile_a, tile_b)
-        out_c = pl.store(tile_c, [0, 0], c)
-        return out_c
-
-    @pl.function(type=pl.FunctionType.Orchestration)
-    def orchestrator(
-        self,
-        a: pl.Tensor[[128, 128], pl.FP32],
-        b: pl.Tensor[[128, 128], pl.FP32],
-        out_c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
-    ) -> pl.Tensor[[128, 128], pl.FP32]:
-        out_c_ret = self.tile_add(a, b, out_c)
-        return out_c_ret
-
-
-@pl.program
-class TileMulProgram:
-    @pl.function(type=pl.FunctionType.InCore)
-    def tile_mul(
-        self,
-        a: pl.Tensor[[128, 128], pl.FP32],
-        b: pl.Tensor[[128, 128], pl.FP32],
-        c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
-    ) -> pl.Tensor[[128, 128], pl.FP32]:
+        pl.store(tile_c, [0, 0], c)
+    return c
+
+
+@pl.jit
+def tile_mul_128(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
+    with pl.incore():  # element-wise multiplication on a 128x128 tile: c = a * b
         tile_a = pl.load(a, [0, 0], [128, 128])
         tile_b = pl.load(b, [0, 0], [128, 128])
         tile_c = pl.mul(tile_a, tile_b)
-        out_c = pl.store(tile_c, [0, 0], c)
-        return out_c
-
-    @pl.function(type=pl.FunctionType.Orchestration)
-    def orchestrator(
-        self,
-        a: pl.Tensor[[128, 128], pl.FP32],
-        b: pl.Tensor[[128, 128], pl.FP32],
-        out_c: pl.Out[pl.Tensor[[128, 128], pl.FP32]],
-    ) -> pl.Tensor[[128, 128], pl.FP32]:
-        out_c_ret = self.tile_mul(a, b, out_c)
-        return out_c_ret
-
-
-@pl.program
-class TileAdd64Program:
-    """Element-wise addition on 64x64 tiles."""
+        pl.store(tile_c, [0, 0], c)
+    return c
 
-    @pl.function(type=pl.FunctionType.InCore)
-    def tile_add(
-        self,
-        a: pl.Tensor[[64, 64], pl.FP32],
-        b: pl.Tensor[[64, 64], pl.FP32],
-        c: pl.Out[pl.Tensor[[64, 64], pl.FP32]],
-    ) -> pl.Tensor[[64, 64], pl.FP32]:
+
+@pl.jit
+def tile_add_64(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
+    """Element-wise addition on 64x64 tiles."""
+    with pl.incore():
         tile_a = pl.load(a, [0, 0], [64, 64])
         tile_b = pl.load(b, [0, 0], [64, 64])
         tile_c = pl.add(tile_a, tile_b)
-        out_c = pl.store(tile_c, [0, 0], c)
-        return out_c
-
-    @pl.function(type=pl.FunctionType.Orchestration)
-    def orchestrator(
-        self,
-        a: pl.Tensor[[64, 64], pl.FP32],
-        b: pl.Tensor[[64, 64], pl.FP32],
-        out_c: pl.Out[pl.Tensor[[64, 64], pl.FP32]],
-    ) -> pl.Tensor[[64, 64], pl.FP32]:
-        out_c_ret = self.tile_add(a, b, out_c)
-        return out_c_ret
-
-
-@pl.program
-class TileMul64Program:
-    """Element-wise multiplication on 64x64 tiles."""
+        pl.store(tile_c, [0, 0], c)
+    return c
 
-    @pl.function(type=pl.FunctionType.InCore)
-    def tile_mul(
-        self,
-        a: pl.Tensor[[64, 64], pl.FP32],
-        b: pl.Tensor[[64, 64], pl.FP32],
-        c: pl.Out[pl.Tensor[[64, 64], pl.FP32]],
-    ) -> pl.Tensor[[64, 64], pl.FP32]:
+
+@pl.jit
+def tile_mul_64(a: pl.Tensor, b: pl.Tensor, c: pl.Out[pl.Tensor]):
+    """Element-wise multiplication on 64x64 tiles."""
+    with pl.incore():
         tile_a = pl.load(a, [0, 0], [64, 64])
         tile_b = pl.load(b, [0, 0], [64, 64])
         tile_c = pl.mul(tile_a, tile_b)
-        out_c = pl.store(tile_c, [0, 0], c)
-        return out_c
+        pl.store(tile_c, [0, 0], c)
+    return c
+
 
-    @pl.function(type=pl.FunctionType.Orchestration)
-    def orchestrator(
-        self,
-        a: pl.Tensor[[64, 64], pl.FP32],
-        b: pl.Tensor[[64, 64], pl.FP32],
-        out_c: pl.Out[pl.Tensor[[64, 64], pl.FP32]],
-    ) -> pl.Tensor[[64, 64], pl.FP32]:
-        out_c_ret = self.tile_mul(a, b, out_c)
-        return out_c_ret
+if __name__ == "__main__":
+    cfg = RunConfig()
 
+    a128 = torch.full((128, 128), 2.0, dtype=torch.float32)
+    b128 = torch.full((128, 128), 3.0, dtype=torch.float32)
+    c128 = torch.zeros((128, 128), dtype=torch.float32)
+    tile_add_128(a128, b128, c128, config=cfg)  # result is written into c128
+    torch.testing.assert_close(c128, a128 + b128, rtol=1e-5, atol=1e-5)  # runs under -O too
 
-# Aliases for backward compatibility with tests that use size-suffixed names
-TileAdd128Program = TileAddProgram
-TileMul128Program = TileMulProgram
+    c128 = torch.zeros((128, 128), dtype=torch.float32)  # fresh output buffer
+    tile_mul_128(a128, b128, c128, config=cfg)
+    torch.testing.assert_close(c128, a128 * b128, rtol=1e-5, atol=1e-5)  # runs under -O too
 
+    a64 = torch.full((64, 64), 2.0, dtype=torch.float32)
+    b64 = torch.full((64, 64), 3.0, dtype=torch.float32)
+    c64 = torch.zeros((64, 64), dtype=torch.float32)
+    tile_add_64(a64, b64, c64, config=cfg)  # result is written into c64
+    torch.testing.assert_close(c64, a64 + b64, rtol=1e-5, atol=1e-5)  # runs under -O too
 
-if __name__ == "__main__":
-    print("=== TileAddProgram ===")
-    print(TileAddProgram.as_python())
-    print("\n=== TileMulProgram ===")
-    print(TileMulProgram.as_python())
+    c64 = torch.zeros((64, 64), dtype=torch.float32)  # fresh output buffer
+    tile_mul_64(a64, b64, c64, config=cfg)
+    torch.testing.assert_close(c64, a64 * b64, rtol=1e-5, atol=1e-5)  # runs under -O too
+
+    print("OK")