NVIDIA
diff --git a/‎README.tilecpp.md‎
Lines changed: 214 additions & 0 deletions b/‎README.tilecpp.md‎
Lines changed: 214 additions & 0 deletions
diff --git a/‎requirements.txt‎
Lines changed: 2 additions & 0 deletions b/‎requirements.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/tilegym/backend/dispatcher.py‎
Lines changed: 10 additions & 0 deletions b/‎src/tilegym/backend/dispatcher.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/tilegym/backend/selector.py‎
Lines changed: 116 additions & 1 deletion b/‎src/tilegym/backend/selector.py‎
Lines changed: 116 additions & 1 deletion
@@ -0,0 +1,214 @@
+<!--- SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. --->
+
+<!--- SPDX-License-Identifier: MIT --->
+
+
+
+
+# CUDA Tile C++ Backend
+
+The CUDA Tile C++ backend provides CUDA Tile C++ kernel implementations for TileGym operations.
+
+## Set up
+
+CUDA Tile C++ requires CUDA Toolkit 13.3 or newer. Install the latest CUDA Toolkit
+available for your platform, and make sure `nvcc` from that toolkit is on
+your `PATH`.
+
+```bash
+# Example: use a CUDA 13.3+ toolkit installed under /usr/local.
+export PATH=/usr/local/cuda-13.3/bin:$PATH
+export TILECPP_NVCC_PATH=/usr/local/cuda-13.3/bin/nvcc
+
+# Verify nvcc is visible.
+nvcc --version
+
+# Run a benchmark; the report table should include a CUDA Tile C++ (TileCpp) column.
+python tests/benchmark/bench_swiglu.py
+```
+
+## Environment Variables
+
+### Cache Configuration
+
+
+| Variable                | Default            | Description                                                                                                                 |
+| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------- |
+| `TILECPP_CACHE_DIR`     | `~/.cache/tilecpp` | Directory for caching compiled cubin files. If not set, uses `$XDG_CACHE_HOME/tilecpp` or falls back to `~/.cache/tilecpp`. |
+| `TILECPP_DISABLE_CACHE` | `0`                | Set to `1` to disable cubin caching and force recompilation on every run. Useful for development/debugging.                 |
+
+
+### Compiler Configuration
+
+
+| Variable            | Default | Description                                                                                                        |
+| ------------------- | ------- | ------------------------------------------------------------------------------------------------------------------ |
+| `TILECPP_NVCC_PATH` | `nvcc`  | Path to the nvcc compiler. Override if nvcc is not in your PATH or you want to use a specific version.             |
+| `TILECPP_SAVE_SRC`  | `0`     | Set to `1` to save generated CUDA source files alongside compiled cubins. Useful for debugging compilation issues. |
+
+
+### Autotuning
+
+
+| Variable                   | Default | Description                                                                                             |
+| -------------------------- | ------- | ------------------------------------------------------------------------------------------------------- |
+| `TILECPP_AUTOTUNE`         | `0`     | Set to `1` to enable autotuning for kernel configurations. When disabled, uses default configurations.  |
+| `TILECPP_VERBOSE_AUTOTUNE` | `0`     | Set to `1` to enable verbose output during autotuning, showing configuration trials and timing results. |
+
+
+## Adding a New CUDA Tile C++ Kernel to TileGym
+
+This section is only about integrating a CUDA Tile C++ kernel into TileGym.
+
+CUDA Tile C++ operators normally have two pieces:
+
+1. A CUDA Tile C++ kernel in `src/tilegym/ops/tilecpp/<op>.cuh`.
+2. A Python binding in `src/tilegym/ops/tilecpp/<op>.py` that compiles, launches,
+  and registers the kernel with TileGym.
+
+The `.cuh` file contains the `__tile_global__` kernel and any helper tile code.
+Prefer making compile-time constants template parameters when they affect tile
+shapes or loop structure. Keep the kernel signature limited to runtime pointers
+and scalar values that must be passed at launch time.
+
+```cpp
+#pragma once
+
+#include <cuda_tile.h>
+
+template<typename T, int BLOCK_M, int BLOCK_N>
+__tile_global__ void my_kernel(const T* __restrict__ x, T* __restrict__ y, int n) {
+    namespace ct = cuda::tiles;
+    // Tile code goes here.
+}
+```
+
+The Python file creates a `TileCppKernel`, requests a specialized kernel with
+`get_kernel(...)`, launches it with device pointers/scalars, and registers the
+public TileGym op for the `tilecpp` backend.
+
+```python
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from tilegym.backend import register_impl
+from tilegym.ops.tilecpp.utils._cuda_utils import TileCppKernel
+
+_my_kernel = TileCppKernel(
+    source_path=Path(__file__).parent / "my_op.cuh",
+    kernel_name="my_kernel",
+)
+
+
+def _launch_my_kernel(x: torch.Tensor, y: torch.Tensor, block_m: int, block_n: int):
+    kernel, _, _ = _my_kernel.get_kernel(
+        dtype=x.dtype,
+        template_params=[block_m, block_n],
+        signature="const {T}*, {T}*, int",
+    )
+    _my_kernel.launch(
+        grid=(1, 1, 1),
+        kernel=kernel,
+        args=[
+            np.uint64(x.data_ptr()),
+            np.uint64(y.data_ptr()),
+            np.int32(x.numel()),
+        ],
+    )
+
+
+@register_impl("my_op", backend="tilecpp")
+def my_op(x: torch.Tensor, **kwargs):
+    y = torch.empty_like(x)
+    _launch_my_kernel(x, y, block_m=128, block_n=128)
+    return y
+```
+
+Make sure `src/tilegym/ops/tilecpp/__init__.py` imports the new Python module
+when the backend is available. Add or extend tests under `tests/ops/` so the
+same operation can run with `backend="tilecpp"`, and add benchmark coverage
+under `tests/benchmark/` when there is a corresponding CuTile benchmark.
+
+## Compiling a `.cuh` Kernel Standalone with nvcc 13.3+
+
+You can compile a CUDA Tile C++ `.cuh` kernel directly with the CUDA 13.3+ toolkit
+without going through TileGym. This is useful for verifying a kernel builds
+cleanly outside the framework or sharing a self-contained reproducer.
+
+You need one extra `.cu` driver file that:
+
+1. Includes the `.cuh` so the template is in scope.
+2. Adds at least one **explicit template instantiation**.
+3. Provides host-side setup: device buffers, `cudaMemcpy`, the kernel
+  launch, and copy-back/cleanup.
+
+Example driver (`my_op_main.cu`) for the `my_kernel` template shown earlier:
+
+```cpp
+#include <cstdio>
+#include <vector>
+#include <cuda_runtime.h>
+
+#include "my_op.cuh"
+
+template __tile_global__ void my_kernel<float, 128, 128>(
+    const float* __restrict__, float* __restrict__, int);
+
+int main() {
+    constexpr int N = 1 << 20;
+    std::vector<float> h_x(N, 1.0f), h_y(N);
+
+    float *d_x = nullptr, *d_y = nullptr;
+    cudaMalloc(&d_x, N * sizeof(float));
+    cudaMalloc(&d_y, N * sizeof(float));
+    cudaMemcpy(d_x, h_x.data(), N * sizeof(float), cudaMemcpyHostToDevice);
+
+    /* Tile C++ kernels are tile-centric: the launch always uses
+     * block=1, and the kernel uses ct::bid() for parallelism.  The
+     * grid covers ceil(N / 128) tiles; 128 is the per-block tile
+     * extent assumed by this example. */
+    dim3 grid((N + 127) / 128), block(1);
+    my_kernel<float, 128, 128><<<grid, block>>>(d_x, d_y, N);
+    cudaDeviceSynchronize();
+
+    cudaMemcpy(h_y.data(), d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
+    printf("y[0] = %f\n", h_y[0]);
+
+    cudaFree(d_x); cudaFree(d_y);
+    return 0;
+}
+```
+
+Compile with nvcc 13.3 or newer. Set `-arch` to match your target GPU
+(`sm_80` and newer architectures are supported):
+
+```bash
+/usr/local/cuda-13.3/bin/nvcc \
+    -enable-tile \
+    -std=c++20 \
+    -arch=sm_100 \
+    -I src/tilegym/ops/tilecpp \
+    my_op_main.cu \
+    -o my_op_main
+
+./my_op_main
+```
+
+The `-enable-tile` flag turns on the Tile C++ extensions (`__tile_global__`,
+the `cuda::tiles` namespace, etc.); without it nvcc treats the `.cuh` as
+plain CUDA and rejects the tile syntax.
+
+The same toolchain can produce a cubin-only artifact (the form TileGym caches
+internally) by adding `-tilecubin --tile-only` and dropping the host driver
+code from the `.cu` file.
+
+## Cache Management
+
+The CUDA Tile C++ cache stores compiled cubin files to avoid recompilation. Cache files are named using a hash of the source code and template parameters.
+
+To clear the cache:
+
+```bash
+rm -rf ~/.cache/tilecpp/*
+```
@@ -13,4 +13,6 @@ numpy
 cuda-tile>=1.3.0  # Or use: pip install cuda-tile[tileiras] for bundled tileiras compiler
 filelock>=3.20.3  # CVE fix: GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
 pillow>=12.1.1  # CVE fix: GHSA-cfh3-3jmp-rvhc
+cuda-bindings>=13.2.0
+cuda-core>=0.7.0
 # nvidia-ml-py  # optional
@@ -17,6 +17,7 @@
 from tilegym.logger import get_logger
 
 from .selector import get_current_backend
+from .selector import is_tilecpp_available
 
 
 def _is_fallback_disabled() -> bool:
@@ -81,6 +82,15 @@ def wrapper(*args, **kwargs):
 
             logger.debug(f"[Backend Dispatch] Function: '{name}', Current backend: '{current_backend}'")
 
+            # Defer the tilecpp nvcc-version probe until the first actual
+            # dispatch to tilecpp. is_tilecpp_available() is cached, so the
+            # subprocess runs at most once per process. If unavailable, fall
+            # through to the registered fallback so the user gets a useful
+            # result (or a clear DISABLE_FALLBACK error below) instead of a
+            # tilecpp launch failure.
+            if current_backend == "tilecpp" and not is_tilecpp_available():
+                current_backend = fallback_backend
+
             # Try implementation from current backend
             if name in _REGISTRY and current_backend in _REGISTRY[name]:
                 logger.debug(f"[Backend Dispatch] Using '{current_backend}' implementation for '{name}'")
 
@@ -7,6 +7,7 @@
 Used to manage backend implementations of various operations in TileGym library
 """
 
+import functools
 import os
 from typing import Dict
 from typing import Set
@@ -31,13 +32,109 @@ def is_cutile_available():
     return CUTILE_AVAILABLE
 
 
+_TILECPP_MIN_NVCC = (13, 3)
+
+
+def _nvcc_version_supported() -> bool:
+    """Return True iff a usable nvcc with a supported CUDA version is found.
+
+    Resolution order: ``$TILECPP_NVCC_PATH`` first, then ``nvcc`` on PATH.
+    The release version reported by ``nvcc --version`` must be at least
+    ``_TILECPP_MIN_NVCC`` (currently 13.3).
+    """
+    import re
+    import shutil
+    import subprocess
+
+    nvcc = os.environ.get("TILECPP_NVCC_PATH", "nvcc")
+    if not os.path.isabs(nvcc):
+        resolved = shutil.which(nvcc)
+        if resolved is None:
+            return False
+        nvcc = resolved
+    elif not os.path.exists(nvcc):
+        return False
+
+    try:
+        result = subprocess.run([nvcc, "--version"], capture_output=True, text=True, timeout=10)
+    except (OSError, subprocess.SubprocessError):
+        return False
+    if result.returncode != 0:
+        return False
+    m = re.search(r"release\s+(\d+)\.(\d+)", result.stdout)
+    if not m:
+        return False
+    return (int(m.group(1)), int(m.group(2))) >= _TILECPP_MIN_NVCC
+
+
+def _check_tilecpp_module_importable():
+    """Cheap eager check: can we locate and import the TileCpp _cuda_utils module?
+
+    Does NOT spawn any subprocess, so it is safe to call at module load time
+    even on hosts without nvcc / without CUDA. Returns ``(ok, err)`` where
+    ``err`` is the captured exception when ``ok`` is False.
+    """
+    try:
+        from importlib import util as importlib_util
+        from pathlib import Path
+
+        _tilecpp_cuda_utils_path = Path(__file__).resolve().parents[1] / "ops" / "tilecpp" / "utils" / "_cuda_utils.py"
+        _tilecpp_cuda_utils_spec = importlib_util.spec_from_file_location(
+            "_tilegym_tilecpp_cuda_utils_availability",
+            _tilecpp_cuda_utils_path,
+        )
+        if _tilecpp_cuda_utils_spec is None or _tilecpp_cuda_utils_spec.loader is None:
+            raise ImportError("Failed to locate TileCpp _cuda_utils module")
+        _tilecpp_cuda_utils = importlib_util.module_from_spec(_tilecpp_cuda_utils_spec)
+        _tilecpp_cuda_utils_spec.loader.exec_module(_tilecpp_cuda_utils)
+        if not hasattr(_tilecpp_cuda_utils, "TileCppKernel"):
+            raise ImportError("TileCppKernel is not available")
+    except (ImportError, FileNotFoundError) as err:
+        return False, err
+    return True, None
+
+
+_TILECPP_MODULE_IMPORTABLE, _tilecpp_unavailable_err = _check_tilecpp_module_importable()
+
+
+@functools.cache
+def is_tilecpp_available() -> bool:
+    """Check if the CUDA Tile C++ backend is available.
+
+    The expensive ``nvcc --version`` subprocess is deferred to the first call
+    of this function (cached thereafter), so ``import tilegym`` on a non-CUDA
+    host has no subprocess overhead. The check is invoked by the dispatcher
+    on the first actual tilecpp dispatch. When the check fails, a
+    ``UserWarning`` is emitted at the caller's frame (``stacklevel=2``) and
+    suppressed for subsequent calls.
+    """
+    import warnings
+
+    if not _TILECPP_MODULE_IMPORTABLE:
+        warnings.warn(
+            f"TileCpp backend is not available: {_tilecpp_unavailable_err}",
+            stacklevel=2,
+        )
+        return False
+    if not _nvcc_version_supported():
+        warnings.warn(
+            f"TileCpp backend is not available: nvcc >= {_TILECPP_MIN_NVCC[0]}.{_TILECPP_MIN_NVCC[1]} "
+            "is required (set TILECPP_NVCC_PATH or install CUDA "
+            f"{_TILECPP_MIN_NVCC[0]}.{_TILECPP_MIN_NVCC[1]} or newer on PATH)",
+            stacklevel=2,
+        )
+        return False
+    return True
+
+
 _AVAILABLE_BACKENDS: Set[str] = set()
 _CURRENT_BACKENDS: str = "cutile"
 
 
 def _check_backends_availability() -> Dict[str, bool]:
     availability = {
         "cutile": is_cutile_available(),
+        "tilecpp": _TILECPP_MODULE_IMPORTABLE,
     }
     return availability
 
@@ -75,13 +172,31 @@ def set_backend(backend: str) -> None:
     global _CURRENT_BACKENDS
     if backend not in _AVAILABLE_BACKENDS:
         raise ValueError(f"Unknown backend: {backend}, available backends: {_AVAILABLE_BACKENDS}")
+    # tilecpp is in _AVAILABLE_BACKENDS based on a cheap module-importability
+    # check; verify the full runtime requirement (nvcc >= 13.3) here so callers
+    # opting in to tilecpp fail fast instead of silently falling back at dispatch.
+    if backend == "tilecpp" and not is_tilecpp_available():
+        raise ValueError(
+            f"Backend 'tilecpp' is not available on this system: nvcc >= "
+            f"{_TILECPP_MIN_NVCC[0]}.{_TILECPP_MIN_NVCC[1]} is required "
+            "(set TILECPP_NVCC_PATH or install CUDA "
+            f"{_TILECPP_MIN_NVCC[0]}.{_TILECPP_MIN_NVCC[1]} or newer on PATH)"
+        )
     _CURRENT_BACKENDS = backend
     logger.info(f"Set backend to {backend}")
 
 
 def is_backend_available(backend: str) -> bool:
     """check if the backend is available"""
-    return backend in _AVAILABLE_BACKENDS
+    if backend not in _AVAILABLE_BACKENDS:
+        return False
+    # tilecpp's entry in _AVAILABLE_BACKENDS reflects only the cheap module-
+    # importability check; the runtime nvcc>=13.3 requirement is verified
+    # lazily here (cached) so test gates like
+    # ``if is_backend_available("tilecpp"):`` skip on hosts without nvcc.
+    if backend == "tilecpp":
+        return is_tilecpp_available()
+    return True
 
 
 def assert_backend_available(backend: str) -> None: