From cdb3648eed81b66d035da00e3048a9416e44ef90 Mon Sep 17 00:00:00 2001 From: Hannah Li Date: Wed, 27 May 2026 01:36:40 +0800 Subject: [PATCH 1/3] Migrate skills to canonical 'skills/' path for nvskills-ci onboarding Changes: - Move 7 cuTile skill folders from .agents/skills/ to skills/. - Add .agents/skills and .claude/skills symlinks pointing to ../skills for backward compatibility. - Update LICENSE, CONTRIBUTING.md, and .github/scripts/check_spdx_headers.py to reference the new skills/ path. - Split skills/cutile-autotuning/SKILL.md: move API Reference, Step-by-Step Workflow, and Pitfall Checklist into new files under references/ to keep SKILL.md concise. Signed-off-by: Hannah Li --- .agents/skills | 1 + .agents/skills/cutile-autotuning/SKILL.md | 711 ------------------ .claude/skills | 2 +- .github/scripts/check_spdx_headers.py | 60 +- CONTRIBUTING.md | 7 +- LICENSE | 26 +- .../adding-cutile-kernel/SKILL.md | 0 .../converting-cutile-to-julia/SKILL.md | 0 .../examples/01_add/cutile_julia.jl | 0 .../examples/01_add/cutile_python.py | 0 .../examples/02_matmul/cutile_julia.jl | 0 .../examples/02_matmul/cutile_python.py | 0 .../examples/03_softmax/cutile_julia.jl | 0 .../examples/03_softmax/cutile_python.py | 0 .../references/api-mapping.md | 0 .../references/critical-rules.md | 0 .../references/debugging.md | 0 .../references/testing.md | 0 .../scripts/validate_cutile_jl.py | 0 .../translations/workflow.md | 0 .../converting-cutile-to-triton/SKILL.md | 0 .../examples/01_vector_add/cutile_kernel.py | 0 .../examples/01_vector_add/triton_kernel.py | 0 .../examples/02_softmax/cutile_kernel.py | 0 .../examples/02_softmax/triton_kernel.py | 0 .../examples/03_layernorm/cutile_kernel.py | 0 .../examples/03_layernorm/triton_kernel.py | 0 .../examples/04_matmul/cutile_kernel.py | 0 .../examples/04_matmul/triton_kernel.py | 0 .../examples/05_attention/cutile_kernel.py | 0 .../examples/05_attention/triton_kernel.py | 0 .../references/api-mapping.md | 0 .../references/debugging.md | 0 .../references/gotchas.md | 0 .../references/harness-integration.md | 0 .../references/optimization-strategy.md | 0 .../references/optimizing-reference.md | 0 .../references/performance-gotchas.md | 0 .../translations/advanced-patterns.md | 0 .../translations/file-structure.md | 0 .../translations/workflow.md | 0 skills/cutile-autotuning/SKILL.md | 240 ++++++ .../autotuned_launch.py | 0 .../01_rmsnorm_occupancy_only/fixed_launch.py | 0 .../02_matmul_full_search/autotuned_launch.py | 0 .../02_matmul_full_search/fixed_launch.py | 0 .../autotuned_launch.py | 0 .../fixed_launch.py | 0 .../references/api-reference.md | 179 +++++ .../references/hardware-constraints.md | 0 .../references/kernel-type-templates.md | 0 .../references/parameter-space-design.md | 0 .../cutile-autotuning/references/pitfalls.md | 116 +++ .../references/search-strategies.md | 0 .../cutile-autotuning/references/workflow.md | 202 +++++ .../skills => skills}/cutile-python/SKILL.md | 2 +- .../examples/convolution/README.md | 0 .../conv2d_with_bias_dilation_groups.py | 0 .../conv3d_with_bias_dilation_groups.py | 0 .../examples/convolution/conv_transpose_2d.py | 0 .../examples/convolution/conv_transpose_3d.py | 0 .../cutile-python/examples/matmul/README.md | 0 .../examples/matmul/matmul_4d_tensors.py | 0 .../matmul/matrix_vector_multiplication.py | 0 .../examples/matmul/split_k_gemm.py | 0 .../examples/normalization/README.md | 0 .../examples/normalization/group_norm.py | 0 .../cutile-python/examples/pooling/README.md | 0 .../examples/pooling/avgpool3d.py | 0 .../examples/pooling/maxpool3d.py | 0 .../cutile-python/examples/scan/README.md | 0 .../examples/scan/cumsum_cumprod_blocking.py | 0 .../examples/tilegym_and_examples_guide.md | 2 +- .../guidelines/01_implementation_lessons.md | 0 .../guidelines/02_code_generation_rules.md | 0 .../cutile-python/guidelines/03_concepts.md | 0 .../orchestration/analyzer_agent.md | 0 .../orchestration/composer_agent.md | 0 .../orchestration/kernel_agent.md | 0 .../cutile-python/orchestration/overview.md | 0 .../cutile-python/orchestration/workflow.md | 0 .../torch-learner/examples/lstm_trace.md | 0 .../references/1_pytorch_codebase_map.md | 0 .../references/2_dispatch_mechanism.md | 0 .../references/3_tracing_strategies.md | 0 .../references/4_language_layers.md | 0 .../references/5_well_known_ops.md | 0 .../torch-learner/tracing_workflow.md | 0 .../improve-cutile-kernel-perf/SKILL.md | 0 .../references/cutile-api-reference.md | 0 .../references/cutile-patterns-reference.md | 0 .../references/ir-dump-guide.md | 0 .../references/optimization-playbook.md | 0 .../references/perf-knobs-catalog.md | 0 .../references/performance-model.md | 0 .../SKILL.md | 0 .../references/auto-kernelize.md | 0 .../references/environment-setup.md | 0 .../references/kernel-integration.md | 0 .../references/workflow-diagram.png | Bin 100 files changed, 792 insertions(+), 756 deletions(-) create mode 120000 .agents/skills delete mode 100644 .agents/skills/cutile-autotuning/SKILL.md rename {.agents/skills => skills}/adding-cutile-kernel/SKILL.md (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/SKILL.md (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/examples/01_add/cutile_julia.jl (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/examples/01_add/cutile_python.py (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/examples/02_matmul/cutile_julia.jl (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/examples/02_matmul/cutile_python.py (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/examples/03_softmax/cutile_julia.jl (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/examples/03_softmax/cutile_python.py (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/references/api-mapping.md (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/references/critical-rules.md (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/references/debugging.md (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/references/testing.md (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/scripts/validate_cutile_jl.py (100%) rename {.agents/skills => skills}/converting-cutile-to-julia/translations/workflow.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/SKILL.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/01_vector_add/cutile_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/01_vector_add/triton_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/02_softmax/cutile_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/02_softmax/triton_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/03_layernorm/cutile_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/03_layernorm/triton_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/04_matmul/cutile_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/04_matmul/triton_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/05_attention/cutile_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/examples/05_attention/triton_kernel.py (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/references/api-mapping.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/references/debugging.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/references/gotchas.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/references/harness-integration.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/references/optimization-strategy.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/references/optimizing-reference.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/references/performance-gotchas.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/translations/advanced-patterns.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/translations/file-structure.md (100%) rename {.agents/skills => skills}/converting-cutile-to-triton/translations/workflow.md (100%) create mode 100644 skills/cutile-autotuning/SKILL.md rename {.agents/skills => skills}/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/autotuned_launch.py (100%) rename {.agents/skills => skills}/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/fixed_launch.py (100%) rename {.agents/skills => skills}/cutile-autotuning/assets/examples/02_matmul_full_search/autotuned_launch.py (100%) rename {.agents/skills => skills}/cutile-autotuning/assets/examples/02_matmul_full_search/fixed_launch.py (100%) rename {.agents/skills => skills}/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/autotuned_launch.py (100%) rename {.agents/skills => skills}/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/fixed_launch.py (100%) create mode 100644 skills/cutile-autotuning/references/api-reference.md rename {.agents/skills => skills}/cutile-autotuning/references/hardware-constraints.md (100%) rename {.agents/skills => skills}/cutile-autotuning/references/kernel-type-templates.md (100%) rename {.agents/skills => skills}/cutile-autotuning/references/parameter-space-design.md (100%) create mode 100644 skills/cutile-autotuning/references/pitfalls.md rename {.agents/skills => skills}/cutile-autotuning/references/search-strategies.md (100%) create mode 100644 skills/cutile-autotuning/references/workflow.md rename {.agents/skills => skills}/cutile-python/SKILL.md (98%) rename {.agents/skills => skills}/cutile-python/examples/convolution/README.md (100%) rename {.agents/skills => skills}/cutile-python/examples/convolution/conv2d_with_bias_dilation_groups.py (100%) rename {.agents/skills => skills}/cutile-python/examples/convolution/conv3d_with_bias_dilation_groups.py (100%) rename {.agents/skills => skills}/cutile-python/examples/convolution/conv_transpose_2d.py (100%) rename {.agents/skills => skills}/cutile-python/examples/convolution/conv_transpose_3d.py (100%) rename {.agents/skills => skills}/cutile-python/examples/matmul/README.md (100%) rename {.agents/skills => skills}/cutile-python/examples/matmul/matmul_4d_tensors.py (100%) rename {.agents/skills => skills}/cutile-python/examples/matmul/matrix_vector_multiplication.py (100%) rename {.agents/skills => skills}/cutile-python/examples/matmul/split_k_gemm.py (100%) rename {.agents/skills => skills}/cutile-python/examples/normalization/README.md (100%) rename {.agents/skills => skills}/cutile-python/examples/normalization/group_norm.py (100%) rename {.agents/skills => skills}/cutile-python/examples/pooling/README.md (100%) rename {.agents/skills => skills}/cutile-python/examples/pooling/avgpool3d.py (100%) rename {.agents/skills => skills}/cutile-python/examples/pooling/maxpool3d.py (100%) rename {.agents/skills => skills}/cutile-python/examples/scan/README.md (100%) rename {.agents/skills => skills}/cutile-python/examples/scan/cumsum_cumprod_blocking.py (100%) rename {.agents/skills => skills}/cutile-python/examples/tilegym_and_examples_guide.md (92%) rename {.agents/skills => skills}/cutile-python/guidelines/01_implementation_lessons.md (100%) rename {.agents/skills => skills}/cutile-python/guidelines/02_code_generation_rules.md (100%) rename {.agents/skills => skills}/cutile-python/guidelines/03_concepts.md (100%) rename {.agents/skills => skills}/cutile-python/orchestration/analyzer_agent.md (100%) rename {.agents/skills => skills}/cutile-python/orchestration/composer_agent.md (100%) rename {.agents/skills => skills}/cutile-python/orchestration/kernel_agent.md (100%) rename {.agents/skills => skills}/cutile-python/orchestration/overview.md (100%) rename {.agents/skills => skills}/cutile-python/orchestration/workflow.md (100%) rename {.agents/skills => skills}/cutile-python/torch-learner/examples/lstm_trace.md (100%) rename {.agents/skills => skills}/cutile-python/torch-learner/references/1_pytorch_codebase_map.md (100%) rename {.agents/skills => skills}/cutile-python/torch-learner/references/2_dispatch_mechanism.md (100%) rename {.agents/skills => skills}/cutile-python/torch-learner/references/3_tracing_strategies.md (100%) rename {.agents/skills => skills}/cutile-python/torch-learner/references/4_language_layers.md (100%) rename {.agents/skills => skills}/cutile-python/torch-learner/references/5_well_known_ops.md (100%) rename {.agents/skills => skills}/cutile-python/torch-learner/tracing_workflow.md (100%) rename {.agents/skills => skills}/improve-cutile-kernel-perf/SKILL.md (100%) rename {.agents/skills => skills}/improve-cutile-kernel-perf/references/cutile-api-reference.md (100%) rename {.agents/skills => skills}/improve-cutile-kernel-perf/references/cutile-patterns-reference.md (100%) rename {.agents/skills => skills}/improve-cutile-kernel-perf/references/ir-dump-guide.md (100%) rename {.agents/skills => skills}/improve-cutile-kernel-perf/references/optimization-playbook.md (100%) rename {.agents/skills => skills}/improve-cutile-kernel-perf/references/perf-knobs-catalog.md (100%) rename {.agents/skills => skills}/improve-cutile-kernel-perf/references/performance-model.md (100%) rename {.agents/skills => skills}/monkey-patch-kernels-to-transformers/SKILL.md (100%) rename {.agents/skills => skills}/monkey-patch-kernels-to-transformers/references/auto-kernelize.md (100%) rename {.agents/skills => skills}/monkey-patch-kernels-to-transformers/references/environment-setup.md (100%) rename {.agents/skills => skills}/monkey-patch-kernels-to-transformers/references/kernel-integration.md (100%) rename {.agents/skills => skills}/monkey-patch-kernels-to-transformers/references/workflow-diagram.png (100%) diff --git a/.agents/skills b/.agents/skills new file mode 120000 index 00000000..42c5394a --- /dev/null +++ b/.agents/skills @@ -0,0 +1 @@ +../skills \ No newline at end of file diff --git a/.agents/skills/cutile-autotuning/SKILL.md b/.agents/skills/cutile-autotuning/SKILL.md deleted file mode 100644 index 69ed37ee..00000000 --- a/.agents/skills/cutile-autotuning/SKILL.md +++ /dev/null @@ -1,711 +0,0 @@ ---- -name: cutile-autotuning -description: "Use when adding, modifying, optimizing, or debugging CuTile autotuning code. Trigger signals: `exhaustive_search` / `replace_hints` / `hints_fn` / `cuda.tile.tune` in code, `autotune` in filenames, or correctness/performance issues in autotuned CuTile kernels. Covers: tune-once/cache/launch pattern, per-architecture configs (sm80–sm120), parameter space design (tile sizes, occupancy, num_ctas), and 7 common pitfalls with solutions." -license: CC-BY-4.0 AND Apache-2.0 ---- - -# CuTile Autotuning - -Add autotuning to CuTile kernels using the `exhaustive_search` API with tune-once/cache/direct-launch pattern. - -## Instructions - -Follow the decision tree to classify the kernel, design a search space, implement the tune-once/cache/launch pattern, and validate performance. - -1. **Classify** — use the Decision Tree to determine search dimensions (occupancy-only vs full tile search) -2. **Design search space** — select the matching template from `references/kernel-type-templates.md`; prune to ≤ 30 configs in the final code via arch filters (directed exploration probes may temporarily exceed this — see Design Philosophy) -3. **Implement** — add `exhaustive_search` + cache + `ct.launch` following the Step-by-Step Workflow; handle in-place writes with split-buffer if needed -4. **Test** — run correctness with autotune enabled and with `DISABLE_AUTOTUNE=1` -5. **Validate** — A/B benchmark against fixed best-known config; see `references/search-strategies.md` -6. **Shrink** — prune dead-weight configs that never win, targeting ≤ 8 configs per architecture to minimize compilation cost (Step 10) - -## Task Router — Jump to What You Need - -| What are you trying to do? | Go to | -|---|---| -| Add autotune to a new kernel (most common) | Quick Reference below → Workflow: Adding Autotune → `references/kernel-type-templates.md` (pick by kernel type: T1=elementwise, T2=in-place, T3=matmul, T4=persistent, T5=FMHA, T6=FP8, T7=grouped GEMM, T8=varlen attention, T9=dual-GEMM fusion) | -| Debug: data corruption / wrong results after first run | Pitfall #1 (In-Place Kernel) | -| Debug: autotune taking 5+ minutes | Pitfall #2 (Compilation Timeout) | -| Debug: search space generator returning zero configs | Pitfall #5 first; also check arch filters, size guards, and `num_ctas` constraints | -| Optimize an existing autotune config | Workflow: Optimizing an Existing Config | - -## Quick Reference — Occupancy-Only Autotune (Tune-Once/Cache/Launch) - -Most CuTile kernels (elementwise, reduction, LayerNorm) need only occupancy tuning. Copy this pattern: - -```python -from types import SimpleNamespace -from cuda.tile.tune import exhaustive_search -import cuda.tile as ct -import torch - -def _my_autotune_configs(): - for occ in [1, 2, 4, 8]: - yield SimpleNamespace(occupancy=occ) - -# Module-level cache: tune once, launch fast forever after -_autotune_cache = {} - -def my_op(x, output): - stream = torch.cuda.current_stream() - NUM_SM = torch.cuda.get_device_properties(x.device).multi_processor_count - - # Cache key: anything that affects optimal config (use str() for device) - cache_key = (x.shape, x.dtype, str(x.device)) - - if cache_key not in _autotune_cache: - configs = list(_my_autotune_configs()) - result = exhaustive_search( - configs, - stream, - grid_fn=lambda cfg: (min(NUM_SM * cfg.occupancy, M), 1, 1), - kernel=my_kernel, - args_fn=lambda cfg: (x, output, ...), - hints_fn=lambda cfg: {"occupancy": cfg.occupancy}, - ) - best_cfg = result.best.config - tuned_kernel = my_kernel.replace_hints(occupancy=best_cfg.occupancy) - _autotune_cache[cache_key] = (best_cfg, tuned_kernel) # cache BOTH - - cfg, tuned_kernel = _autotune_cache[cache_key] - grid = (min(NUM_SM * cfg.occupancy, M), 1, 1) - ct.launch(stream, grid, tuned_kernel, (x, output, ...)) -``` - -Key rules: -- **Tune once, cache, launch directly** — `exhaustive_search` runs only on first call per shape; subsequent calls use cached config + `ct.launch` with zero overhead -- For in-place kernels use split-buffer during search (separate input/output tensors) -- Keep ≤ 30 configs in final code (see Design Philosophy for temporary directed probes) -- `exhaustive_search` requires a `Sequence` (list/tuple) — convert generators with `list()` -- **Search space must include the original fixed config** — this guarantees autotuning never makes performance worse - -**When to use this pattern**: Kernel has fixed block size (not tile-size tunable). Includes: elementwise (SwiGLU, GeGLU), reduction (RMSNorm, LayerNorm), RoPE, and persistent kernels with heuristic block sizes (grouped GEMM). - -For complex kernels (matmul with tile sizes, FMHA, FP8 with num_ctas), read the full guide below + [`kernel-type-templates.md`](references/kernel-type-templates.md). - -> **⚠️ Three pitfalls catch almost everyone — check before submitting:** -> - **`replace_hints` on hot path?** → Cache BOTH config AND kernel object from `exhaustive_search`. Calling `replace_hints()` every invocation recompiles (100–500× slower) → Pitfall #7 -> - **In-place kernel** (writes back to input tensor)? → MUST use split-buffer pattern during search → Pitfall #1 -> - **Search space empty?** → Check arch filters and `num_ctas` constraints → Pitfall #5 - -> **Minimum coverage**: On sm100+, FMHA/matmul/varlen search spaces must include both `num_ctas=1` and `num_ctas=2`. For core dimensions (tile sizes, occupancy), keep at least 2 distinct values even if unsure which is better — let `exhaustive_search` decide. - -> **When to stop tuning**: A mean speedup in [0.98, 1.02] means your *current* search space isn't helping — but doesn't mean no config will help. Before stopping, check whether you've covered the key dimensions for this kernel type (consult `references/kernel-type-templates.md`). If the search space already covers the template's recommended dimensions and the best result is still noise-floor, then stop — further micro-adjustments won't help. If key dimensions are missing (e.g., never tried `num_ctas=2` for a dual-GEMM kernel), expand the search space rather than giving up. -> -> Once correctness tests pass and the autotuned kernel shows speedup over the fixed-config baseline, **stop — do not re-run to "confirm".** GPU kernel timing fluctuates ±5–10 % between invocations due to clock scaling and OS scheduling; a subsequent timing dip does not mean your code is wrong. -> -> To improve speedup, only modify the autotune search space (configs, tile sizes, occupancy, num_ctas). Do not modify other code (Python wrapper, stream management, etc.) to chase speedup — kernel performance is determined by the config selection, not by host-side code. - -## Reading Guide - -- **Occupancy-only kernels** (elementwise, reduction, persistent with fixed block sizes): Quick Reference + Pitfall Checklist is sufficient — skip `references/` docs. For in-place kernels, also read Pitfall #1. -- **Complex kernels** (matmul with tunable tile sizes, FMHA, FP8 with num_ctas): Quick Reference → Decision Tree → API Reference → Step-by-Step Workflow → relevant `references/` docs. - -**5-step summary**: Classify kernel → Design search space ([`parameter-space-design.md`](references/parameter-space-design.md)) → Implement using template ([`kernel-type-templates.md`](references/kernel-type-templates.md)) → Validate with A/B test → Check Pitfall Checklist. - -**Reading references**: Read only the reference relevant to your kernel type — e.g., for FMHA, read the Template 5 section in `references/kernel-type-templates.md`; for hardware constraints, read only the target architecture's section. Avoid reading all references end-to-end when a targeted lookup suffices. - -## Design Philosophy - -**Build a small, precise search space bottom-up — not a large space trimmed down.** CuTile compilation is much heavier than Triton (~0.5-1s per config), so the **final code** should contain ≤ 30 configs. The approach is: classify the kernel type first, then construct only the relevant configs for that type and architecture. - -**Directed exploration during development**: If the initial template configs yield speedup < 1.0, you may run a *temporary* larger probe (30–100 configs) via `bash + python3 -c` to identify which dimensions matter — but this probe must be **directional**, not a blind cartesian product. Use the kernel type classification to decide *which* dimensions to vary (e.g. for dual-GEMM, probe `num_ctas × occupancy` while fixing tile sizes; for FMHA, probe `TILE_M × num_ctas` while fixing TILE_N). Once the probe identifies the winning region, lock the final code's search space to ≤ 8 top candidates. Do NOT write the large probe into the source file — it is a one-shot diagnostic tool. - -## Decision Tree: What Search Dimensions Does This Kernel Need? - -All kernels should have autotuning added. The question is not *whether* to autotune, but *what dimensions* to search: - -``` -What type of kernel is this? -├── Compute-bound (matmul, GEMM, FMHA) → Does it have multiple tunable dimensions (tile sizes)? -│ ├── YES → Is it a fused multi-GEMM kernel (dual-GEMM, e.g. Linear+GLUAct)? -│ │ ├── YES → Template 9: low occupancy (1–2), conservative tiles (2× SHMEM/register pressure) -│ │ └── NO → Full search: TILE_M × TILE_N × (TILE_K) × occupancy × num_ctas -│ │ (see matmul/FMHA templates in kernel-type-templates.md) -│ └── NO → Occupancy-only search: [1, 2, 4, 8] -│ (see Quick Reference above) -├── Balanced (LayerNorm, reduction + compute) → -│ Occupancy-only search: [1, 2, 4, 8] -│ Expected benefit: 2-15% -└── Memory-bound (CE Loss, pure elementwise) → - Occupancy-only search: [1, 2, 4, 8] - Expected benefit: 0-15% (varies by kernel; zero-cost after tuning) -``` - -**Why memory-bound kernels only search occupancy (not num_ctas or tile sizes)**: -- **`num_ctas` has zero benefit**: `num_ctas > 1` enables TMA multicast, where multiple CTAs share tile data in shared memory (e.g., matmul A/B tiles reused across CTAs). Memory-bound kernels use per-element `ct.gather`/`ct.scatter` with no tile reuse — multi-CTA cooperation adds overhead with no data sharing benefit. -- **Tile sizes are pre-determined**: BLOCK_SIZE for memory-bound kernels is determined by offline sweep (e.g., 1024 is globally optimal on B200 across [256, 512, 1024, 2048, 4096, 8192]). This is a constant, not a runtime tunable. -- **Occupancy is the only effective knob**: Higher occupancy lets the GPU hide memory latency by switching to another CTA while one is stalled on a memory request. - -> **Evidence — CE Loss experiment**: A 12-config search (occupancy × num_ctas) on Cross-Entropy Loss yielded only 2.5% gain (0.79x → 0.81x vs Triton). The `num_ctas` dimension contributed nothing; the result was reverted because compilation cost outweighed the marginal benefit. Occupancy-only (4 configs) achieves the same result at 3x less compilation time. - -**Note on memory-bound kernels**: Adding occupancy-only autotune is always worthwhile because: -- The tune-once/cache/launch pattern has zero runtime overhead after the first call -- The search space is tiny (4 configs, ~2-4s compilation) -- Even small improvements have value at scale - -## Occupancy Selection Guide - -Occupancy controls how many CTAs run concurrently per SM. Use this as a starting point when designing the occupancy search space: - -| Occupancy Range | Best For | Example Kernels | -|-----------------|----------|-----------------| -| 1–4 | Compute-bound (heavy math) | Complex transforms, matmul | -| 4–8 | Balanced (GEMM, TMA) | Matrix multiply, FMHA | -| 8–16 | Memory-bound (reductions) | Softmax, LayerNorm | -| 16–32 | Very light (copies, casts) | Type conversions, elementwise | - -Use these ranges to seed your initial search space. For occupancy-only kernels, `[1, 2, 4, 8]` covers most cases — see Quick Reference above. - -## exhaustive_search API Reference - -> **⚠️ Deprecated API**: `cuda.tile_experimental.autotune_launch()` (aka `ct_experimental.autotune_launch`) is deprecated and should NOT be used. It combines search + launch in one call with random sampling, which produces less reproducible results and worse config selection compared to `exhaustive_search`. Always use `cuda.tile.tune.exhaustive_search` (the current API below) with explicit caching and `ct.launch`. - -### Current API (`cuda.tile.tune`) - -```python -from cuda.tile.tune import exhaustive_search, TuningResult - -result: TuningResult = exhaustive_search( - search_space, # Sequence[T] — list or tuple of configs (NOT a generator) - stream, # torch.cuda.current_stream() - grid_fn, # callable(cfg) → tuple[int, ...] - kernel, # @ct.kernel decorated function - args_fn, # callable(cfg) → tuple of kernel args - hints_fn=None, # callable(cfg) → {"occupancy": int, "num_ctas": int} - *, - quiet=False # suppress output -) -``` - -### TuningResult - -```python -@dataclass -class TuningResult[T]: - best: Measurement # best config + timing (mean_us, error_margin_us, num_samples) - successes: Sequence[Measurement] # all successful configs (sorted by performance) - failures: Sequence[tuple[T, str, str]] # (config, exception_type, message) -``` - -Key properties: -- **Exhaustive**: evaluates ALL configs in order — no random sampling, no skipped configs -- **Search only**: does not perform the final production launch — it executes trial runs internally for benchmarking, but you call `ct.launch` separately for the actual production invocation -- **No built-in cache**: you manage caching explicitly (see tune-once/cache/launch pattern) -- **Deterministic**: same search space always produces the same evaluation order - -### Tune-Once / Cache / Launch Pattern - -This is the **recommended pattern** for all autotuned kernels. It ensures: -- First call: runs `exhaustive_search` to find the best config (~2-30s depending on space size) -- Subsequent calls: uses cached config with `ct.launch` — zero overhead (identical to a fixed `ct.launch`) - -```python -_cache = {} - -def run_kernel_autotuned(x, ...): - stream = torch.cuda.current_stream() - cache_key = (x.shape, x.dtype, str(x.device)) - - if cache_key not in _cache: - configs = list(_my_autotune_configs()) - result = exhaustive_search( - configs, stream, - grid_fn=lambda cfg: ..., - kernel=my_kernel, - args_fn=lambda cfg: ..., - hints_fn=lambda cfg: {"occupancy": cfg.occupancy}, - ) - best_cfg = result.best.config - tuned_kernel = my_kernel.replace_hints(occupancy=best_cfg.occupancy) - _cache[cache_key] = (best_cfg, tuned_kernel) # cache BOTH config and compiled kernel - - cfg, tuned_kernel = _cache[cache_key] - grid = compute_grid(cfg) - ct.launch(stream, grid, tuned_kernel, (x, ...)) -``` - -**Why this pattern matters**: The `ct.launch` call in the fast path is identical to what you'd write for a fixed-config kernel. There is zero per-call overhead — no lock, no hash lookup, no lambda invocation. The only cost is the Python dict lookup for `_cache[cache_key]`. - -> **⚠️ Critical: always cache the tuned kernel object, not just the config.** `replace_hints()` returns a **new** kernel object with its own independent JIT cache. Calling it on every invocation triggers recompilation each time, degrading performance by 100–500×. Call `replace_hints()` once after `exhaustive_search`, store the returned kernel in the cache alongside the config, and reuse it directly on the fast path. See Pitfall #7. - -### replace_hints - -After finding the best config, use `kernel.replace_hints()` to create a kernel variant with the optimal hints: - -```python -# For occupancy-only: -tuned_kernel = my_kernel.replace_hints(occupancy=cfg.occupancy) - -# For occupancy + num_ctas: -tuned_kernel = my_kernel.replace_hints(occupancy=cfg.occupancy, num_ctas=cfg.num_ctas) -``` - -`replace_hints` accepts only `occupancy` and `num_ctas` — these are the only compiler hints controllable via the autotune API. - -**`ByTarget` wrapping for cross-architecture portability**: When creating tuned kernel variants via `ct.kernel()`, prefer wrapping hint values in `ct.ByTarget` for portability across GPU architectures: - -```python -# Preferred: explicit architecture targeting (portable) -tuned_kernel = ct.kernel( - my_kernel._pyfunc, - occupancy=ct.ByTarget(sm_100=best_cfg.occupancy), - num_ctas=ct.ByTarget(sm_100=best_cfg.num_ctas, default=1), -) - -# Also acceptable: plain integers (when targeting a single architecture) -tuned_kernel = ct.kernel(my_kernel._pyfunc, occupancy=best_cfg.occupancy) -``` - -When targeting only the current GPU (the common case in autotuning), plain integers work fine. Use `ByTarget` when the code may run on multiple architectures or when following production conventions (TileGym production code consistently uses `ByTarget`). - -### Kernel Hints - -CuTile kernel performance is controlled by two compile-time hints: - -- **`occupancy`**: Number of CTAs per SM. Higher occupancy = more parallelism but less shared memory per CTA. -- **`num_ctas`**: Number of CTAs in a CGA (Cooperative Group Array). Used for multi-CTA cooperation (e.g., TMA multicast). Only supported on sm90+. - -Three ways to set hints: - -```python -# 1. Fixed value in decorator (no autotune needed) -@ct.kernel(occupancy=2, num_ctas=1) -def my_kernel(...): ... - -# 2. Architecture-specific fixed value (no autotune needed) -@ct.kernel(num_ctas=ct.ByTarget(sm_100=2, sm_120=1, default=1)) -def my_kernel(...): ... - -# 3. Runtime autotune via exhaustive_search + replace_hints -# IMPORTANT: Remove fixed hints from decorator first! -@ct.kernel -def my_kernel(...): ... - -# Then in the host wrapper: -tuned_kernel = my_kernel.replace_hints(occupancy=best_occ, num_ctas=best_ctas) -ct.launch(stream, grid, tuned_kernel, args) -``` - -**Important**: `replace_hints` correctly overrides decorator hints (it uses `dataclasses.replace()` internally). However, if you forget to call `replace_hints`, the decorator's fixed values are used instead of the autotuned values. To avoid this confusion, always remove fixed hints from the `@ct.kernel(...)` decorator before adding autotuning — this makes it explicit that hints come only from the autotune path. - -### search_space Design - -The search space is a list of `SimpleNamespace` objects. Each namespace holds config fields that `grid_fn`, `args_fn`, and `hints_fn` can read. - -```python -from types import SimpleNamespace - -# Occupancy-only (elementwise kernels) -def autotune_configs(): - for occ in [1, 2, 4, 8]: - yield SimpleNamespace(occupancy=occ) - -# Full matmul search space — see parameter-space-design.md for complete per-architecture configs -# Pattern: yield SimpleNamespace(TILE_SIZE_M=..., TILE_SIZE_N=..., TILE_SIZE_K=..., num_ctas=..., occupancy=...) -``` - -**Note**: `exhaustive_search` requires a `Sequence` (list/tuple), not a generator. Always convert with `list()`: -```python -configs = list(autotune_configs()) -result = exhaustive_search(configs, ...) -``` - -### grid_fn Patterns - -```python -from math import ceil - -# Pattern A: Simple tile coverage (matmul, elementwise) -grid_fn=lambda cfg: (ceil(M / cfg.TILE_SIZE_M) * ceil(N / cfg.TILE_SIZE_N), 1, 1) - -# Pattern B: Persistent matmul (static_persistent_matmul_kernel) -NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count -grid_fn=lambda cfg: ( - min(NUM_SMS // cfg.num_ctas, ceil(M / cfg.TILE_M) * ceil(N / cfg.TILE_N)) * cfg.occupancy, - 1, 1, -) - -# Pattern C: 2D grid (FMHA — one dim for seq tiles, one for batch*heads) -grid_fn=lambda cfg: (ceil(q_len / cfg.TILE_M), batch_size * num_heads, 1) - -# Pattern D: 1D elementwise (cdiv = math.ceil(a/b), from ct_ops.py) -grid_fn=lambda cfg: (cdiv(n_elements, BLOCK_SIZE),) - -# Pattern E: Grouped GEMM persistent (grid fixed at NUM_SMS, occupancy via hints_fn only) -grid_fn=lambda cfg: (NUM_SMS, 1, 1) -``` - -## Step-by-Step Workflow - -### Adding Autotune to a New Kernel - -1. **Classify the kernel** using the decision tree above. - - *VERIFY*: You know whether this is occupancy-only or requires tile-size tuning. - -2. **Remove hardcoded hints from decorator** (strongly recommended): If the kernel currently has hardcoded hints in its decorator (e.g. `@ct.kernel(occupancy=2, num_ctas=1)`), **remove those fixed hints** and change to bare `@ct.kernel` before adding autotuning. While `replace_hints` does correctly override decorator values at runtime, leaving them creates a silent fallback trap: if any code path (e.g., `DISABLE_AUTOTUNE`, error handling, or a future refactor) skips `replace_hints`, the decorator's fixed hints are used instead of the autotuned values — and this produces no error, just silently worse performance. Removing them makes the failure mode explicit (missing hints → compiler defaults) rather than silent (wrong fixed hints used). - - *VERIFY*: The `@ct.kernel` decorator has no `occupancy=` or `num_ctas=` arguments before proceeding. Use bare `@ct.kernel` instead. - -3. **Check for in-place writes**: If the kernel modifies input tensors in-place, you MUST use the split-buffer pattern during `exhaustive_search` — see Pitfall #1. - - *VERIFY*: Either the kernel is not in-place, or you have added a split-buffer scratch tensor for the search phase. - -4. **Select the template** from [`kernel-type-templates.md`](references/kernel-type-templates.md) based on kernel type. - -5. **Design the search space** following [`parameter-space-design.md`](references/parameter-space-design.md): - - **Start from reference configs**, not from scratch. Clone configs from existing production kernels of the same type (e.g., `ops/cutile/matmul.py` for GEMM) and adapt. For GEMM-class kernels, `nvMatmulHeuristics` can suggest 8-16 high-quality candidates that reach 96-99% peak performance — see [`parameter-space-design.md`](references/parameter-space-design.md) for details. - - Detect the current GPU architecture with `torch.cuda.get_device_capability()`. - - **Target one architecture at a time.** Generate configs only for the detected arch. Do NOT add branches for other architectures — they cannot be tested on this machine and untested code paths are unreliable. If multi-arch support is needed later, add it in a separate pass on the appropriate hardware. - - **When modifying code that already has autotune configs**: see "Handling Existing Autotune Configs (Multi-Architecture)" below. The "do NOT add branches" rule means do not *invent new configs* for untested architectures — it does NOT mean remove existing configs that were previously validated. - - Identify tunable parameters (tile sizes, occupancy, num_ctas) - - **Ensure the search space includes the original fixed config** (or an equivalent). This guarantees that the autotuned result is at least as good as the original — no performance regression is possible. - - If the generated set exceeds 30, apply tile size filters and pruning rules to reduce it to ≤ 30 in the final code - - *VERIFY*: Total configs in final code ≤ 30 (CuTile compilation is heavy, >30 configs will timeout). Temporary directed probes during development (30–100 configs, run via `bash + python3 -c`) are allowed — see Design Philosophy. - -6. **Implement** the tune-once/cache/launch pattern: - - Define a `_cache` dict at module level - - Define a cache key that captures all parameters affecting optimal config (shapes, dtypes, device, any flags like `is_causal`). **⚠️ Use `str(x.device)` not `x.device`** in the cache key — `torch.device` objects are not reliably hashable and can cause `TypeError: unhashable type` at runtime. Always convert to string: `cache_key = (..., x.dtype, str(x.device))`. **Tip**: For GEMM-class kernels, round dimensions to the next power of 2 in the cache key (e.g., `cache_key = (next_pow2(M), next_pow2(N), next_pow2(K), dtype, str(device))`) to reduce unique key count and avoid re-tuning for similar shapes. - - Call `exhaustive_search(list(configs), ...)` only when cache misses - - Store `result.best.config` in cache - - Use `kernel.replace_hints(...)` to create the tuned kernel variant - - Use `ct.launch()` for the actual kernel invocation - - `grid_fn` correctly computes grid from config - - `args_fn` passes all kernel arguments including tile sizes as `ct.Constant[int]` - - `hints_fn` passes `occupancy` and/or `num_ctas` from config - - *VERIFY*: `exhaustive_search` receives a `list()` of configs, not a raw generator. - -7. **(Optional) Add DISABLE_AUTOTUNE support** for CI and profiling: check `os.environ.get("DISABLE_AUTOTUNE", "0") == "1"` — when set, skip `exhaustive_search` entirely and fall back to `ct.launch` with the first valid config. Useful for: - - CI determinism (autotune adds variable wall time) - - NCU profiling (prevents autotune trial runs from cluttering the trace — see Pitfall #4) - - Debugging (isolates kernel correctness from autotune behavior) - Skip this step if your task only requires adding autotuning and the project's tests don't check for `DISABLE_AUTOTUNE`. - -8. **Test**: Run correctness tests first (`pytest -k "test_op and cutile"`), then benchmark. - - *VERIFY*: Correctness passes with autotune enabled AND with `DISABLE_AUTOTUNE=1`. - -9. **Validate with A/B test**: Compare autotune version vs fixed best-known config. See [`search-strategies.md`](references/search-strategies.md) for methodology. - - *VERIFY*: Autotune version ≥ baseline (or within noise). If worse, check that the search space includes the original fixed config, and that `replace_hints` is being used correctly. - -10. **Shrink the search space** — reduce compilation cost without losing performance. - - Templates provide broad search spaces as a starting point (e.g., 9 configs for varlen attention). Not all configs contribute to finding the optimal one — on a given architecture and kernel shape, many large-tile or multi-CTA configs compile for seconds each but are never selected. The goal of this step is to *prune the dead weight* so the final committed code has 5–8 configs per architecture instead of 10–15. - - **Why this matters**: Each config in `exhaustive_search` requires a full JIT compilation + warmup + benchmark of the kernel. For complex kernels (FMHA, varlen attention), this costs 2–4 seconds *per config*. Cutting from 9 to 5 configs saves 8–16 seconds of one-time autotuning cost per unique shape, with zero performance loss. - - **Procedure**: - - 1. After Step 9 passes, you already have a working autotuned kernel with the full template search space. Now run the test on 2–3 representative shapes and observe which config wins for each shape. You can inspect this by temporarily adding a print inside the cache-miss block: - ```python - print(f"[autotune] shape={cache_key[:5]} best={result.best.config} " - f"time={result.best.time_ms:.3f}ms " - f"configs_tried={len(result.successes)}") - ``` - - 2. Identify which configs are *competitive* — within 5% of the best for at least one shape. Configs that are never within 5% of the best across any test shape are *dead weight*. - - 3. Remove dead-weight configs from the generator. Always keep: - - The original fixed config (safety net — guarantees no regression) - - The config(s) that won on each test shape - - Any config within 5% of a winner (may win on untested shapes) - - 4. Re-run the test to confirm speedup is unchanged after pruning. - - **Common dead-weight patterns** (prune these first): - - `TILE_M=256` configs for attention/varlen kernels where `S_qo` in the test shapes is ≤ 4096 and batch×heads is large — the grid is already saturated at TILE_M=128. - - `num_ctas=2` configs for kernels with irregular or small grids — multi-CTA parallelism requires enough CTAs to benefit from cooperative launch, which doesn't hold when `grid[0]` is small. - - `occupancy=4` or `occupancy=8` configs on sm100+ for compute-bound kernels — Blackwell typically prefers lower occupancy (1–2) with larger tiles. - - **Target**: ≤ 8 configs per architecture branch in the final code. This keeps the one-time tuning cost under 25 seconds even for the most complex kernels (FMHA, varlen attention). - - - *VERIFY*: Config count ≤ 8 per architecture. `speedup_over_fixed` unchanged after pruning. - -11. **(MANDATORY) Verify correctness and performance before finalizing.** - - The verification requirements depend on the task type. In ALL cases, start with the code-level sanity check, then apply the task-specific verification. - - --- - - **A. Code-level sanity check (ALL tasks — do this first)** - - Review your implementation for known performance anti-patterns. These checks catch *implementation bugs*, not algorithmic issues — they apply regardless of whether you are adding, modifying, or fixing autotune code. - - - `replace_hints` must be called *exactly once* per config and the returned kernel object cached (Pitfall #7). If `replace_hints` appears on the hot path (outside the `if cache_key not in` block), you have a recompilation bug that causes 100-500× slowdown. - - `exhaustive_search` must be inside the cache-miss block, not called on every kernel invocation. - - The fast path should only do: cache lookup → `ct.launch` with the cached tuned kernel. No JIT-triggering calls in between. - - The cache must store `(best_cfg, tuned_kernel)` together — not just `best_cfg` alone. - - --- - - **B. Task-specific verification** - - **B1. Adding or modifying autotune configs** (the original code is correct): - - - *Correctness*: autotuned kernel output matches the reference (e.g. `torch` or fixed-config kernel) within tolerance. - - *Performance*: autotuned kernel must be *at least as fast* as the original fixed-config kernel. If it is slower: - - Check that the search space includes the original fixed config (this guarantees no regression). - - Check if `replace_hints` is being called on every code path — revisit Step 2 (if any path skips `replace_hints`, the decorator's fixed hints are used instead of autotuned values). - - Expand search space if all configs perform similarly (see `references/parameter-space-design.md` → "Adapting Search Space"). - - **B2. Fixing a correctness bug** (the original code produces wrong results): - - - *Correctness is the primary goal*: the fixed kernel must produce correct results. Do NOT compare speedup against the broken original — a correct-but-slower kernel is always better than a fast-but-wrong one. - - *Perf sanity check*: after fixing, verify that the implementation is not catastrophically slow due to an implementation bug (e.g. Pitfall #7). Two ways to check: - 1. *Code review*: confirm the code-level sanity check (Section A above) passes — this catches the most common perf bugs. - 2. *Runtime check*: if possible, compare your fixed+autotuned kernel against a simple correct baseline (e.g. the equivalent `torch` operation, or the kernel launched with a single hardcoded config and no autotuning). Your autotuned version should not be slower than this naive baseline. Minor overhead from the fix itself (e.g. split-buffer allocation) is acceptable. - - --- - - *⚠️ Autotuning bugs (silent hint override, split-buffer omission, hot-path recompilation) are only caught at runtime — always verify by running the kernel, not just by reading the code.* - -### Handling Existing Autotune Configs (Multi-Architecture) - -When adding autotune to a kernel, the source code may already contain autotune configs from a previous pass on different hardware. There are three scenarios: - -**Scenario 1: No existing autotune code.** The source has no autotune at all — follow the standard "Adding Autotune to a New Kernel" workflow above. Generate configs for the current GPU architecture only. - -**Scenario 2: Existing autotune, but no config for the current architecture.** The source already has autotune with configs for other architecture(s) (e.g., sm103) but NOT for the current GPU (e.g., sm100). Steps: - -1. Detect the current architecture with `torch.cuda.get_device_capability()`. -2. Check whether the existing config generator already uses architecture-conditional branching (i.e., `if/elif` on device capability). - - **If yes** (conditional yield structure exists): Add a new `elif` branch for the current architecture. Preserve all existing branches **unchanged** — do not modify their config values. - - **If no** (flat configs, no architecture branching): Add an `if` branch for the current architecture with new configs, and keep the existing flat configs in the `else` block as the default fallback. This ensures that all other architectures continue to use the original configs unchanged — the code modification must not alter kernel behavior on any architecture other than the current one. -3. Design configs for the current architecture following the standard workflow (Steps 4–10 above). -4. Validate only the current architecture's configs (Step 11). Other branches are assumed correct since they were previously validated on their respective hardware. - -Example — adding sm100 to a generator that already has sm103 configs (conditional structure exists): - -```python -def _my_autotune_configs(): - gpu_capability = torch.cuda.get_device_capability() - - if gpu_capability == (10, 0): # sm100 (B200) - # NEW: configs for sm100 (added in this pass) - for occ in [1, 2, 4]: - yield SimpleNamespace(occupancy=occ, TILE_M=128, TILE_N=128) - elif gpu_capability == (10, 3): # sm103 (GB300) - # EXISTING: configs for sm103 (do NOT modify) - for occ in [2, 4, 8]: - yield SimpleNamespace(occupancy=occ, TILE_M=256, TILE_N=128) - else: - # Fallback for unknown architectures - yield SimpleNamespace(occupancy=2, TILE_M=128, TILE_N=128) -``` - -Example — adding current-arch configs to flat (non-branching) code: - -```python -# BEFORE: flat configs (no architecture branching) -def _my_autotune_configs(): - for occ in [2, 4, 8]: - yield SimpleNamespace(occupancy=occ, TILE_M=256, TILE_N=128) - -# AFTER: if-branch for current arch, original configs become the else-default -def _my_autotune_configs(): - gpu_capability = torch.cuda.get_device_capability() - - if gpu_capability == (10, 0): # sm100 (B200) — current arch - # NEW: configs designed and tested for sm100 - for occ in [1, 2, 4]: - yield SimpleNamespace(occupancy=occ, TILE_M=128, TILE_N=128) - else: - # UNCHANGED: original flat configs as default for all other architectures - for occ in [2, 4, 8]: - yield SimpleNamespace(occupancy=occ, TILE_M=256, TILE_N=128) -``` - -**Scenario 3: Existing autotune with config for the current architecture.** The source already has a conditional branch for the current GPU architecture. Only modify the current architecture's branch (e.g., adjust tile sizes, add/remove occupancy values). Do **NOT** modify or remove configs for other architectures. - -**Key principles:** - -- **"Target one architecture at a time" means only *add or modify* configs for the detected arch** — it does NOT mean delete existing configs for other architectures. Existing configs were validated on their respective hardware and must be preserved. -- **When adding architecture branching to flat configs**: add an `if` for the current architecture and keep existing configs in the `else` as the default. This guarantees that the code change does not alter kernel behavior on any non-current architecture — the `else` path is identical to the original flat code. -- **Test/validation (Step 11) only applies to the current architecture's branch.** Other branches are assumed correct since they were previously validated on their respective hardware. You cannot test them here because you don't have access to that hardware. - -### Integration with torch.autograd.Function - -When the kernel is used inside a `torch.autograd.Function`: -- Place the tune-once/cache/launch logic in `forward()` only. The cached config is reused across calls. -- In `backward()`, using `ct.launch` with a fixed or cached config is often sufficient. However, if backward has its own independent search space (e.g. grouped GEMM dX and dW have separate optimal configs), autotuning is appropriate there too. -- Example: `rope_embedding.py` — forward uses `exhaustive_search` + cache with split-buffer, backward uses `ct.launch` with same-buffer (Q_in=Q_out). - -### Cross-Backend Config Transfer (Triton → CuTile) - -Use `src/tilegym/autotune.py`: maps `BLOCK_SIZE_M/N/K` → `TILE_SIZE_M/N/K`; `num_warps`/`num_stages` have no CuTile equivalent. - -### Optimizing an Existing Autotune Config - -1. **Profile first**: Use NCU (set `DISABLE_AUTOTUNE=1`). -2. **Expand** (too narrow): add tile sizes, `num_ctas` (sm90+), `swap_ab`. -3. **Prune** (too slow): remove suboptimal configs, use arch-conditional yield, add size filters. -4. **Re-validate**: A/B test to confirm improvement. - -## Pitfall Checklist - -Before submitting code with autotune, verify these: - -### Pitfall #1: In-Place Kernel Data Corruption - -**Problem**: `exhaustive_search` runs the kernel multiple times to benchmark. If the kernel modifies input tensors in-place, the data is corrupted after the first trial run. - -**Solution**: Split-buffer pattern — use separate read-only input and write-only output during search: - -```python -# During exhaustive_search: use separate output buffer -Q_scratch = torch.empty_like(Q) -configs = list(_rope_autotune_configs()) -result = exhaustive_search( - configs, stream, - grid_fn=..., - kernel=rope_kernel, - args_fn=lambda cfg: (Q, Q_scratch, ...), # Q_in != Q_out - hints_fn=..., -) - -# After search: launch with in-place args using tuned config -cfg = result.best.config -tuned_kernel = rope_kernel.replace_hints(occupancy=cfg.occupancy) -ct.launch(stream, grid, tuned_kernel, (Q, Q, ...)) # Q_in == Q_out (in-place) -``` - -**Real example**: `rope_embedding.py` — Search uses split-buffer, final launch uses same-buffer. - -**Also wrong**: Using `Q.clone()` in `args_fn` — this adds ~4us per clone, which is fatal for small kernels (~5us). The clone+copy pattern caused 0.48x performance in RoPE. - -**Tip — isolating output buffers in `args_fn`**: For kernels that write to a dedicated output tensor (not in-place), you *may* use `c.clone()` inside `args_fn` to prevent trial runs from overwriting the final output buffer. This is only needed when the caller reads the output tensor after `exhaustive_search` returns — if you immediately overwrite it with `ct.launch`, clone is unnecessary: - -```python -# Output tensor c will be overwritten by each trial — clone it so trials don't -# corrupt the buffer the caller expects to use after exhaustive_search returns. -result = exhaustive_search( - configs, stream, - grid_fn=..., - kernel=my_kernel, - args_fn=lambda cfg: (a, b, c.clone()), # each trial gets a fresh output - hints_fn=..., -) -``` - -This is safe because the clone cost (~4us) is negligible relative to compute-bound kernel execution time (~50us+). Only avoid `clone()` for very small, memory-bound kernels where 4us is a significant fraction of runtime — in that case, pre-allocate a single scratch buffer outside `args_fn` (as in the split-buffer pattern above). - -### Pitfall #2: Compilation Timeout - -**Problem**: >30 configs in the **final code** causes compilation to exceed 5 minutes. CuTile compilation is heavier than Triton. - -**Solution**: -- Keep the final code's search space ≤ 30 configs — apply arch filters, tile size filters, and pruning rules until you're under the limit -- Use architecture-conditional yield to only generate relevant configs -- If the initial template configs don't beat baseline, use a temporary directed probe (30–100 configs, via bash, not written to file) to identify winning dimensions, then lock the final code to ≤ 8 top candidates (see Design Philosophy) - -**Real example**: Grouped GEMM expanded from 4 to 32 configs → all backward tests timed out. Reverted to occupancy-only (4 configs) with no performance loss. - -### Pitfall #3: Cold-Cache Performance Skew - -**Problem**: First process run is slower due to driver/JIT caches. Can cause wrong config selection. - -**Solution**: Always warm up before measuring. `exhaustive_search` has built-in warmup, but first-process cold start is unavoidable. Re-run if you suspect the initial result was affected. - -### Pitfall #4: NCU Profiling Interference - -**Problem**: NCU profiles autotune trial runs, cluttering the trace. - -**Solution**: Set `DISABLE_AUTOTUNE=1` before profiling, or use `ncu --launch-skip N`. - -### Pitfall #5: search_space as Generator (Exhaustion) - -**Problem**: `exhaustive_search` requires a `Sequence` (list/tuple), not a generator. Passing a generator directly will fail or produce unexpected results. - -**Solution**: Always convert to list: -```python -# CORRECT: convert generator to list -configs = list(_matmul_autotune_configs()) -result = exhaustive_search(configs, ...) - -# WRONG: passing generator directly -result = exhaustive_search(_matmul_autotune_configs(), ...) -``` - -### Pitfall #6: FP8 Precision Loss - -**Problem**: Hardware `/` breaks FP8 quantization bucket boundaries. - -**Solution**: Use `ct.truediv(x, y, rounding_mode=RoundingMode.FULL)` for IEEE-compliant division in FP8 kernels. Never use `/` operator for FP8 scale computation. - -### Pitfall #7: `replace_hints` on Hot Path (Recompilation) - -**Problem**: `replace_hints()` returns a **new kernel object** with its own JIT cache (internally uses `dataclasses.replace()` which creates a fresh instance). Calling it on every kernel invocation — even with the same arguments — triggers recompilation every time. This is the most common autotune performance bug: `cutile_ms` jumps from ~0.04ms to 16–39ms (100–500× slower). - -**Incorrect** (recompiles on every call): -```python -_cache[key] = result.best.config # only stores config - -cfg = _cache[key] -tuned = my_kernel.replace_hints(occupancy=cfg.occupancy) # NEW kernel each time! -ct.launch(stream, grid, tuned, ...) -``` - -**Correct** (compile once, reuse forever): -```python -best_cfg = result.best.config -tuned = my_kernel.replace_hints(occupancy=best_cfg.occupancy) # compile ONCE -_cache[key] = (best_cfg, tuned) # cache both - -cfg, tuned = _cache[key] -ct.launch(stream, grid, tuned, ...) # reuse compiled kernel -``` - -**Rule**: Call `replace_hints` exactly once per config (immediately after `exhaustive_search`), cache the returned kernel object, and never call `replace_hints` again on the fast path. - -## Scope and Boundaries - -This skill covers *only* autotune configuration: search space design, `exhaustive_search` invocation, caching, and `ct.launch` with tuned hints. It does **not** modify kernel code. - -**In scope** (autotune config): -- Search space generator functions -- `exhaustive_search()` calls and result handling -- `kernel.replace_hints()` for applying tuned hints -- Cache logic (key design, dict management) -- `ct.launch()` with tuned kernel -- `DISABLE_AUTOTUNE` fallback path - -**Out of scope** (kernel code modifications — do NOT make these changes): -- Math flags (flush_to_zero, rounding_mode) -- Performance Hints (slice_hint, buffer_depth, copy_config) -- Memory access patterns (2D→1D gather/scatter conversion) -- Codegen optimizations (safe_offs → padding_value) -- Algorithm changes (K-loop split, load balancing) - -## Further Optimization Suggestions - -After adding autotuning, the following kernel-level optimizations may yield additional gains. These are *outside the scope of this skill* — mention them to the user as potential next steps, but do not implement them as part of autotuning: - -- **Math flags**: `flush_to_zero=True` + `rounding_mode=APPROX` can provide 34-72% improvement for FMHA-class kernels (set via environment variables `TILEIR_ENABLE_FTZ=1 TILEIR_ENABLE_APPROX=1` or in kernel code). *Causal chain*: larger tiles initially *decrease* performance by 18-43% due to subnormal handling overhead; enabling FTZ+APPROX rescues this and flips the result to +34-72%. Math flags are therefore a *prerequisite* for large-tile configs to be effective on FMHA-class kernels. -- **Performance Hints**: `slice_hint`, `buffer_depth`, `copy_config` — requires modifying kernel IR code -- **Memory access patterns**: Using TMA loads (`ct.load`) instead of `ct.gather`; removing unnecessary bounds checks (`check_bounds=False` when safe) -- **Codegen quality**: Using `padding_value` parameter instead of manual `ct.where` masking; removing `safe_offs` -- **Algorithm restructuring**: K-loop split, load balancing, algebraic simplification - -## Differences from Triton Autotune - -Key differences: Triton uses `@triton.autotune` decorator with `Config(...)` objects; CuTile uses `exhaustive_search()` with `SimpleNamespace` configs + separate cache + `ct.launch`. CuTile has no `num_warps`/`num_stages` (compiler decides) — only tile sizes + `occupancy` + `num_ctas`. CuTile compilation is heavier (keep ≤30 configs in final code). CuTile cache is user-managed in-memory (no automatic persistence). CuTile separates `args_fn` (kernel args) from `hints_fn` (compiler hints). - -## Reference Documents - -| Category | Document | Content | -|----------|----------|---------| -| **Parameter Design** | [`parameter-space-design.md`](references/parameter-space-design.md) | Per-kernel-type parameter spaces, cross-arch patterns, grid_fn patterns, pruning rules | -| **Search Strategies** | [`search-strategies.md`](references/search-strategies.md) | Exhaustive search, A/B test methodology, DISABLE_AUTOTUNE pattern | -| **Templates** | [`kernel-type-templates.md`](references/kernel-type-templates.md) | Copy-paste autotune templates for 8 kernel types | -| **Hardware** | [`hardware-constraints.md`](references/hardware-constraints.md) | Per-architecture constraints, tile size ranges, num_ctas rules, TMA requirements | - -## Source Code References - -Key files: `ops/cutile/matmul.py` (matmul autotune), `ops/cutile/attention.py` (FMHA autotune), `suites/unsloth/cutile/ct_ops.py` (shared `autotune_configs()` occupancy=[1,2,4,8]), `suites/unsloth/cutile/swiglu.py` (elementwise example), `suites/unsloth/cutile/rope_embedding.py` (split-buffer pattern), `suites/unsloth/cutile/grouped_gemm.py` (persistent GEMM, occupancy-only). - -## Worked Examples - -Each example shows the **before → after** pattern: `fixed_launch.py` (hardcoded `ct.launch`) and `autotuned_launch.py` (refactored to tune-once/cache/launch). - -| Directory | Kernel | Autotune Pattern | Complexity | Key Teaching Point | -|-----------|--------|-----------------|------------|-------------------| -| [`assets/examples/01_rmsnorm_occupancy_only/`](assets/examples/01_rmsnorm_occupancy_only/) | RMSNorm (reduction) | Occupancy-only `[1,2,4,8]` | Low | Most common pattern — no tile tuning, just find best occupancy. Grid = `NUM_SM * cfg.occupancy`. Not in-place. | -| [`assets/examples/02_matmul_full_search/`](assets/examples/02_matmul_full_search/) | GEMM C=A@B | Full: `TILE_M/N/K` + `occupancy` + `num_ctas` (sm90+) | High | Compute-bound kernel with multiple tunable dimensions. `args_fn` passes tile sizes as `ct.Constant[int]`. `grid_fn` depends on `cfg`. ≤30 configs. | -| [`assets/examples/03_rope_inplace_splitbuffer/`](assets/examples/03_rope_inplace_splitbuffer/) | RoPE embedding (in-place) | Occupancy-only, with split-buffer | Medium | In-place kernel MUST use split-buffer during search to avoid corruption. Search writes to scratch; final `ct.launch` uses real in-place args. | diff --git a/.claude/skills b/.claude/skills index 2b7a412b..42c5394a 120000 --- a/.claude/skills +++ b/.claude/skills @@ -1 +1 @@ -../.agents/skills \ No newline at end of file +../skills \ No newline at end of file diff --git a/.github/scripts/check_spdx_headers.py b/.github/scripts/check_spdx_headers.py index 4ec69277..c96ea49b 100755 --- a/.github/scripts/check_spdx_headers.py +++ b/.github/scripts/check_spdx_headers.py @@ -33,11 +33,12 @@ ) # Default SPDX license identifier line for the main repo (MIT). SPDX_LICENSE = "SPDX-License-Identifier: MIT" -# SPDX license identifier line used for skill files (under ``.agents/skills/`` -# and the ``.claude/skills`` symlink). These files are dual-licensed under -# CC-BY-4.0 (documentation) AND Apache-2.0 (source code) per the NVIDIA -# Skills Publishing Onboarding guide and the OSRB-approved CC-BY-4.0-Apache2 -# Dual License pattern. +# SPDX license identifier line used for skill content files (under +# ``skills/``, the canonical location; also accessible via the +# ``.agents/skills`` and ``.claude/skills`` backward-compatibility symlinks). +# These files are dual-licensed under CC-BY-4.0 (documentation) AND +# Apache-2.0 (source code) per the OSRB-approved dual-license pattern; the +# SPDX expression uses ``AND`` to reflect the legal scope. SPDX_LICENSE_SKILLS = "SPDX-License-Identifier: CC-BY-4.0 AND Apache-2.0" # Regex pattern to validate SPDX copyright lines with any valid year or year range @@ -50,21 +51,23 @@ # Public / exportable code (default): MIT only — matches the repo-wide license # for everything that is not a dual-licensed agent skill. # -# Skill content (under ``.agents/skills/``): the dual-licensed combination -# ``CC-BY-4.0 AND Apache-2.0`` only. We deliberately do not accept MIT here -# so that the gate catches any skill file that was authored before the -# relicensing or imported from elsewhere with a stale header. +# Skill content files (under ``skills/``, non-SKILL.md): dual-licensed +# ``CC-BY-4.0 AND Apache-2.0`` per OSRB approval. The NV-BASE validator only +# inspects SKILL.md frontmatter (Tier 1), so the SPDX ``AND`` expression in +# source-file headers is not seen by the validator and remains the legally +# accurate scope marker. ALLOWED_LICENSES_DEFAULT: Tuple[str, ...] = ("MIT",) ALLOWED_LICENSES_SKILLS: Tuple[str, ...] = ("CC-BY-4.0 AND Apache-2.0",) # Directory names (anywhere under root) to skip entirely. # -# ``.agents`` and ``.claude`` are skipped from the default walker because -# they are dual-licensed and therefore cannot use the default MIT header. -# Skill files under those directories are processed separately via -# :func:`iter_skill_files` and :func:`iter_skill_content_files`, both of -# which target ``.agents/skills/`` (the canonical path; ``.claude/skills`` -# is a symlink to ``../.agents/skills`` for agent-tool compatibility). +# ``skills``, ``.agents`` and ``.claude`` are skipped from the default walker +# because they are dual-licensed and therefore cannot use the default MIT +# header. Skill files are processed separately via :func:`iter_skill_files` +# and :func:`iter_skill_content_files`, both of which target the canonical +# ``skills/`` path. ``.agents/skills`` and ``.claude/skills`` are +# backward-compatibility symlinks pointing to ``../skills``; walking only +# the canonical ``skills/`` avoids double-processing the same files. SKIP_DIRS = { ".git", "__pycache__", @@ -75,6 +78,7 @@ ".egg-info", "dist", "build", + "skills", ".agents", ".claude", } @@ -196,7 +200,7 @@ def should_skip_file(file_path: Path, root_dir: Path) -> bool: # License field to insert into SKILL.md (and other frontmatter .md) files -# under ``.agents/skills/``. These files are dual-licensed; the YAML +# under ``skills/``. These files are dual-licensed; the YAML # ``license:`` field carries the same SPDX expression as the in-file SPDX # comment used for non-frontmatter files. SKILL_LICENSE_LINE = "license: CC-BY-4.0 AND Apache-2.0" @@ -207,7 +211,7 @@ def should_skip_file(file_path: Path, root_dir: Path) -> bool: def iter_skill_files(root_dir: Path) -> Iterator[Path]: - """Yield .md files with YAML frontmatter under .agents/skills/. + """Yield .md files with YAML frontmatter under skills/. This includes SKILL.md files and any other .md files that start with ``---`` frontmatter (e.g. sub-skill definitions). All yielded files are @@ -217,12 +221,12 @@ def iter_skill_files(root_dir: Path) -> Iterator[Path]: that :func:`iter_skill_content_files` can give them a standard SPDX comment header instead. - Note: ``.claude/skills`` is a symlink to ``../.agents/skills`` for - backward compatibility with agents that hard-code the ``.claude/`` path. - Walking the canonical ``.agents/skills/`` path avoids double-processing - the same files via the symlink. + Note: ``.agents/skills`` and ``.claude/skills`` are symlinks to + ``../skills`` for backward compatibility with agents that hard-code the + older paths. Walking the canonical ``skills/`` path avoids + double-processing the same files via the symlinks. """ - skills_dir = root_dir / ".agents" / "skills" + skills_dir = root_dir / "skills" if not skills_dir.is_dir(): return for dirpath, _dirnames, filenames in os.walk(skills_dir): @@ -314,7 +318,7 @@ def _has_yaml_frontmatter(path: Path) -> bool: def iter_skill_content_files(root_dir: Path) -> Iterator[Path]: - """Yield .py and ``SKILL.md`` files under .agents/skills/ for SPDX headers. + """Yield .py and ``SKILL.md`` files under skills/ for SPDX headers. .md files with YAML frontmatter (starting with ``---``) are handled by :func:`iter_skill_files` using the frontmatter ``license:`` approach. @@ -331,7 +335,7 @@ def iter_skill_content_files(root_dir: Path) -> Iterator[Path]: ``SKILL.md`` that has not yet been migrated to YAML frontmatter, so the skill itself always advertises its license one way or another. """ - skills_dir = root_dir / ".agents" / "skills" + skills_dir = root_dir / "skills" if not skills_dir.is_dir(): return for dirpath, _dirnames, filenames in os.walk(skills_dir): @@ -568,14 +572,14 @@ def action_write(root_dir: Path) -> int: print(f"Added header to: {file_path.relative_to(root_dir)}") modified_count += 1 - # Handle SKILL.md (and other frontmatter .md) files under .agents/skills/. + # Handle SKILL.md (and other frontmatter .md) files under skills/. # These carry the dual-license expression in the YAML ``license:`` field. for skill_md in iter_skill_files(root_dir): if add_skill_license(skill_md, license_line=SKILL_LICENSE_LINE): print(f"Added/updated license in frontmatter: {skill_md.relative_to(root_dir)}") modified_count += 1 - # Handle .py and non-frontmatter .md files under .agents/skills/. + # Handle .py and non-frontmatter .md files under skills/. # These are dual-licensed under CC-BY-4.0 AND Apache-2.0. for content_file in iter_skill_content_files(root_dir): comment_style = get_comment_style(content_file) @@ -603,7 +607,7 @@ def action_check(root_dir: Path) -> int: if not check_file(file_path): missing_headers.append(file_path) - # Check SKILL.md (and other frontmatter .md) files under .agents/skills/. + # Check SKILL.md (and other frontmatter .md) files under skills/. for skill_md in iter_skill_files(root_dir): try: with open(skill_md, "r", encoding="utf-8") as f: @@ -613,7 +617,7 @@ def action_check(root_dir: Path) -> int: except Exception as e: print(f"Error reading {skill_md}: {e}", file=sys.stderr) - # Check .py and non-frontmatter .md files under .agents/skills/. These + # Check .py and non-frontmatter .md files under skills/. These # must carry the dual-license SPDX expression. for content_file in iter_skill_content_files(root_dir): if not check_file(content_file, allowed_licenses=ALLOWED_LICENSES_SKILLS): diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b0db5851..9b4ecff1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -148,9 +148,10 @@ To accept your contribution, we need a signed Contributor License Agreement (CLA 3. Email the signed CLA to `TileGym@nvidia.com` with subject: `TileGym CLA Submission`. 4. Wait for confirmation from the TileGym team before your PR can be merged. -### 5. Signing your work (DCO) — required for `.agents/skills/` contributions +### 5. Signing your work (DCO) — required for `skills/` contributions -Files under `.agents/skills/` (and the `.claude/skills/` symlink) are dual-licensed under +Files under `skills/` (also accessible via the `.agents/skills/` and `.claude/skills/` +backward-compatibility symlinks) are dual-licensed under **CC-BY-4.0 AND Apache-2.0** (see [`LICENSE`](LICENSE)). All contributions to the dual-licensed agent-skills content must be signed off via the [Developer Certificate of Origin](https://developercertificate.org/) (DCO). @@ -159,7 +160,7 @@ dual-licensed agent-skills content must be signed off via the By signing off on a commit, you certify that the contribution is your original work, or that you have rights to submit it under the same license, or a compatible license. -Any commit touching files under `.agents/skills/` that is not signed off will not be accepted. +Any commit touching files under `skills/` (or its `.agents/skills/` / `.claude/skills/` symlinks) that is not signed off will not be accepted. #### How to sign off diff --git a/LICENSE b/LICENSE index 8292ecc1..c4f883e0 100644 --- a/LICENSE +++ b/LICENSE @@ -6,13 +6,15 @@ This repository is distributed under two licenses: repository. 2. The Agent License (CC-BY-4.0 AND Apache-2.0), set out in Section B - below, applies only to files located under the `.agents/` and - `.claude/` directories (recursively), if present in this repository. + below, applies only to files located under the `skills/` directory + (the canonical location), and equivalently under the `.agents/skills/` + and `.claude/skills/` paths (which are backward-compatibility symlinks + pointing to `skills/`), recursively, if present in this repository. -For any file located under `.agents/` or `.claude/`, both licenses nominally -apply; in the event of any conflict between them for those files, the Agent -License in Section B controls. All other files in the repository are -governed solely by the MIT License in Section A. +For any file located under `skills/`, `.agents/skills/`, or `.claude/skills/`, +both licenses nominally apply; in the event of any conflict between them for +those files, the Agent License in Section B controls. All other files in the +repository are governed solely by the MIT License in Section A. The Agent License additionally travels with the files it covers: it continues to apply to any copy, clone, relocation, or redistribution of those files, @@ -20,13 +22,15 @@ including installations into different directories used by other agent tools (for example, to support Codex or similar). The Agent License scope follows the files themselves, not only the original paths listed above. -If the `.agents/` or `.claude/` directories do not exist in a given checkout -of this repository, the scoping clauses above are inert for that checkout -and the MIT License in Section A governs the entire checkout on its own. +If the `skills/`, `.agents/`, or `.claude/` directories do not exist in a +given checkout of this repository, the scoping clauses above are inert for +that checkout and the MIT License in Section A governs the entire checkout +on its own. -------------------------------------------------------------------------- SECTION A — MIT LICENSE -(APPLIES TO THE ENTIRE REPOSITORY EXCEPT FILES UNDER `.agents/` OR `.claude/`) +(APPLIES TO THE ENTIRE REPOSITORY EXCEPT FILES UNDER `skills/`, + `.agents/skills/`, OR `.claude/skills/`) -------------------------------------------------------------------------- SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. @@ -52,7 +56,7 @@ DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------- SECTION B — AGENT LICENSE (CC-BY-4.0 AND Apache-2.0) -(APPLIES ONLY TO FILES UNDER `.agents/` AND `.claude/`) +(APPLIES ONLY TO FILES UNDER `skills/`, `.agents/skills/`, AND `.claude/skills/`) -------------------------------------------------------------------------- Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. diff --git a/.agents/skills/adding-cutile-kernel/SKILL.md b/skills/adding-cutile-kernel/SKILL.md similarity index 100% rename from .agents/skills/adding-cutile-kernel/SKILL.md rename to skills/adding-cutile-kernel/SKILL.md diff --git a/.agents/skills/converting-cutile-to-julia/SKILL.md b/skills/converting-cutile-to-julia/SKILL.md similarity index 100% rename from .agents/skills/converting-cutile-to-julia/SKILL.md rename to skills/converting-cutile-to-julia/SKILL.md diff --git a/.agents/skills/converting-cutile-to-julia/examples/01_add/cutile_julia.jl b/skills/converting-cutile-to-julia/examples/01_add/cutile_julia.jl similarity index 100% rename from .agents/skills/converting-cutile-to-julia/examples/01_add/cutile_julia.jl rename to skills/converting-cutile-to-julia/examples/01_add/cutile_julia.jl diff --git a/.agents/skills/converting-cutile-to-julia/examples/01_add/cutile_python.py b/skills/converting-cutile-to-julia/examples/01_add/cutile_python.py similarity index 100% rename from .agents/skills/converting-cutile-to-julia/examples/01_add/cutile_python.py rename to skills/converting-cutile-to-julia/examples/01_add/cutile_python.py diff --git a/.agents/skills/converting-cutile-to-julia/examples/02_matmul/cutile_julia.jl b/skills/converting-cutile-to-julia/examples/02_matmul/cutile_julia.jl similarity index 100% rename from .agents/skills/converting-cutile-to-julia/examples/02_matmul/cutile_julia.jl rename to skills/converting-cutile-to-julia/examples/02_matmul/cutile_julia.jl diff --git a/.agents/skills/converting-cutile-to-julia/examples/02_matmul/cutile_python.py b/skills/converting-cutile-to-julia/examples/02_matmul/cutile_python.py similarity index 100% rename from .agents/skills/converting-cutile-to-julia/examples/02_matmul/cutile_python.py rename to skills/converting-cutile-to-julia/examples/02_matmul/cutile_python.py diff --git a/.agents/skills/converting-cutile-to-julia/examples/03_softmax/cutile_julia.jl b/skills/converting-cutile-to-julia/examples/03_softmax/cutile_julia.jl similarity index 100% rename from .agents/skills/converting-cutile-to-julia/examples/03_softmax/cutile_julia.jl rename to skills/converting-cutile-to-julia/examples/03_softmax/cutile_julia.jl diff --git a/.agents/skills/converting-cutile-to-julia/examples/03_softmax/cutile_python.py b/skills/converting-cutile-to-julia/examples/03_softmax/cutile_python.py similarity index 100% rename from .agents/skills/converting-cutile-to-julia/examples/03_softmax/cutile_python.py rename to skills/converting-cutile-to-julia/examples/03_softmax/cutile_python.py diff --git a/.agents/skills/converting-cutile-to-julia/references/api-mapping.md b/skills/converting-cutile-to-julia/references/api-mapping.md similarity index 100% rename from .agents/skills/converting-cutile-to-julia/references/api-mapping.md rename to skills/converting-cutile-to-julia/references/api-mapping.md diff --git a/.agents/skills/converting-cutile-to-julia/references/critical-rules.md b/skills/converting-cutile-to-julia/references/critical-rules.md similarity index 100% rename from .agents/skills/converting-cutile-to-julia/references/critical-rules.md rename to skills/converting-cutile-to-julia/references/critical-rules.md diff --git a/.agents/skills/converting-cutile-to-julia/references/debugging.md b/skills/converting-cutile-to-julia/references/debugging.md similarity index 100% rename from .agents/skills/converting-cutile-to-julia/references/debugging.md rename to skills/converting-cutile-to-julia/references/debugging.md diff --git a/.agents/skills/converting-cutile-to-julia/references/testing.md b/skills/converting-cutile-to-julia/references/testing.md similarity index 100% rename from .agents/skills/converting-cutile-to-julia/references/testing.md rename to skills/converting-cutile-to-julia/references/testing.md diff --git a/.agents/skills/converting-cutile-to-julia/scripts/validate_cutile_jl.py b/skills/converting-cutile-to-julia/scripts/validate_cutile_jl.py similarity index 100% rename from .agents/skills/converting-cutile-to-julia/scripts/validate_cutile_jl.py rename to skills/converting-cutile-to-julia/scripts/validate_cutile_jl.py diff --git a/.agents/skills/converting-cutile-to-julia/translations/workflow.md b/skills/converting-cutile-to-julia/translations/workflow.md similarity index 100% rename from .agents/skills/converting-cutile-to-julia/translations/workflow.md rename to skills/converting-cutile-to-julia/translations/workflow.md diff --git a/.agents/skills/converting-cutile-to-triton/SKILL.md b/skills/converting-cutile-to-triton/SKILL.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/SKILL.md rename to skills/converting-cutile-to-triton/SKILL.md diff --git a/.agents/skills/converting-cutile-to-triton/examples/01_vector_add/cutile_kernel.py b/skills/converting-cutile-to-triton/examples/01_vector_add/cutile_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/01_vector_add/cutile_kernel.py rename to skills/converting-cutile-to-triton/examples/01_vector_add/cutile_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/01_vector_add/triton_kernel.py b/skills/converting-cutile-to-triton/examples/01_vector_add/triton_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/01_vector_add/triton_kernel.py rename to skills/converting-cutile-to-triton/examples/01_vector_add/triton_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/02_softmax/cutile_kernel.py b/skills/converting-cutile-to-triton/examples/02_softmax/cutile_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/02_softmax/cutile_kernel.py rename to skills/converting-cutile-to-triton/examples/02_softmax/cutile_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/02_softmax/triton_kernel.py b/skills/converting-cutile-to-triton/examples/02_softmax/triton_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/02_softmax/triton_kernel.py rename to skills/converting-cutile-to-triton/examples/02_softmax/triton_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/03_layernorm/cutile_kernel.py b/skills/converting-cutile-to-triton/examples/03_layernorm/cutile_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/03_layernorm/cutile_kernel.py rename to skills/converting-cutile-to-triton/examples/03_layernorm/cutile_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/03_layernorm/triton_kernel.py b/skills/converting-cutile-to-triton/examples/03_layernorm/triton_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/03_layernorm/triton_kernel.py rename to skills/converting-cutile-to-triton/examples/03_layernorm/triton_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/04_matmul/cutile_kernel.py b/skills/converting-cutile-to-triton/examples/04_matmul/cutile_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/04_matmul/cutile_kernel.py rename to skills/converting-cutile-to-triton/examples/04_matmul/cutile_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/04_matmul/triton_kernel.py b/skills/converting-cutile-to-triton/examples/04_matmul/triton_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/04_matmul/triton_kernel.py rename to skills/converting-cutile-to-triton/examples/04_matmul/triton_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/05_attention/cutile_kernel.py b/skills/converting-cutile-to-triton/examples/05_attention/cutile_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/05_attention/cutile_kernel.py rename to skills/converting-cutile-to-triton/examples/05_attention/cutile_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/examples/05_attention/triton_kernel.py b/skills/converting-cutile-to-triton/examples/05_attention/triton_kernel.py similarity index 100% rename from .agents/skills/converting-cutile-to-triton/examples/05_attention/triton_kernel.py rename to skills/converting-cutile-to-triton/examples/05_attention/triton_kernel.py diff --git a/.agents/skills/converting-cutile-to-triton/references/api-mapping.md b/skills/converting-cutile-to-triton/references/api-mapping.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/references/api-mapping.md rename to skills/converting-cutile-to-triton/references/api-mapping.md diff --git a/.agents/skills/converting-cutile-to-triton/references/debugging.md b/skills/converting-cutile-to-triton/references/debugging.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/references/debugging.md rename to skills/converting-cutile-to-triton/references/debugging.md diff --git a/.agents/skills/converting-cutile-to-triton/references/gotchas.md b/skills/converting-cutile-to-triton/references/gotchas.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/references/gotchas.md rename to skills/converting-cutile-to-triton/references/gotchas.md diff --git a/.agents/skills/converting-cutile-to-triton/references/harness-integration.md b/skills/converting-cutile-to-triton/references/harness-integration.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/references/harness-integration.md rename to skills/converting-cutile-to-triton/references/harness-integration.md diff --git a/.agents/skills/converting-cutile-to-triton/references/optimization-strategy.md b/skills/converting-cutile-to-triton/references/optimization-strategy.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/references/optimization-strategy.md rename to skills/converting-cutile-to-triton/references/optimization-strategy.md diff --git a/.agents/skills/converting-cutile-to-triton/references/optimizing-reference.md b/skills/converting-cutile-to-triton/references/optimizing-reference.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/references/optimizing-reference.md rename to skills/converting-cutile-to-triton/references/optimizing-reference.md diff --git a/.agents/skills/converting-cutile-to-triton/references/performance-gotchas.md b/skills/converting-cutile-to-triton/references/performance-gotchas.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/references/performance-gotchas.md rename to skills/converting-cutile-to-triton/references/performance-gotchas.md diff --git a/.agents/skills/converting-cutile-to-triton/translations/advanced-patterns.md b/skills/converting-cutile-to-triton/translations/advanced-patterns.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/translations/advanced-patterns.md rename to skills/converting-cutile-to-triton/translations/advanced-patterns.md diff --git a/.agents/skills/converting-cutile-to-triton/translations/file-structure.md b/skills/converting-cutile-to-triton/translations/file-structure.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/translations/file-structure.md rename to skills/converting-cutile-to-triton/translations/file-structure.md diff --git a/.agents/skills/converting-cutile-to-triton/translations/workflow.md b/skills/converting-cutile-to-triton/translations/workflow.md similarity index 100% rename from .agents/skills/converting-cutile-to-triton/translations/workflow.md rename to skills/converting-cutile-to-triton/translations/workflow.md diff --git a/skills/cutile-autotuning/SKILL.md b/skills/cutile-autotuning/SKILL.md new file mode 100644 index 00000000..8657da92 --- /dev/null +++ b/skills/cutile-autotuning/SKILL.md @@ -0,0 +1,240 @@ +--- +name: cutile-autotuning +description: "Use when adding, modifying, optimizing, or debugging CuTile autotuning code. Trigger signals: `exhaustive_search` / `replace_hints` / `hints_fn` / `cuda.tile.tune` in code, `autotune` in filenames, or correctness/performance issues in autotuned CuTile kernels. Covers: tune-once/cache/launch pattern, per-architecture configs (sm80–sm120), parameter space design (tile sizes, occupancy, num_ctas), and 7 common pitfalls with solutions." +license: CC-BY-4.0 AND Apache-2.0 +--- + +# CuTile Autotuning + +Add autotuning to CuTile kernels using the `exhaustive_search` API with tune-once/cache/direct-launch pattern. + +## Instructions + +Follow the decision tree to classify the kernel, design a search space, implement the tune-once/cache/launch pattern, and validate performance. + +1. **Classify** — use the Decision Tree to determine search dimensions (occupancy-only vs full tile search) +2. **Design search space** — select the matching template from `references/kernel-type-templates.md`; prune to ≤ 30 configs in the final code via arch filters (directed exploration probes may temporarily exceed this — see Design Philosophy) +3. **Implement** — add `exhaustive_search` + cache + `ct.launch` following the Step-by-Step Workflow; handle in-place writes with split-buffer if needed +4. **Test** — run correctness with autotune enabled and with `DISABLE_AUTOTUNE=1` +5. **Validate** — A/B benchmark against fixed best-known config; see `references/search-strategies.md` +6. **Shrink** — prune dead-weight configs that never win, targeting ≤ 8 configs per architecture to minimize compilation cost (Step 10) + +## Task Router — Jump to What You Need + +| What are you trying to do? | Go to | +|---|---| +| Add autotune to a new kernel (most common) | Quick Reference below → Workflow: Adding Autotune → `references/kernel-type-templates.md` (pick by kernel type: T1=elementwise, T2=in-place, T3=matmul, T4=persistent, T5=FMHA, T6=FP8, T7=grouped GEMM, T8=varlen attention, T9=dual-GEMM fusion) | +| Debug: data corruption / wrong results after first run | Pitfall #1 (In-Place Kernel) | +| Debug: autotune taking 5+ minutes | Pitfall #2 (Compilation Timeout) | +| Debug: search space generator returning zero configs | Pitfall #5 first; also check arch filters, size guards, and `num_ctas` constraints | +| Optimize an existing autotune config | Workflow: Optimizing an Existing Config | + +## Quick Reference — Occupancy-Only Autotune (Tune-Once/Cache/Launch) + +Most CuTile kernels (elementwise, reduction, LayerNorm) need only occupancy tuning. Copy this pattern: + +```python +from types import SimpleNamespace +from cuda.tile.tune import exhaustive_search +import cuda.tile as ct +import torch + +def _my_autotune_configs(): + for occ in [1, 2, 4, 8]: + yield SimpleNamespace(occupancy=occ) + +# Module-level cache: tune once, launch fast forever after +_autotune_cache = {} + +def my_op(x, output): + stream = torch.cuda.current_stream() + NUM_SM = torch.cuda.get_device_properties(x.device).multi_processor_count + + # Cache key: anything that affects optimal config (use str() for device) + cache_key = (x.shape, x.dtype, str(x.device)) + + if cache_key not in _autotune_cache: + configs = list(_my_autotune_configs()) + result = exhaustive_search( + configs, + stream, + grid_fn=lambda cfg: (min(NUM_SM * cfg.occupancy, M), 1, 1), + kernel=my_kernel, + args_fn=lambda cfg: (x, output, ...), + hints_fn=lambda cfg: {"occupancy": cfg.occupancy}, + ) + best_cfg = result.best.config + tuned_kernel = my_kernel.replace_hints(occupancy=best_cfg.occupancy) + _autotune_cache[cache_key] = (best_cfg, tuned_kernel) # cache BOTH + + cfg, tuned_kernel = _autotune_cache[cache_key] + grid = (min(NUM_SM * cfg.occupancy, M), 1, 1) + ct.launch(stream, grid, tuned_kernel, (x, output, ...)) +``` + +Key rules: +- **Tune once, cache, launch directly** — `exhaustive_search` runs only on first call per shape; subsequent calls use cached config + `ct.launch` with zero overhead +- For in-place kernels use split-buffer during search (separate input/output tensors) +- Keep ≤ 30 configs in final code (see Design Philosophy for temporary directed probes) +- `exhaustive_search` requires a `Sequence` (list/tuple) — convert generators with `list()` +- **Search space must include the original fixed config** — this guarantees autotuning never makes performance worse + +**When to use this pattern**: Kernel has fixed block size (not tile-size tunable). Includes: elementwise (SwiGLU, GeGLU), reduction (RMSNorm, LayerNorm), RoPE, and persistent kernels with heuristic block sizes (grouped GEMM). + +For complex kernels (matmul with tile sizes, FMHA, FP8 with num_ctas), read the full guide below + [`kernel-type-templates.md`](references/kernel-type-templates.md). + +> **⚠️ Three pitfalls catch almost everyone — check before submitting:** +> - **`replace_hints` on hot path?** → Cache BOTH config AND kernel object from `exhaustive_search`. Calling `replace_hints()` every invocation recompiles (100–500× slower) → Pitfall #7 +> - **In-place kernel** (writes back to input tensor)? → MUST use split-buffer pattern during search → Pitfall #1 +> - **Search space empty?** → Check arch filters and `num_ctas` constraints → Pitfall #5 + +> **Minimum coverage**: On sm100+, FMHA/matmul/varlen search spaces must include both `num_ctas=1` and `num_ctas=2`. For core dimensions (tile sizes, occupancy), keep at least 2 distinct values even if unsure which is better — let `exhaustive_search` decide. + +> **When to stop tuning**: A mean speedup in [0.98, 1.02] means your *current* search space isn't helping — but doesn't mean no config will help. Before stopping, check whether you've covered the key dimensions for this kernel type (consult `references/kernel-type-templates.md`). If the search space already covers the template's recommended dimensions and the best result is still noise-floor, then stop — further micro-adjustments won't help. If key dimensions are missing (e.g., never tried `num_ctas=2` for a dual-GEMM kernel), expand the search space rather than giving up. +> +> Once correctness tests pass and the autotuned kernel shows speedup over the fixed-config baseline, **stop — do not re-run to "confirm".** GPU kernel timing fluctuates ±5–10 % between invocations due to clock scaling and OS scheduling; a subsequent timing dip does not mean your code is wrong. +> +> To improve speedup, only modify the autotune search space (configs, tile sizes, occupancy, num_ctas). Do not modify other code (Python wrapper, stream management, etc.) to chase speedup — kernel performance is determined by the config selection, not by host-side code. + +## Reading Guide + +- **Occupancy-only kernels** (elementwise, reduction, persistent with fixed block sizes): Quick Reference + Pitfall Checklist is sufficient — skip `references/` docs. For in-place kernels, also read Pitfall #1. +- **Complex kernels** (matmul with tunable tile sizes, FMHA, FP8 with num_ctas): Quick Reference → Decision Tree → API Reference → Step-by-Step Workflow → relevant `references/` docs. + +**5-step summary**: Classify kernel → Design search space ([`parameter-space-design.md`](references/parameter-space-design.md)) → Implement using template ([`kernel-type-templates.md`](references/kernel-type-templates.md)) → Validate with A/B test → Check Pitfall Checklist. + +**Reading references**: Read only the reference relevant to your kernel type — e.g., for FMHA, read the Template 5 section in `references/kernel-type-templates.md`; for hardware constraints, read only the target architecture's section. Avoid reading all references end-to-end when a targeted lookup suffices. + +## Design Philosophy + +**Build a small, precise search space bottom-up — not a large space trimmed down.** CuTile compilation is much heavier than Triton (~0.5-1s per config), so the **final code** should contain ≤ 30 configs. The approach is: classify the kernel type first, then construct only the relevant configs for that type and architecture. + +**Directed exploration during development**: If the initial template configs yield speedup < 1.0, you may run a *temporary* larger probe (30–100 configs) via `bash + python3 -c` to identify which dimensions matter — but this probe must be **directional**, not a blind cartesian product. Use the kernel type classification to decide *which* dimensions to vary (e.g. for dual-GEMM, probe `num_ctas × occupancy` while fixing tile sizes; for FMHA, probe `TILE_M × num_ctas` while fixing TILE_N). Once the probe identifies the winning region, lock the final code's search space to ≤ 8 top candidates. Do NOT write the large probe into the source file — it is a one-shot diagnostic tool. + +## Decision Tree: What Search Dimensions Does This Kernel Need? + +All kernels should have autotuning added. The question is not *whether* to autotune, but *what dimensions* to search: + +``` +What type of kernel is this? +├── Compute-bound (matmul, GEMM, FMHA) → Does it have multiple tunable dimensions (tile sizes)? +│ ├── YES → Is it a fused multi-GEMM kernel (dual-GEMM, e.g. Linear+GLUAct)? +│ │ ├── YES → Template 9: low occupancy (1–2), conservative tiles (2× SHMEM/register pressure) +│ │ └── NO → Full search: TILE_M × TILE_N × (TILE_K) × occupancy × num_ctas +│ │ (see matmul/FMHA templates in kernel-type-templates.md) +│ └── NO → Occupancy-only search: [1, 2, 4, 8] +│ (see Quick Reference above) +├── Balanced (LayerNorm, reduction + compute) → +│ Occupancy-only search: [1, 2, 4, 8] +│ Expected benefit: 2-15% +└── Memory-bound (CE Loss, pure elementwise) → + Occupancy-only search: [1, 2, 4, 8] + Expected benefit: 0-15% (varies by kernel; zero-cost after tuning) +``` + +**Why memory-bound kernels only search occupancy (not num_ctas or tile sizes)**: +- **`num_ctas` has zero benefit**: `num_ctas > 1` enables TMA multicast, where multiple CTAs share tile data in shared memory (e.g., matmul A/B tiles reused across CTAs). Memory-bound kernels use per-element `ct.gather`/`ct.scatter` with no tile reuse — multi-CTA cooperation adds overhead with no data sharing benefit. +- **Tile sizes are pre-determined**: BLOCK_SIZE for memory-bound kernels is determined by offline sweep (e.g., 1024 is globally optimal on B200 across [256, 512, 1024, 2048, 4096, 8192]). This is a constant, not a runtime tunable. +- **Occupancy is the only effective knob**: Higher occupancy lets the GPU hide memory latency by switching to another CTA while one is stalled on a memory request. + +> **Evidence — CE Loss experiment**: A 12-config search (occupancy × num_ctas) on Cross-Entropy Loss yielded only 2.5% gain (0.79x → 0.81x vs Triton). The `num_ctas` dimension contributed nothing; the result was reverted because compilation cost outweighed the marginal benefit. Occupancy-only (4 configs) achieves the same result at 3x less compilation time. + +**Note on memory-bound kernels**: Adding occupancy-only autotune is always worthwhile because: +- The tune-once/cache/launch pattern has zero runtime overhead after the first call +- The search space is tiny (4 configs, ~2-4s compilation) +- Even small improvements have value at scale + +## Occupancy Selection Guide + +Occupancy controls how many CTAs run concurrently per SM. Use this as a starting point when designing the occupancy search space: + +| Occupancy Range | Best For | Example Kernels | +|-----------------|----------|-----------------| +| 1–4 | Compute-bound (heavy math) | Complex transforms, matmul | +| 4–8 | Balanced (GEMM, TMA) | Matrix multiply, FMHA | +| 8–16 | Memory-bound (reductions) | Softmax, LayerNorm | +| 16–32 | Very light (copies, casts) | Type conversions, elementwise | + +Use these ranges to seed your initial search space. For occupancy-only kernels, `[1, 2, 4, 8]` covers most cases — see Quick Reference above. + +## exhaustive_search API Reference + +See [references/api-reference.md](references/api-reference.md) for the full +`exhaustive_search` API surface — current signature, `TuningResult`, the +tune-once/cache/launch pattern, `replace_hints`, kernel hints, `search_space` +design, and `grid_fn` patterns. + +## Step-by-Step Workflow + +See [references/workflow.md](references/workflow.md) for the end-to-end +workflow — adding autotune to a new kernel, handling existing +multi-architecture configs, integration with `torch.autograd.Function`, +cross-backend config transfer (Triton → CuTile), and optimizing an existing +config. + +## Pitfall Checklist + +See [references/pitfalls.md](references/pitfalls.md) for the full list of +common pitfalls — in-place data corruption, compilation timeout, cold-cache +performance skew, NCU profiling interference, `search_space` generator +exhaustion, FP8 precision loss, and `replace_hints` recompilation on hot +paths. + +## Scope and Boundaries + +This skill covers *only* autotune configuration: search space design, `exhaustive_search` invocation, caching, and `ct.launch` with tuned hints. It does **not** modify kernel code. + +**In scope** (autotune config): +- Search space generator functions +- `exhaustive_search()` calls and result handling +- `kernel.replace_hints()` for applying tuned hints +- Cache logic (key design, dict management) +- `ct.launch()` with tuned kernel +- `DISABLE_AUTOTUNE` fallback path + +**Out of scope** (kernel code modifications — do NOT make these changes): +- Math flags (flush_to_zero, rounding_mode) +- Performance Hints (slice_hint, buffer_depth, copy_config) +- Memory access patterns (2D→1D gather/scatter conversion) +- Codegen optimizations (safe_offs → padding_value) +- Algorithm changes (K-loop split, load balancing) + +## Further Optimization Suggestions + +After adding autotuning, the following kernel-level optimizations may yield additional gains. These are *outside the scope of this skill* — mention them to the user as potential next steps, but do not implement them as part of autotuning: + +- **Math flags**: `flush_to_zero=True` + `rounding_mode=APPROX` can provide 34-72% improvement for FMHA-class kernels (set via environment variables `TILEIR_ENABLE_FTZ=1 TILEIR_ENABLE_APPROX=1` or in kernel code). *Causal chain*: larger tiles initially *decrease* performance by 18-43% due to subnormal handling overhead; enabling FTZ+APPROX rescues this and flips the result to +34-72%. Math flags are therefore a *prerequisite* for large-tile configs to be effective on FMHA-class kernels. +- **Performance Hints**: `slice_hint`, `buffer_depth`, `copy_config` — requires modifying kernel IR code +- **Memory access patterns**: Using TMA loads (`ct.load`) instead of `ct.gather`; removing unnecessary bounds checks (`check_bounds=False` when safe) +- **Codegen quality**: Using `padding_value` parameter instead of manual `ct.where` masking; removing `safe_offs` +- **Algorithm restructuring**: K-loop split, load balancing, algebraic simplification + +## Differences from Triton Autotune + +Key differences: Triton uses `@triton.autotune` decorator with `Config(...)` objects; CuTile uses `exhaustive_search()` with `SimpleNamespace` configs + separate cache + `ct.launch`. CuTile has no `num_warps`/`num_stages` (compiler decides) — only tile sizes + `occupancy` + `num_ctas`. CuTile compilation is heavier (keep ≤30 configs in final code). CuTile cache is user-managed in-memory (no automatic persistence). CuTile separates `args_fn` (kernel args) from `hints_fn` (compiler hints). + +## Reference Documents + +| Category | Document | Content | +|----------|----------|---------| +| **API Reference** | [`api-reference.md`](references/api-reference.md) | `exhaustive_search` signature, `TuningResult`, tune-once/cache/launch pattern, `replace_hints`, kernel hints, `search_space` design, `grid_fn` patterns | +| **Workflow** | [`workflow.md`](references/workflow.md) | End-to-end workflow: adding autotune to a new kernel, multi-architecture configs, `torch.autograd.Function` integration, Triton→CuTile transfer, optimizing existing configs | +| **Pitfalls** | [`pitfalls.md`](references/pitfalls.md) | Common pitfalls: in-place corruption, compilation timeout, cold-cache skew, NCU interference, `search_space` exhaustion, FP8 precision, `replace_hints` recompilation | +| **Parameter Design** | [`parameter-space-design.md`](references/parameter-space-design.md) | Per-kernel-type parameter spaces, cross-arch patterns, grid_fn patterns, pruning rules | +| **Search Strategies** | [`search-strategies.md`](references/search-strategies.md) | Exhaustive search, A/B test methodology, DISABLE_AUTOTUNE pattern | +| **Templates** | [`kernel-type-templates.md`](references/kernel-type-templates.md) | Copy-paste autotune templates for 8 kernel types | +| **Hardware** | [`hardware-constraints.md`](references/hardware-constraints.md) | Per-architecture constraints, tile size ranges, num_ctas rules, TMA requirements | + +## Source Code References + +Key files: `ops/cutile/matmul.py` (matmul autotune), `ops/cutile/attention.py` (FMHA autotune), `suites/unsloth/cutile/ct_ops.py` (shared `autotune_configs()` occupancy=[1,2,4,8]), `suites/unsloth/cutile/swiglu.py` (elementwise example), `suites/unsloth/cutile/rope_embedding.py` (split-buffer pattern), `suites/unsloth/cutile/grouped_gemm.py` (persistent GEMM, occupancy-only). + +## Worked Examples + +Each example shows the **before → after** pattern: `fixed_launch.py` (hardcoded `ct.launch`) and `autotuned_launch.py` (refactored to tune-once/cache/launch). + +| Directory | Kernel | Autotune Pattern | Complexity | Key Teaching Point | +|-----------|--------|-----------------|------------|-------------------| +| [`assets/examples/01_rmsnorm_occupancy_only/`](assets/examples/01_rmsnorm_occupancy_only/) | RMSNorm (reduction) | Occupancy-only `[1,2,4,8]` | Low | Most common pattern — no tile tuning, just find best occupancy. Grid = `NUM_SM * cfg.occupancy`. Not in-place. | +| [`assets/examples/02_matmul_full_search/`](assets/examples/02_matmul_full_search/) | GEMM C=A@B | Full: `TILE_M/N/K` + `occupancy` + `num_ctas` (sm90+) | High | Compute-bound kernel with multiple tunable dimensions. `args_fn` passes tile sizes as `ct.Constant[int]`. `grid_fn` depends on `cfg`. ≤30 configs. | +| [`assets/examples/03_rope_inplace_splitbuffer/`](assets/examples/03_rope_inplace_splitbuffer/) | RoPE embedding (in-place) | Occupancy-only, with split-buffer | Medium | In-place kernel MUST use split-buffer during search to avoid corruption. Search writes to scratch; final `ct.launch` uses real in-place args. | diff --git a/.agents/skills/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/autotuned_launch.py b/skills/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/autotuned_launch.py similarity index 100% rename from .agents/skills/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/autotuned_launch.py rename to skills/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/autotuned_launch.py diff --git a/.agents/skills/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/fixed_launch.py b/skills/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/fixed_launch.py similarity index 100% rename from .agents/skills/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/fixed_launch.py rename to skills/cutile-autotuning/assets/examples/01_rmsnorm_occupancy_only/fixed_launch.py diff --git a/.agents/skills/cutile-autotuning/assets/examples/02_matmul_full_search/autotuned_launch.py b/skills/cutile-autotuning/assets/examples/02_matmul_full_search/autotuned_launch.py similarity index 100% rename from .agents/skills/cutile-autotuning/assets/examples/02_matmul_full_search/autotuned_launch.py rename to skills/cutile-autotuning/assets/examples/02_matmul_full_search/autotuned_launch.py diff --git a/.agents/skills/cutile-autotuning/assets/examples/02_matmul_full_search/fixed_launch.py b/skills/cutile-autotuning/assets/examples/02_matmul_full_search/fixed_launch.py similarity index 100% rename from .agents/skills/cutile-autotuning/assets/examples/02_matmul_full_search/fixed_launch.py rename to skills/cutile-autotuning/assets/examples/02_matmul_full_search/fixed_launch.py diff --git a/.agents/skills/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/autotuned_launch.py b/skills/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/autotuned_launch.py similarity index 100% rename from .agents/skills/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/autotuned_launch.py rename to skills/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/autotuned_launch.py diff --git a/.agents/skills/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/fixed_launch.py b/skills/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/fixed_launch.py similarity index 100% rename from .agents/skills/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/fixed_launch.py rename to skills/cutile-autotuning/assets/examples/03_rope_inplace_splitbuffer/fixed_launch.py diff --git a/skills/cutile-autotuning/references/api-reference.md b/skills/cutile-autotuning/references/api-reference.md new file mode 100644 index 00000000..0c545368 --- /dev/null +++ b/skills/cutile-autotuning/references/api-reference.md @@ -0,0 +1,179 @@ +# exhaustive_search API Reference + +> **⚠️ Deprecated API**: `cuda.tile_experimental.autotune_launch()` (aka `ct_experimental.autotune_launch`) is deprecated and should NOT be used. It combines search + launch in one call with random sampling, which produces less reproducible results and worse config selection compared to `exhaustive_search`. Always use `cuda.tile.tune.exhaustive_search` (the current API below) with explicit caching and `ct.launch`. + +## Current API (`cuda.tile.tune`) + +```python +from cuda.tile.tune import exhaustive_search, TuningResult + +result: TuningResult = exhaustive_search( + search_space, # Sequence[T] — list or tuple of configs (NOT a generator) + stream, # torch.cuda.current_stream() + grid_fn, # callable(cfg) → tuple[int, ...] + kernel, # @ct.kernel decorated function + args_fn, # callable(cfg) → tuple of kernel args + hints_fn=None, # callable(cfg) → {"occupancy": int, "num_ctas": int} + *, + quiet=False # suppress output +) +``` + +## TuningResult + +```python +@dataclass +class TuningResult[T]: + best: Measurement # best config + timing (mean_us, error_margin_us, num_samples) + successes: Sequence[Measurement] # all successful configs (sorted by performance) + failures: Sequence[tuple[T, str, str]] # (config, exception_type, message) +``` + +Key properties: +- **Exhaustive**: evaluates ALL configs in order — no random sampling, no skipped configs +- **Search only**: does not perform the final production launch — it executes trial runs internally for benchmarking, but you call `ct.launch` separately for the actual production invocation +- **No built-in cache**: you manage caching explicitly (see tune-once/cache/launch pattern) +- **Deterministic**: same search space always produces the same evaluation order + +## Tune-Once / Cache / Launch Pattern + +This is the **recommended pattern** for all autotuned kernels. It ensures: +- First call: runs `exhaustive_search` to find the best config (~2-30s depending on space size) +- Subsequent calls: uses cached config with `ct.launch` — zero overhead (identical to a fixed `ct.launch`) + +```python +_cache = {} + +def run_kernel_autotuned(x, ...): + stream = torch.cuda.current_stream() + cache_key = (x.shape, x.dtype, str(x.device)) + + if cache_key not in _cache: + configs = list(_my_autotune_configs()) + result = exhaustive_search( + configs, stream, + grid_fn=lambda cfg: ..., + kernel=my_kernel, + args_fn=lambda cfg: ..., + hints_fn=lambda cfg: {"occupancy": cfg.occupancy}, + ) + best_cfg = result.best.config + tuned_kernel = my_kernel.replace_hints(occupancy=best_cfg.occupancy) + _cache[cache_key] = (best_cfg, tuned_kernel) # cache BOTH config and compiled kernel + + cfg, tuned_kernel = _cache[cache_key] + grid = compute_grid(cfg) + ct.launch(stream, grid, tuned_kernel, (x, ...)) +``` + +**Why this pattern matters**: The `ct.launch` call in the fast path is identical to what you'd write for a fixed-config kernel. There is zero per-call overhead — no lock, no hash lookup, no lambda invocation. The only cost is the Python dict lookup for `_cache[cache_key]`. + +> **⚠️ Critical: always cache the tuned kernel object, not just the config.** `replace_hints()` returns a **new** kernel object with its own independent JIT cache. Calling it on every invocation triggers recompilation each time, degrading performance by 100–500×. Call `replace_hints()` once after `exhaustive_search`, store the returned kernel in the cache alongside the config, and reuse it directly on the fast path. See Pitfall #7. + +## replace_hints + +After finding the best config, use `kernel.replace_hints()` to create a kernel variant with the optimal hints: + +```python +# For occupancy-only: +tuned_kernel = my_kernel.replace_hints(occupancy=cfg.occupancy) + +# For occupancy + num_ctas: +tuned_kernel = my_kernel.replace_hints(occupancy=cfg.occupancy, num_ctas=cfg.num_ctas) +``` + +`replace_hints` accepts only `occupancy` and `num_ctas` — these are the only compiler hints controllable via the autotune API. + +**`ByTarget` wrapping for cross-architecture portability**: When creating tuned kernel variants via `ct.kernel()`, prefer wrapping hint values in `ct.ByTarget` for portability across GPU architectures: + +```python +# Preferred: explicit architecture targeting (portable) +tuned_kernel = ct.kernel( + my_kernel._pyfunc, + occupancy=ct.ByTarget(sm_100=best_cfg.occupancy), + num_ctas=ct.ByTarget(sm_100=best_cfg.num_ctas, default=1), +) + +# Also acceptable: plain integers (when targeting a single architecture) +tuned_kernel = ct.kernel(my_kernel._pyfunc, occupancy=best_cfg.occupancy) +``` + +When targeting only the current GPU (the common case in autotuning), plain integers work fine. Use `ByTarget` when the code may run on multiple architectures or when following production conventions (TileGym production code consistently uses `ByTarget`). + +## Kernel Hints + +CuTile kernel performance is controlled by two compile-time hints: + +- **`occupancy`**: Number of CTAs per SM. Higher occupancy = more parallelism but less shared memory per CTA. +- **`num_ctas`**: Number of CTAs in a CGA (Cooperative Group Array). Used for multi-CTA cooperation (e.g., TMA multicast). Only supported on sm90+. + +Three ways to set hints: + +```python +# 1. Fixed value in decorator (no autotune needed) +@ct.kernel(occupancy=2, num_ctas=1) +def my_kernel(...): ... + +# 2. Architecture-specific fixed value (no autotune needed) +@ct.kernel(num_ctas=ct.ByTarget(sm_100=2, sm_120=1, default=1)) +def my_kernel(...): ... + +# 3. Runtime autotune via exhaustive_search + replace_hints +# IMPORTANT: Remove fixed hints from decorator first! +@ct.kernel +def my_kernel(...): ... + +# Then in the host wrapper: +tuned_kernel = my_kernel.replace_hints(occupancy=best_occ, num_ctas=best_ctas) +ct.launch(stream, grid, tuned_kernel, args) +``` + +**Important**: `replace_hints` correctly overrides decorator hints (it uses `dataclasses.replace()` internally). However, if you forget to call `replace_hints`, the decorator's fixed values are used instead of the autotuned values. To avoid this confusion, always remove fixed hints from the `@ct.kernel(...)` decorator before adding autotuning — this makes it explicit that hints come only from the autotune path. + +## search_space Design + +The search space is a list of `SimpleNamespace` objects. Each namespace holds config fields that `grid_fn`, `args_fn`, and `hints_fn` can read. + +```python +from types import SimpleNamespace + +# Occupancy-only (elementwise kernels) +def autotune_configs(): + for occ in [1, 2, 4, 8]: + yield SimpleNamespace(occupancy=occ) + +# Full matmul search space — see parameter-space-design.md for complete per-architecture configs +# Pattern: yield SimpleNamespace(TILE_SIZE_M=..., TILE_SIZE_N=..., TILE_SIZE_K=..., num_ctas=..., occupancy=...) +``` + +**Note**: `exhaustive_search` requires a `Sequence` (list/tuple), not a generator. Always convert with `list()`: +```python +configs = list(autotune_configs()) +result = exhaustive_search(configs, ...) +``` + +## grid_fn Patterns + +```python +from math import ceil + +# Pattern A: Simple tile coverage (matmul, elementwise) +grid_fn=lambda cfg: (ceil(M / cfg.TILE_SIZE_M) * ceil(N / cfg.TILE_SIZE_N), 1, 1) + +# Pattern B: Persistent matmul (static_persistent_matmul_kernel) +NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count +grid_fn=lambda cfg: ( + min(NUM_SMS // cfg.num_ctas, ceil(M / cfg.TILE_M) * ceil(N / cfg.TILE_N)) * cfg.occupancy, + 1, 1, +) + +# Pattern C: 2D grid (FMHA — one dim for seq tiles, one for batch*heads) +grid_fn=lambda cfg: (ceil(q_len / cfg.TILE_M), batch_size * num_heads, 1) + +# Pattern D: 1D elementwise (cdiv = math.ceil(a/b), from ct_ops.py) +grid_fn=lambda cfg: (cdiv(n_elements, BLOCK_SIZE),) + +# Pattern E: Grouped GEMM persistent (grid fixed at NUM_SMS, occupancy via hints_fn only) +grid_fn=lambda cfg: (NUM_SMS, 1, 1) +``` + diff --git a/.agents/skills/cutile-autotuning/references/hardware-constraints.md b/skills/cutile-autotuning/references/hardware-constraints.md similarity index 100% rename from .agents/skills/cutile-autotuning/references/hardware-constraints.md rename to skills/cutile-autotuning/references/hardware-constraints.md diff --git a/.agents/skills/cutile-autotuning/references/kernel-type-templates.md b/skills/cutile-autotuning/references/kernel-type-templates.md similarity index 100% rename from .agents/skills/cutile-autotuning/references/kernel-type-templates.md rename to skills/cutile-autotuning/references/kernel-type-templates.md diff --git a/.agents/skills/cutile-autotuning/references/parameter-space-design.md b/skills/cutile-autotuning/references/parameter-space-design.md similarity index 100% rename from .agents/skills/cutile-autotuning/references/parameter-space-design.md rename to skills/cutile-autotuning/references/parameter-space-design.md diff --git a/skills/cutile-autotuning/references/pitfalls.md b/skills/cutile-autotuning/references/pitfalls.md new file mode 100644 index 00000000..0b0fe810 --- /dev/null +++ b/skills/cutile-autotuning/references/pitfalls.md @@ -0,0 +1,116 @@ +# Pitfall Checklist + +Before submitting code with autotune, verify these: + +## Pitfall #1: In-Place Kernel Data Corruption + +**Problem**: `exhaustive_search` runs the kernel multiple times to benchmark. If the kernel modifies input tensors in-place, the data is corrupted after the first trial run. + +**Solution**: Split-buffer pattern — use separate read-only input and write-only output during search: + +```python +# During exhaustive_search: use separate output buffer +Q_scratch = torch.empty_like(Q) +configs = list(_rope_autotune_configs()) +result = exhaustive_search( + configs, stream, + grid_fn=..., + kernel=rope_kernel, + args_fn=lambda cfg: (Q, Q_scratch, ...), # Q_in != Q_out + hints_fn=..., +) + +# After search: launch with in-place args using tuned config +cfg = result.best.config +tuned_kernel = rope_kernel.replace_hints(occupancy=cfg.occupancy) +ct.launch(stream, grid, tuned_kernel, (Q, Q, ...)) # Q_in == Q_out (in-place) +``` + +**Real example**: `rope_embedding.py` — Search uses split-buffer, final launch uses same-buffer. + +**Also wrong**: Using `Q.clone()` in `args_fn` — this adds ~4us per clone, which is fatal for small kernels (~5us). The clone+copy pattern caused 0.48x performance in RoPE. + +**Tip — isolating output buffers in `args_fn`**: For kernels that write to a dedicated output tensor (not in-place), you *may* use `c.clone()` inside `args_fn` to prevent trial runs from overwriting the final output buffer. This is only needed when the caller reads the output tensor after `exhaustive_search` returns — if you immediately overwrite it with `ct.launch`, clone is unnecessary: + +```python +# Output tensor c will be overwritten by each trial — clone it so trials don't +# corrupt the buffer the caller expects to use after exhaustive_search returns. +result = exhaustive_search( + configs, stream, + grid_fn=..., + kernel=my_kernel, + args_fn=lambda cfg: (a, b, c.clone()), # each trial gets a fresh output + hints_fn=..., +) +``` + +This is safe because the clone cost (~4us) is negligible relative to compute-bound kernel execution time (~50us+). Only avoid `clone()` for very small, memory-bound kernels where 4us is a significant fraction of runtime — in that case, pre-allocate a single scratch buffer outside `args_fn` (as in the split-buffer pattern above). + +## Pitfall #2: Compilation Timeout + +**Problem**: >30 configs in the **final code** causes compilation to exceed 5 minutes. CuTile compilation is heavier than Triton. + +**Solution**: +- Keep the final code's search space ≤ 30 configs — apply arch filters, tile size filters, and pruning rules until you're under the limit +- Use architecture-conditional yield to only generate relevant configs +- If the initial template configs don't beat baseline, use a temporary directed probe (30–100 configs, via bash, not written to file) to identify winning dimensions, then lock the final code to ≤ 8 top candidates (see Design Philosophy) + +**Real example**: Grouped GEMM expanded from 4 to 32 configs → all backward tests timed out. Reverted to occupancy-only (4 configs) with no performance loss. + +## Pitfall #3: Cold-Cache Performance Skew + +**Problem**: First process run is slower due to driver/JIT caches. Can cause wrong config selection. + +**Solution**: Always warm up before measuring. `exhaustive_search` has built-in warmup, but first-process cold start is unavoidable. Re-run if you suspect the initial result was affected. + +## Pitfall #4: NCU Profiling Interference + +**Problem**: NCU profiles autotune trial runs, cluttering the trace. + +**Solution**: Set `DISABLE_AUTOTUNE=1` before profiling, or use `ncu --launch-skip N`. + +## Pitfall #5: search_space as Generator (Exhaustion) + +**Problem**: `exhaustive_search` requires a `Sequence` (list/tuple), not a generator. Passing a generator directly will fail or produce unexpected results. + +**Solution**: Always convert to list: +```python +# CORRECT: convert generator to list +configs = list(_matmul_autotune_configs()) +result = exhaustive_search(configs, ...) + +# WRONG: passing generator directly +result = exhaustive_search(_matmul_autotune_configs(), ...) +``` + +## Pitfall #6: FP8 Precision Loss + +**Problem**: Hardware `/` breaks FP8 quantization bucket boundaries. + +**Solution**: Use `ct.truediv(x, y, rounding_mode=RoundingMode.FULL)` for IEEE-compliant division in FP8 kernels. Never use `/` operator for FP8 scale computation. + +## Pitfall #7: `replace_hints` on Hot Path (Recompilation) + +**Problem**: `replace_hints()` returns a **new kernel object** with its own JIT cache (internally uses `dataclasses.replace()` which creates a fresh instance). Calling it on every kernel invocation — even with the same arguments — triggers recompilation every time. This is the most common autotune performance bug: `cutile_ms` jumps from ~0.04ms to 16–39ms (100–500× slower). + +**Incorrect** (recompiles on every call): +```python +_cache[key] = result.best.config # only stores config + +cfg = _cache[key] +tuned = my_kernel.replace_hints(occupancy=cfg.occupancy) # NEW kernel each time! +ct.launch(stream, grid, tuned, ...) +``` + +**Correct** (compile once, reuse forever): +```python +best_cfg = result.best.config +tuned = my_kernel.replace_hints(occupancy=best_cfg.occupancy) # compile ONCE +_cache[key] = (best_cfg, tuned) # cache both + +cfg, tuned = _cache[key] +ct.launch(stream, grid, tuned, ...) # reuse compiled kernel +``` + +**Rule**: Call `replace_hints` exactly once per config (immediately after `exhaustive_search`), cache the returned kernel object, and never call `replace_hints` again on the fast path. + diff --git a/.agents/skills/cutile-autotuning/references/search-strategies.md b/skills/cutile-autotuning/references/search-strategies.md similarity index 100% rename from .agents/skills/cutile-autotuning/references/search-strategies.md rename to skills/cutile-autotuning/references/search-strategies.md diff --git a/skills/cutile-autotuning/references/workflow.md b/skills/cutile-autotuning/references/workflow.md new file mode 100644 index 00000000..78b5543c --- /dev/null +++ b/skills/cutile-autotuning/references/workflow.md @@ -0,0 +1,202 @@ +# Step-by-Step Workflow + +## Adding Autotune to a New Kernel + +1. **Classify the kernel** using the decision tree above. + - *VERIFY*: You know whether this is occupancy-only or requires tile-size tuning. + +2. **Remove hardcoded hints from decorator** (strongly recommended): If the kernel currently has hardcoded hints in its decorator (e.g. `@ct.kernel(occupancy=2, num_ctas=1)`), **remove those fixed hints** and change to bare `@ct.kernel` before adding autotuning. While `replace_hints` does correctly override decorator values at runtime, leaving them creates a silent fallback trap: if any code path (e.g., `DISABLE_AUTOTUNE`, error handling, or a future refactor) skips `replace_hints`, the decorator's fixed hints are used instead of the autotuned values — and this produces no error, just silently worse performance. Removing them makes the failure mode explicit (missing hints → compiler defaults) rather than silent (wrong fixed hints used). + - *VERIFY*: The `@ct.kernel` decorator has no `occupancy=` or `num_ctas=` arguments before proceeding. Use bare `@ct.kernel` instead. + +3. **Check for in-place writes**: If the kernel modifies input tensors in-place, you MUST use the split-buffer pattern during `exhaustive_search` — see Pitfall #1. + - *VERIFY*: Either the kernel is not in-place, or you have added a split-buffer scratch tensor for the search phase. + +4. **Select the template** from [`kernel-type-templates.md`](references/kernel-type-templates.md) based on kernel type. + +5. **Design the search space** following [`parameter-space-design.md`](references/parameter-space-design.md): + - **Start from reference configs**, not from scratch. Clone configs from existing production kernels of the same type (e.g., `ops/cutile/matmul.py` for GEMM) and adapt. For GEMM-class kernels, `nvMatmulHeuristics` can suggest 8-16 high-quality candidates that reach 96-99% peak performance — see [`parameter-space-design.md`](references/parameter-space-design.md) for details. + - Detect the current GPU architecture with `torch.cuda.get_device_capability()`. + - **Target one architecture at a time.** Generate configs only for the detected arch. Do NOT add branches for other architectures — they cannot be tested on this machine and untested code paths are unreliable. If multi-arch support is needed later, add it in a separate pass on the appropriate hardware. + - **When modifying code that already has autotune configs**: see "Handling Existing Autotune Configs (Multi-Architecture)" below. The "do NOT add branches" rule means do not *invent new configs* for untested architectures — it does NOT mean remove existing configs that were previously validated. + - Identify tunable parameters (tile sizes, occupancy, num_ctas) + - **Ensure the search space includes the original fixed config** (or an equivalent). This guarantees that the autotuned result is at least as good as the original — no performance regression is possible. + - If the generated set exceeds 30, apply tile size filters and pruning rules to reduce it to ≤ 30 in the final code + - *VERIFY*: Total configs in final code ≤ 30 (CuTile compilation is heavy, >30 configs will timeout). Temporary directed probes during development (30–100 configs, run via `bash + python3 -c`) are allowed — see Design Philosophy. + +6. **Implement** the tune-once/cache/launch pattern: + - Define a `_cache` dict at module level + - Define a cache key that captures all parameters affecting optimal config (shapes, dtypes, device, any flags like `is_causal`). **⚠️ Use `str(x.device)` not `x.device`** in the cache key — `torch.device` objects are not reliably hashable and can cause `TypeError: unhashable type` at runtime. Always convert to string: `cache_key = (..., x.dtype, str(x.device))`. **Tip**: For GEMM-class kernels, round dimensions to the next power of 2 in the cache key (e.g., `cache_key = (next_pow2(M), next_pow2(N), next_pow2(K), dtype, str(device))`) to reduce unique key count and avoid re-tuning for similar shapes. + - Call `exhaustive_search(list(configs), ...)` only when cache misses + - Store `result.best.config` in cache + - Use `kernel.replace_hints(...)` to create the tuned kernel variant + - Use `ct.launch()` for the actual kernel invocation + - `grid_fn` correctly computes grid from config + - `args_fn` passes all kernel arguments including tile sizes as `ct.Constant[int]` + - `hints_fn` passes `occupancy` and/or `num_ctas` from config + - *VERIFY*: `exhaustive_search` receives a `list()` of configs, not a raw generator. + +7. **(Optional) Add DISABLE_AUTOTUNE support** for CI and profiling: check `os.environ.get("DISABLE_AUTOTUNE", "0") == "1"` — when set, skip `exhaustive_search` entirely and fall back to `ct.launch` with the first valid config. Useful for: + - CI determinism (autotune adds variable wall time) + - NCU profiling (prevents autotune trial runs from cluttering the trace — see Pitfall #4) + - Debugging (isolates kernel correctness from autotune behavior) + Skip this step if your task only requires adding autotuning and the project's tests don't check for `DISABLE_AUTOTUNE`. + +8. **Test**: Run correctness tests first (`pytest -k "test_op and cutile"`), then benchmark. + - *VERIFY*: Correctness passes with autotune enabled AND with `DISABLE_AUTOTUNE=1`. + +9. **Validate with A/B test**: Compare autotune version vs fixed best-known config. See [`search-strategies.md`](references/search-strategies.md) for methodology. + - *VERIFY*: Autotune version ≥ baseline (or within noise). If worse, check that the search space includes the original fixed config, and that `replace_hints` is being used correctly. + +10. **Shrink the search space** — reduce compilation cost without losing performance. + + Templates provide broad search spaces as a starting point (e.g., 9 configs for varlen attention). Not all configs contribute to finding the optimal one — on a given architecture and kernel shape, many large-tile or multi-CTA configs compile for seconds each but are never selected. The goal of this step is to *prune the dead weight* so the final committed code has 5–8 configs per architecture instead of 10–15. + + **Why this matters**: Each config in `exhaustive_search` requires a full JIT compilation + warmup + benchmark of the kernel. For complex kernels (FMHA, varlen attention), this costs 2–4 seconds *per config*. Cutting from 9 to 5 configs saves 8–16 seconds of one-time autotuning cost per unique shape, with zero performance loss. + + **Procedure**: + + 1. After Step 9 passes, you already have a working autotuned kernel with the full template search space. Now run the test on 2–3 representative shapes and observe which config wins for each shape. You can inspect this by temporarily adding a print inside the cache-miss block: + ```python + print(f"[autotune] shape={cache_key[:5]} best={result.best.config} " + f"time={result.best.time_ms:.3f}ms " + f"configs_tried={len(result.successes)}") + ``` + + 2. Identify which configs are *competitive* — within 5% of the best for at least one shape. Configs that are never within 5% of the best across any test shape are *dead weight*. + + 3. Remove dead-weight configs from the generator. Always keep: + - The original fixed config (safety net — guarantees no regression) + - The config(s) that won on each test shape + - Any config within 5% of a winner (may win on untested shapes) + + 4. Re-run the test to confirm speedup is unchanged after pruning. + + **Common dead-weight patterns** (prune these first): + - `TILE_M=256` configs for attention/varlen kernels where `S_qo` in the test shapes is ≤ 4096 and batch×heads is large — the grid is already saturated at TILE_M=128. + - `num_ctas=2` configs for kernels with irregular or small grids — multi-CTA parallelism requires enough CTAs to benefit from cooperative launch, which doesn't hold when `grid[0]` is small. + - `occupancy=4` or `occupancy=8` configs on sm100+ for compute-bound kernels — Blackwell typically prefers lower occupancy (1–2) with larger tiles. + + **Target**: ≤ 8 configs per architecture branch in the final code. This keeps the one-time tuning cost under 25 seconds even for the most complex kernels (FMHA, varlen attention). + + - *VERIFY*: Config count ≤ 8 per architecture. `speedup_over_fixed` unchanged after pruning. + +11. **(MANDATORY) Verify correctness and performance before finalizing.** + + The verification requirements depend on the task type. In ALL cases, start with the code-level sanity check, then apply the task-specific verification. + + --- + + **A. Code-level sanity check (ALL tasks — do this first)** + + Review your implementation for known performance anti-patterns. These checks catch *implementation bugs*, not algorithmic issues — they apply regardless of whether you are adding, modifying, or fixing autotune code. + + - `replace_hints` must be called *exactly once* per config and the returned kernel object cached (Pitfall #7). If `replace_hints` appears on the hot path (outside the `if cache_key not in` block), you have a recompilation bug that causes 100-500× slowdown. + - `exhaustive_search` must be inside the cache-miss block, not called on every kernel invocation. + - The fast path should only do: cache lookup → `ct.launch` with the cached tuned kernel. No JIT-triggering calls in between. + - The cache must store `(best_cfg, tuned_kernel)` together — not just `best_cfg` alone. + + --- + + **B. Task-specific verification** + + **B1. Adding or modifying autotune configs** (the original code is correct): + + - *Correctness*: autotuned kernel output matches the reference (e.g. `torch` or fixed-config kernel) within tolerance. + - *Performance*: autotuned kernel must be *at least as fast* as the original fixed-config kernel. If it is slower: + - Check that the search space includes the original fixed config (this guarantees no regression). + - Check if `replace_hints` is being called on every code path — revisit Step 2 (if any path skips `replace_hints`, the decorator's fixed hints are used instead of autotuned values). + - Expand search space if all configs perform similarly (see `references/parameter-space-design.md` → "Adapting Search Space"). + + **B2. Fixing a correctness bug** (the original code produces wrong results): + + - *Correctness is the primary goal*: the fixed kernel must produce correct results. Do NOT compare speedup against the broken original — a correct-but-slower kernel is always better than a fast-but-wrong one. + - *Perf sanity check*: after fixing, verify that the implementation is not catastrophically slow due to an implementation bug (e.g. Pitfall #7). Two ways to check: + 1. *Code review*: confirm the code-level sanity check (Section A above) passes — this catches the most common perf bugs. + 2. *Runtime check*: if possible, compare your fixed+autotuned kernel against a simple correct baseline (e.g. the equivalent `torch` operation, or the kernel launched with a single hardcoded config and no autotuning). Your autotuned version should not be slower than this naive baseline. Minor overhead from the fix itself (e.g. split-buffer allocation) is acceptable. + + --- + + *⚠️ Autotuning bugs (silent hint override, split-buffer omission, hot-path recompilation) are only caught at runtime — always verify by running the kernel, not just by reading the code.* + +## Handling Existing Autotune Configs (Multi-Architecture) + +When adding autotune to a kernel, the source code may already contain autotune configs from a previous pass on different hardware. There are three scenarios: + +**Scenario 1: No existing autotune code.** The source has no autotune at all — follow the standard "Adding Autotune to a New Kernel" workflow above. Generate configs for the current GPU architecture only. + +**Scenario 2: Existing autotune, but no config for the current architecture.** The source already has autotune with configs for other architecture(s) (e.g., sm103) but NOT for the current GPU (e.g., sm100). Steps: + +1. Detect the current architecture with `torch.cuda.get_device_capability()`. +2. Check whether the existing config generator already uses architecture-conditional branching (i.e., `if/elif` on device capability). + - **If yes** (conditional yield structure exists): Add a new `elif` branch for the current architecture. Preserve all existing branches **unchanged** — do not modify their config values. + - **If no** (flat configs, no architecture branching): Add an `if` branch for the current architecture with new configs, and keep the existing flat configs in the `else` block as the default fallback. This ensures that all other architectures continue to use the original configs unchanged — the code modification must not alter kernel behavior on any architecture other than the current one. +3. Design configs for the current architecture following the standard workflow (Steps 4–10 above). +4. Validate only the current architecture's configs (Step 11). Other branches are assumed correct since they were previously validated on their respective hardware. + +Example — adding sm100 to a generator that already has sm103 configs (conditional structure exists): + +```python +def _my_autotune_configs(): + gpu_capability = torch.cuda.get_device_capability() + + if gpu_capability == (10, 0): # sm100 (B200) + # NEW: configs for sm100 (added in this pass) + for occ in [1, 2, 4]: + yield SimpleNamespace(occupancy=occ, TILE_M=128, TILE_N=128) + elif gpu_capability == (10, 3): # sm103 (GB300) + # EXISTING: configs for sm103 (do NOT modify) + for occ in [2, 4, 8]: + yield SimpleNamespace(occupancy=occ, TILE_M=256, TILE_N=128) + else: + # Fallback for unknown architectures + yield SimpleNamespace(occupancy=2, TILE_M=128, TILE_N=128) +``` + +Example — adding current-arch configs to flat (non-branching) code: + +```python +# BEFORE: flat configs (no architecture branching) +def _my_autotune_configs(): + for occ in [2, 4, 8]: + yield SimpleNamespace(occupancy=occ, TILE_M=256, TILE_N=128) + +# AFTER: if-branch for current arch, original configs become the else-default +def _my_autotune_configs(): + gpu_capability = torch.cuda.get_device_capability() + + if gpu_capability == (10, 0): # sm100 (B200) — current arch + # NEW: configs designed and tested for sm100 + for occ in [1, 2, 4]: + yield SimpleNamespace(occupancy=occ, TILE_M=128, TILE_N=128) + else: + # UNCHANGED: original flat configs as default for all other architectures + for occ in [2, 4, 8]: + yield SimpleNamespace(occupancy=occ, TILE_M=256, TILE_N=128) +``` + +**Scenario 3: Existing autotune with config for the current architecture.** The source already has a conditional branch for the current GPU architecture. Only modify the current architecture's branch (e.g., adjust tile sizes, add/remove occupancy values). Do **NOT** modify or remove configs for other architectures. + +**Key principles:** + +- **"Target one architecture at a time" means only *add or modify* configs for the detected arch** — it does NOT mean delete existing configs for other architectures. Existing configs were validated on their respective hardware and must be preserved. +- **When adding architecture branching to flat configs**: add an `if` for the current architecture and keep existing configs in the `else` as the default. This guarantees that the code change does not alter kernel behavior on any non-current architecture — the `else` path is identical to the original flat code. +- **Test/validation (Step 11) only applies to the current architecture's branch.** Other branches are assumed correct since they were previously validated on their respective hardware. You cannot test them here because you don't have access to that hardware. + +## Integration with torch.autograd.Function + +When the kernel is used inside a `torch.autograd.Function`: +- Place the tune-once/cache/launch logic in `forward()` only. The cached config is reused across calls. +- In `backward()`, using `ct.launch` with a fixed or cached config is often sufficient. However, if backward has its own independent search space (e.g. grouped GEMM dX and dW have separate optimal configs), autotuning is appropriate there too. +- Example: `rope_embedding.py` — forward uses `exhaustive_search` + cache with split-buffer, backward uses `ct.launch` with same-buffer (Q_in=Q_out). + +## Cross-Backend Config Transfer (Triton → CuTile) + +Use `src/tilegym/autotune.py`: maps `BLOCK_SIZE_M/N/K` → `TILE_SIZE_M/N/K`; `num_warps`/`num_stages` have no CuTile equivalent. + +## Optimizing an Existing Autotune Config + +1. **Profile first**: Use NCU (set `DISABLE_AUTOTUNE=1`). +2. **Expand** (too narrow): add tile sizes, `num_ctas` (sm90+), `swap_ab`. +3. **Prune** (too slow): remove suboptimal configs, use arch-conditional yield, add size filters. +4. **Re-validate**: A/B test to confirm improvement. + diff --git a/.agents/skills/cutile-python/SKILL.md b/skills/cutile-python/SKILL.md similarity index 98% rename from .agents/skills/cutile-python/SKILL.md rename to skills/cutile-python/SKILL.md index 10da8f19..f408c83b 100644 --- a/.agents/skills/cutile-python/SKILL.md +++ b/skills/cutile-python/SKILL.md @@ -53,7 +53,7 @@ atomics, metaprogramming, classes, enums, autotuning). Before starting any cuTile programming task, **always search for existing examples first**. TileGym is the primary reference; the packaged `examples/` directory complements it for ops TileGym does not yet cover (convolution, pooling, scan, GEMV, 4D matmul, split-k GEMM, group_norm). The skill supports two installation contexts: -- **Inside a TileGym checkout** (`/.agents/skills/cutile-python/`, or `/.claude/skills/cutile-python/` via the backward-compat symlink) — TileGym ops are at `/src/tilegym/ops/cutile/`. +- **Inside a TileGym checkout** (`/skills/cutile-python/`, or `/.agents/skills/cutile-python/` / `/.claude/skills/cutile-python/` via the backward-compat symlinks) — TileGym ops are at `/src/tilegym/ops/cutile/`. - **Installed elsewhere** (e.g. `~/.agents/skills/cutile-python/`, `~/.claude/skills/cutile-python/`, or inside a different repo) — clone TileGym once to `${TILEGYM_SKILL_CACHE_DIR:-~/.cache/tilegym}/TileGym` and use its `src/tilegym/ops/cutile/`. See **[examples/tilegym_and_examples_guide.md](examples/tilegym_and_examples_guide.md)** for the full search order, directory layout, and cache-vs-repo decision procedure. diff --git a/.agents/skills/cutile-python/examples/convolution/README.md b/skills/cutile-python/examples/convolution/README.md similarity index 100% rename from .agents/skills/cutile-python/examples/convolution/README.md rename to skills/cutile-python/examples/convolution/README.md diff --git a/.agents/skills/cutile-python/examples/convolution/conv2d_with_bias_dilation_groups.py b/skills/cutile-python/examples/convolution/conv2d_with_bias_dilation_groups.py similarity index 100% rename from .agents/skills/cutile-python/examples/convolution/conv2d_with_bias_dilation_groups.py rename to skills/cutile-python/examples/convolution/conv2d_with_bias_dilation_groups.py diff --git a/.agents/skills/cutile-python/examples/convolution/conv3d_with_bias_dilation_groups.py b/skills/cutile-python/examples/convolution/conv3d_with_bias_dilation_groups.py similarity index 100% rename from .agents/skills/cutile-python/examples/convolution/conv3d_with_bias_dilation_groups.py rename to skills/cutile-python/examples/convolution/conv3d_with_bias_dilation_groups.py diff --git a/.agents/skills/cutile-python/examples/convolution/conv_transpose_2d.py b/skills/cutile-python/examples/convolution/conv_transpose_2d.py similarity index 100% rename from .agents/skills/cutile-python/examples/convolution/conv_transpose_2d.py rename to skills/cutile-python/examples/convolution/conv_transpose_2d.py diff --git a/.agents/skills/cutile-python/examples/convolution/conv_transpose_3d.py b/skills/cutile-python/examples/convolution/conv_transpose_3d.py similarity index 100% rename from .agents/skills/cutile-python/examples/convolution/conv_transpose_3d.py rename to skills/cutile-python/examples/convolution/conv_transpose_3d.py diff --git a/.agents/skills/cutile-python/examples/matmul/README.md b/skills/cutile-python/examples/matmul/README.md similarity index 100% rename from .agents/skills/cutile-python/examples/matmul/README.md rename to skills/cutile-python/examples/matmul/README.md diff --git a/.agents/skills/cutile-python/examples/matmul/matmul_4d_tensors.py b/skills/cutile-python/examples/matmul/matmul_4d_tensors.py similarity index 100% rename from .agents/skills/cutile-python/examples/matmul/matmul_4d_tensors.py rename to skills/cutile-python/examples/matmul/matmul_4d_tensors.py diff --git a/.agents/skills/cutile-python/examples/matmul/matrix_vector_multiplication.py b/skills/cutile-python/examples/matmul/matrix_vector_multiplication.py similarity index 100% rename from .agents/skills/cutile-python/examples/matmul/matrix_vector_multiplication.py rename to skills/cutile-python/examples/matmul/matrix_vector_multiplication.py diff --git a/.agents/skills/cutile-python/examples/matmul/split_k_gemm.py b/skills/cutile-python/examples/matmul/split_k_gemm.py similarity index 100% rename from .agents/skills/cutile-python/examples/matmul/split_k_gemm.py rename to skills/cutile-python/examples/matmul/split_k_gemm.py diff --git a/.agents/skills/cutile-python/examples/normalization/README.md b/skills/cutile-python/examples/normalization/README.md similarity index 100% rename from .agents/skills/cutile-python/examples/normalization/README.md rename to skills/cutile-python/examples/normalization/README.md diff --git a/.agents/skills/cutile-python/examples/normalization/group_norm.py b/skills/cutile-python/examples/normalization/group_norm.py similarity index 100% rename from .agents/skills/cutile-python/examples/normalization/group_norm.py rename to skills/cutile-python/examples/normalization/group_norm.py diff --git a/.agents/skills/cutile-python/examples/pooling/README.md b/skills/cutile-python/examples/pooling/README.md similarity index 100% rename from .agents/skills/cutile-python/examples/pooling/README.md rename to skills/cutile-python/examples/pooling/README.md diff --git a/.agents/skills/cutile-python/examples/pooling/avgpool3d.py b/skills/cutile-python/examples/pooling/avgpool3d.py similarity index 100% rename from .agents/skills/cutile-python/examples/pooling/avgpool3d.py rename to skills/cutile-python/examples/pooling/avgpool3d.py diff --git a/.agents/skills/cutile-python/examples/pooling/maxpool3d.py b/skills/cutile-python/examples/pooling/maxpool3d.py similarity index 100% rename from .agents/skills/cutile-python/examples/pooling/maxpool3d.py rename to skills/cutile-python/examples/pooling/maxpool3d.py diff --git a/.agents/skills/cutile-python/examples/scan/README.md b/skills/cutile-python/examples/scan/README.md similarity index 100% rename from .agents/skills/cutile-python/examples/scan/README.md rename to skills/cutile-python/examples/scan/README.md diff --git a/.agents/skills/cutile-python/examples/scan/cumsum_cumprod_blocking.py b/skills/cutile-python/examples/scan/cumsum_cumprod_blocking.py similarity index 100% rename from .agents/skills/cutile-python/examples/scan/cumsum_cumprod_blocking.py rename to skills/cutile-python/examples/scan/cumsum_cumprod_blocking.py diff --git a/.agents/skills/cutile-python/examples/tilegym_and_examples_guide.md b/skills/cutile-python/examples/tilegym_and_examples_guide.md similarity index 92% rename from .agents/skills/cutile-python/examples/tilegym_and_examples_guide.md rename to skills/cutile-python/examples/tilegym_and_examples_guide.md index 0e7b132e..00545d64 100644 --- a/.agents/skills/cutile-python/examples/tilegym_and_examples_guide.md +++ b/skills/cutile-python/examples/tilegym_and_examples_guide.md @@ -8,7 +8,7 @@ The skill supports two installation contexts. Figure out which one applies befor ### Case 1 — skill inside a TileGym checkout -Path looks like `/.agents/skills/cutile-python/` (or `/.claude/skills/cutile-python/` via the backward-compat symlink). The enclosing repo **is** TileGym. No clone needed — use it directly: +Path looks like `/skills/cutile-python/` (or `/.agents/skills/cutile-python/` / `/.claude/skills/cutile-python/` via the backward-compat symlinks). The enclosing repo **is** TileGym. No clone needed — use it directly: ``` /src/tilegym/ops/cutile/ diff --git a/.agents/skills/cutile-python/guidelines/01_implementation_lessons.md b/skills/cutile-python/guidelines/01_implementation_lessons.md similarity index 100% rename from .agents/skills/cutile-python/guidelines/01_implementation_lessons.md rename to skills/cutile-python/guidelines/01_implementation_lessons.md diff --git a/.agents/skills/cutile-python/guidelines/02_code_generation_rules.md b/skills/cutile-python/guidelines/02_code_generation_rules.md similarity index 100% rename from .agents/skills/cutile-python/guidelines/02_code_generation_rules.md rename to skills/cutile-python/guidelines/02_code_generation_rules.md diff --git a/.agents/skills/cutile-python/guidelines/03_concepts.md b/skills/cutile-python/guidelines/03_concepts.md similarity index 100% rename from .agents/skills/cutile-python/guidelines/03_concepts.md rename to skills/cutile-python/guidelines/03_concepts.md diff --git a/.agents/skills/cutile-python/orchestration/analyzer_agent.md b/skills/cutile-python/orchestration/analyzer_agent.md similarity index 100% rename from .agents/skills/cutile-python/orchestration/analyzer_agent.md rename to skills/cutile-python/orchestration/analyzer_agent.md diff --git a/.agents/skills/cutile-python/orchestration/composer_agent.md b/skills/cutile-python/orchestration/composer_agent.md similarity index 100% rename from .agents/skills/cutile-python/orchestration/composer_agent.md rename to skills/cutile-python/orchestration/composer_agent.md diff --git a/.agents/skills/cutile-python/orchestration/kernel_agent.md b/skills/cutile-python/orchestration/kernel_agent.md similarity index 100% rename from .agents/skills/cutile-python/orchestration/kernel_agent.md rename to skills/cutile-python/orchestration/kernel_agent.md diff --git a/.agents/skills/cutile-python/orchestration/overview.md b/skills/cutile-python/orchestration/overview.md similarity index 100% rename from .agents/skills/cutile-python/orchestration/overview.md rename to skills/cutile-python/orchestration/overview.md diff --git a/.agents/skills/cutile-python/orchestration/workflow.md b/skills/cutile-python/orchestration/workflow.md similarity index 100% rename from .agents/skills/cutile-python/orchestration/workflow.md rename to skills/cutile-python/orchestration/workflow.md diff --git a/.agents/skills/cutile-python/torch-learner/examples/lstm_trace.md b/skills/cutile-python/torch-learner/examples/lstm_trace.md similarity index 100% rename from .agents/skills/cutile-python/torch-learner/examples/lstm_trace.md rename to skills/cutile-python/torch-learner/examples/lstm_trace.md diff --git a/.agents/skills/cutile-python/torch-learner/references/1_pytorch_codebase_map.md b/skills/cutile-python/torch-learner/references/1_pytorch_codebase_map.md similarity index 100% rename from .agents/skills/cutile-python/torch-learner/references/1_pytorch_codebase_map.md rename to skills/cutile-python/torch-learner/references/1_pytorch_codebase_map.md diff --git a/.agents/skills/cutile-python/torch-learner/references/2_dispatch_mechanism.md b/skills/cutile-python/torch-learner/references/2_dispatch_mechanism.md similarity index 100% rename from .agents/skills/cutile-python/torch-learner/references/2_dispatch_mechanism.md rename to skills/cutile-python/torch-learner/references/2_dispatch_mechanism.md diff --git a/.agents/skills/cutile-python/torch-learner/references/3_tracing_strategies.md b/skills/cutile-python/torch-learner/references/3_tracing_strategies.md similarity index 100% rename from .agents/skills/cutile-python/torch-learner/references/3_tracing_strategies.md rename to skills/cutile-python/torch-learner/references/3_tracing_strategies.md diff --git a/.agents/skills/cutile-python/torch-learner/references/4_language_layers.md b/skills/cutile-python/torch-learner/references/4_language_layers.md similarity index 100% rename from .agents/skills/cutile-python/torch-learner/references/4_language_layers.md rename to skills/cutile-python/torch-learner/references/4_language_layers.md diff --git a/.agents/skills/cutile-python/torch-learner/references/5_well_known_ops.md b/skills/cutile-python/torch-learner/references/5_well_known_ops.md similarity index 100% rename from .agents/skills/cutile-python/torch-learner/references/5_well_known_ops.md rename to skills/cutile-python/torch-learner/references/5_well_known_ops.md diff --git a/.agents/skills/cutile-python/torch-learner/tracing_workflow.md b/skills/cutile-python/torch-learner/tracing_workflow.md similarity index 100% rename from .agents/skills/cutile-python/torch-learner/tracing_workflow.md rename to skills/cutile-python/torch-learner/tracing_workflow.md diff --git a/.agents/skills/improve-cutile-kernel-perf/SKILL.md b/skills/improve-cutile-kernel-perf/SKILL.md similarity index 100% rename from .agents/skills/improve-cutile-kernel-perf/SKILL.md rename to skills/improve-cutile-kernel-perf/SKILL.md diff --git a/.agents/skills/improve-cutile-kernel-perf/references/cutile-api-reference.md b/skills/improve-cutile-kernel-perf/references/cutile-api-reference.md similarity index 100% rename from .agents/skills/improve-cutile-kernel-perf/references/cutile-api-reference.md rename to skills/improve-cutile-kernel-perf/references/cutile-api-reference.md diff --git a/.agents/skills/improve-cutile-kernel-perf/references/cutile-patterns-reference.md b/skills/improve-cutile-kernel-perf/references/cutile-patterns-reference.md similarity index 100% rename from .agents/skills/improve-cutile-kernel-perf/references/cutile-patterns-reference.md rename to skills/improve-cutile-kernel-perf/references/cutile-patterns-reference.md diff --git a/.agents/skills/improve-cutile-kernel-perf/references/ir-dump-guide.md b/skills/improve-cutile-kernel-perf/references/ir-dump-guide.md similarity index 100% rename from .agents/skills/improve-cutile-kernel-perf/references/ir-dump-guide.md rename to skills/improve-cutile-kernel-perf/references/ir-dump-guide.md diff --git a/.agents/skills/improve-cutile-kernel-perf/references/optimization-playbook.md b/skills/improve-cutile-kernel-perf/references/optimization-playbook.md similarity index 100% rename from .agents/skills/improve-cutile-kernel-perf/references/optimization-playbook.md rename to skills/improve-cutile-kernel-perf/references/optimization-playbook.md diff --git a/.agents/skills/improve-cutile-kernel-perf/references/perf-knobs-catalog.md b/skills/improve-cutile-kernel-perf/references/perf-knobs-catalog.md similarity index 100% rename from .agents/skills/improve-cutile-kernel-perf/references/perf-knobs-catalog.md rename to skills/improve-cutile-kernel-perf/references/perf-knobs-catalog.md diff --git a/.agents/skills/improve-cutile-kernel-perf/references/performance-model.md b/skills/improve-cutile-kernel-perf/references/performance-model.md similarity index 100% rename from .agents/skills/improve-cutile-kernel-perf/references/performance-model.md rename to skills/improve-cutile-kernel-perf/references/performance-model.md diff --git a/.agents/skills/monkey-patch-kernels-to-transformers/SKILL.md b/skills/monkey-patch-kernels-to-transformers/SKILL.md similarity index 100% rename from .agents/skills/monkey-patch-kernels-to-transformers/SKILL.md rename to skills/monkey-patch-kernels-to-transformers/SKILL.md diff --git a/.agents/skills/monkey-patch-kernels-to-transformers/references/auto-kernelize.md b/skills/monkey-patch-kernels-to-transformers/references/auto-kernelize.md similarity index 100% rename from .agents/skills/monkey-patch-kernels-to-transformers/references/auto-kernelize.md rename to skills/monkey-patch-kernels-to-transformers/references/auto-kernelize.md diff --git a/.agents/skills/monkey-patch-kernels-to-transformers/references/environment-setup.md b/skills/monkey-patch-kernels-to-transformers/references/environment-setup.md similarity index 100% rename from .agents/skills/monkey-patch-kernels-to-transformers/references/environment-setup.md rename to skills/monkey-patch-kernels-to-transformers/references/environment-setup.md diff --git a/.agents/skills/monkey-patch-kernels-to-transformers/references/kernel-integration.md b/skills/monkey-patch-kernels-to-transformers/references/kernel-integration.md similarity index 100% rename from .agents/skills/monkey-patch-kernels-to-transformers/references/kernel-integration.md rename to skills/monkey-patch-kernels-to-transformers/references/kernel-integration.md diff --git a/.agents/skills/monkey-patch-kernels-to-transformers/references/workflow-diagram.png b/skills/monkey-patch-kernels-to-transformers/references/workflow-diagram.png similarity index 100% rename from .agents/skills/monkey-patch-kernels-to-transformers/references/workflow-diagram.png rename to skills/monkey-patch-kernels-to-transformers/references/workflow-diagram.png From 6131ed351e13646e1aab8163634535e42a7ef020 Mon Sep 17 00:00:00 2001 From: Hannah Li Date: Thu, 28 May 2026 07:18:40 +0800 Subject: [PATCH 2/3] Fix sibling-link paths in references/workflow.md Signed-off-by: Hannah Li --- skills/cutile-autotuning/references/workflow.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skills/cutile-autotuning/references/workflow.md b/skills/cutile-autotuning/references/workflow.md index 78b5543c..7517acd2 100644 --- a/skills/cutile-autotuning/references/workflow.md +++ b/skills/cutile-autotuning/references/workflow.md @@ -11,10 +11,10 @@ 3. **Check for in-place writes**: If the kernel modifies input tensors in-place, you MUST use the split-buffer pattern during `exhaustive_search` — see Pitfall #1. - *VERIFY*: Either the kernel is not in-place, or you have added a split-buffer scratch tensor for the search phase. -4. **Select the template** from [`kernel-type-templates.md`](references/kernel-type-templates.md) based on kernel type. +4. **Select the template** from [`kernel-type-templates.md`](kernel-type-templates.md) based on kernel type. -5. **Design the search space** following [`parameter-space-design.md`](references/parameter-space-design.md): - - **Start from reference configs**, not from scratch. Clone configs from existing production kernels of the same type (e.g., `ops/cutile/matmul.py` for GEMM) and adapt. For GEMM-class kernels, `nvMatmulHeuristics` can suggest 8-16 high-quality candidates that reach 96-99% peak performance — see [`parameter-space-design.md`](references/parameter-space-design.md) for details. +5. **Design the search space** following [`parameter-space-design.md`](parameter-space-design.md): + - **Start from reference configs**, not from scratch. Clone configs from existing production kernels of the same type (e.g., `ops/cutile/matmul.py` for GEMM) and adapt. For GEMM-class kernels, `nvMatmulHeuristics` can suggest 8-16 high-quality candidates that reach 96-99% peak performance — see [`parameter-space-design.md`](parameter-space-design.md) for details. - Detect the current GPU architecture with `torch.cuda.get_device_capability()`. - **Target one architecture at a time.** Generate configs only for the detected arch. Do NOT add branches for other architectures — they cannot be tested on this machine and untested code paths are unreliable. If multi-arch support is needed later, add it in a separate pass on the appropriate hardware. - **When modifying code that already has autotune configs**: see "Handling Existing Autotune Configs (Multi-Architecture)" below. The "do NOT add branches" rule means do not *invent new configs* for untested architectures — it does NOT mean remove existing configs that were previously validated. @@ -44,7 +44,7 @@ 8. **Test**: Run correctness tests first (`pytest -k "test_op and cutile"`), then benchmark. - *VERIFY*: Correctness passes with autotune enabled AND with `DISABLE_AUTOTUNE=1`. -9. **Validate with A/B test**: Compare autotune version vs fixed best-known config. See [`search-strategies.md`](references/search-strategies.md) for methodology. +9. **Validate with A/B test**: Compare autotune version vs fixed best-known config. See [`search-strategies.md`](search-strategies.md) for methodology. - *VERIFY*: Autotune version ≥ baseline (or within noise). If worse, check that the search space includes the original fixed config, and that `replace_hints` is being used correctly. 10. **Shrink the search space** — reduce compilation cost without losing performance. From 0531fbca3731112d4b285af7120eafa1738d6268 Mon Sep 17 00:00:00 2001 From: nvskills-svc-account Date: Wed, 27 May 2026 23:49:47 +0000 Subject: [PATCH 3/3] Attach NVSkills validation signatures --- skills/adding-cutile-kernel/skill-card.md | 37 +++++++++++++++ skills/adding-cutile-kernel/skill.oms.sig | 1 + .../converting-cutile-to-julia/skill-card.md | 41 +++++++++++++++++ .../converting-cutile-to-julia/skill.oms.sig | 1 + .../converting-cutile-to-triton/skill-card.md | 46 +++++++++++++++++++ .../converting-cutile-to-triton/skill.oms.sig | 1 + skills/cutile-autotuning/skill-card.md | 43 +++++++++++++++++ skills/cutile-autotuning/skill.oms.sig | 1 + skills/cutile-python/skill-card.md | 43 +++++++++++++++++ skills/cutile-python/skill.oms.sig | 1 + .../improve-cutile-kernel-perf/skill-card.md | 42 +++++++++++++++++ .../improve-cutile-kernel-perf/skill.oms.sig | 1 + .../skill-card.md | 41 +++++++++++++++++ .../skill.oms.sig | 1 + 14 files changed, 300 insertions(+) create mode 100644 skills/adding-cutile-kernel/skill-card.md create mode 100644 skills/adding-cutile-kernel/skill.oms.sig create mode 100644 skills/converting-cutile-to-julia/skill-card.md create mode 100644 skills/converting-cutile-to-julia/skill.oms.sig create mode 100644 skills/converting-cutile-to-triton/skill-card.md create mode 100644 skills/converting-cutile-to-triton/skill.oms.sig create mode 100644 skills/cutile-autotuning/skill-card.md create mode 100644 skills/cutile-autotuning/skill.oms.sig create mode 100644 skills/cutile-python/skill-card.md create mode 100644 skills/cutile-python/skill.oms.sig create mode 100644 skills/improve-cutile-kernel-perf/skill-card.md create mode 100644 skills/improve-cutile-kernel-perf/skill.oms.sig create mode 100644 skills/monkey-patch-kernels-to-transformers/skill-card.md create mode 100644 skills/monkey-patch-kernels-to-transformers/skill.oms.sig diff --git a/skills/adding-cutile-kernel/skill-card.md b/skills/adding-cutile-kernel/skill-card.md new file mode 100644 index 00000000..e11edb7e --- /dev/null +++ b/skills/adding-cutile-kernel/skill-card.md @@ -0,0 +1,37 @@ +## Description:
+Add a new cuTile GPU kernel operator to TileGym, covering dispatch registration in ops.py, cuTile backend implementation, __init__.py exports, test creation, and benchmark in tests/benchmark.
+ +This skill is ready for commercial/non-commercial use.
+ +## Owner: NVIDIA
+ +### License/Terms of Use:
+CC-BY-4.0 AND Apache-2.0
+## Use Case:
+Developers and engineers use this skill to add new cuTile GPU kernel operators to the TileGym library, following the standardized workflow for dispatch registration, backend implementation, testing, and benchmarking.
+ +### Deployment Geography for Use:
+Global
+ +## Known Risks and Mitigations:
+Risk: Review before execution as proposals could introduce incorrect or misleading guidance into skills.
+Mitigation: Review and scan skill before deployment.
+ +## Reference(s):
+- [TileGym Repository](https://github.com/NVIDIA/TileGym)
+ + +## Skill Output:
+**Output Type(s):** [Code, Files, Shell commands]
+**Output Format:** [Python source files and pytest/benchmark scripts]
+**Output Parameters:** [1D]
+**Other Properties Related to Output:** [None]
+ +## Skill Version(s):
+v1.3.0-13-g2385245 (source: git tag)
+ +## Ethical Considerations:
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal team to ensure this skill meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+ +(For Release on NVIDIA Platforms Only)
+Please report quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
diff --git a/skills/adding-cutile-kernel/skill.oms.sig b/skills/adding-cutile-kernel/skill.oms.sig new file mode 100644 index 00000000..8ea82764 --- /dev/null +++ b/skills/adding-cutile-kernel/skill.oms.sig @@ -0,0 +1 @@ +{"mediaType":"application/vnd.dev.sigstore.bundle.v0.3+json","verificationMaterial":{"x509CertificateChain":{"certificates":[{"rawBytes":"MIICgzCCAgmgAwIBAgIUKIyS7SxNteQIiWzK1dWj85E6520wCgYIKoZIzj0EAwMwVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwHhcNMjYwNDAxMDAwMDAwWhcNMjgwNDIyMTUzMzA5WjBUMQswCQYDVQQGEwJVUzEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMSgwJgYDVQQDDB9OVklESUEgQWdlbnQgU2tpbGxzIFNpZ25pbmcgMDAxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEYoRM9bQl/dGlwSRNi6bTpIJUXH8Nv9GciP6LSflJYYMLCc296kpyuTSsk5ddbAWiDcFX3C/ydX3jwc+qCLYP6uHy9XphyLjOQ27Yb2J6rBLVtRBS1mgGco/Gr7fL6ODco4GaMIGXMB0GA1UdDgQWBBRQ/5ZW3nJ6lmo9SVk7I15o7UGmpTAfBgNVHSMEGDAWgBRPGpILxMBBleJSsBGjrMKsby1CgjAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIHgDA3BggrBgEFBQcBAQQrMCkwJwYIKwYBBQUHMAGGG2h0dHA6Ly9vY3NwLm5kaXMubnZpZGlhLmNvbTAKBggqhkjOPQQDAwNoADBlAjAUygu/GiOCIXrgGr4SmLgeEVDcEitfFUv7ALbvLVGVyMysB3mxmO/uInZfXzWcJZsCMQDxuoxj4ZmO30jhkPIcCxGFCOvnUsnfU3TfGcouYm4M6iRpbKvtVnHPiy4bi6pcKf0="},{"rawBytes":"MIICiDCCAg6gAwIBAgIUZsIuSv9NkpJCNqtYEfCouVv5BzowCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASI72cR3ctKGg4VWnB3bNja6g1Z2PnOmFEopkPof+QeIcPk9rT+g9MjJnq51EQXL93a7C2GJ9J985G4o2V85VD7wJ1RaXhluHW2rf3y8bQGeAYaKMr5s/hUgn+M3/9WlWejgaAwgZ0wHQYDVR0OBBYEFE8akgvEwEGV4lKwEaOswqxvLUKCMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDcGCCsGAQUFBwEBBCswKTAnBggrBgEFBQcwAYYbaHR0cDovL29jc3AubmRpcy5udmlkaWEuY29tMAoGCCqGSM49BAMDA2gAMGUCMQCeIMMfAbyzPDacw2MxG+Yt1cikrJX/DVxiGfXuHmkkXn6VgSzE79+lkqDErpVO2gYCMCNEColOyvUvkzZGUEI1hQ3PfMgi3FIo9tHoBKMw4/wGBLFpu/0ubtmbBXM6/UMOEw=="},{"rawBytes":"MIICRTCCAcygAwIBAgIUeJdY3rV86EdvFmG7L8LJBsyQFYkwCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTB2MBAGByqGSM49AgEGBSuBBAAiA2IABAYpiXCDjJ9NT2eSDhyHJVSw1Tbze18cGG2F/578oWvHxg23eQAhNRYdq88i1iOshZSO6C29doKui5Xpmo/7Ctw9Sx4PP2RzOmIuOLCuTdNtKcTRwi4GEsd5BAFvWj42M6NjMGEwHQYDVR0OBBYEFItnoAjjfuCEUvzyvWyI2vOGvwPjMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cAMGQCMCwtAjWLaNwgGWNCgdyNoTyvNhqWRECRJV2r3+7w8g0PL6NHLOsbkgE09BH95h8XlgIwTaQmbbUh2ChAJ5TA1wRiVDnCcvbzHlZl2jM2FcwQQZlk19LOAbyGMRixbu2Ww/rj"}]},"tlogEntries":[]},"dsseEnvelope":{"payload":"ewogICJfdHlwZSI6ICJodHRwczovL2luLXRvdG8uaW8vU3RhdGVtZW50L3YxIiwKICAic3ViamVjdCI6IFsKICAgIHsKICAgICAgIm5hbWUiOiAiYWRkaW5nLWN1dGlsZS1rZXJuZWwiLAogICAgICAiZGlnZXN0IjogewogICAgICAgICJzaGEyNTYiOiAiZjhiNDAyYmY2MWM1NGEyYmRjMjRlZDhiYmU1ZDc3MTgwYTYzODIyZTFlYzY5MmFmOGYwOTU2M2Y4YzZhMjllYyIKICAgICAgfQogICAgfQogIF0sCiAgInByZWRpY2F0ZVR5cGUiOiAiaHR0cHM6Ly9tb2RlbF9zaWduaW5nL3NpZ25hdHVyZS92MS4wIiwKICAicHJlZGljYXRlIjogewogICAgInJlc291cmNlcyI6IFsKICAgICAgewogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAibmFtZSI6ICJTS0lMTC5tZCIsCiAgICAgICAgImRpZ2VzdCI6ICI2ZmUxODZlZDllNWNmOTc2ZGEyMmM4ZThlMGM1ODYzYTNkN2E3ZDA2MzczYjVjYjczZDFlNThjNDNkODQzNWU0IgogICAgICB9LAogICAgICB7CiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogInNraWxsLWNhcmQubWQiLAogICAgICAgICJkaWdlc3QiOiAiMDYxNDBiYjIyMDVjYWMxY2RlYjFkYzdhMWY1YjY1ODg2ZGU2MDRjM2ZjNDBjZTA3NzZmMWViMzUzZTQ4ODExZCIKICAgICAgfQogICAgXSwKICAgICJzZXJpYWxpemF0aW9uIjogewogICAgICAiaGFzaF90eXBlIjogInNoYTI1NiIsCiAgICAgICJhbGxvd19zeW1saW5rcyI6IGZhbHNlLAogICAgICAibWV0aG9kIjogImZpbGVzIiwKICAgICAgImlnbm9yZV9wYXRocyI6IFsKICAgICAgICAiLmdpdGlnbm9yZSIsCiAgICAgICAgIi5naXRhdHRyaWJ1dGVzIiwKICAgICAgICAiLmdpdGh1YiIsCiAgICAgICAgIi5naXQiCiAgICAgIF0KICAgIH0KICB9Cn0=","payloadType":"application/vnd.in-toto+json","signatures":[{"sig":"MGQCMFapGhY++LtZosIT7EtxG5wHSFuNA56Dx/vz9DmxzRxnVnHsU8bAmk2nGymc1oc/QwIwCFgbtbp6gfT7Op92jmEDLtU2XJH2WQrQ+Sq3ndIRkoUsRoh4gatHMEwFMHXADLOG","keyid":""}]}} \ No newline at end of file diff --git a/skills/converting-cutile-to-julia/skill-card.md b/skills/converting-cutile-to-julia/skill-card.md new file mode 100644 index 00000000..cd21cf65 --- /dev/null +++ b/skills/converting-cutile-to-julia/skill-card.md @@ -0,0 +1,41 @@ +## Description:
+Converts cuTile Python GPU kernels (@ct.kernel) to cuTile.jl Julia equivalents, handling kernel syntax translation, 0-indexed to 1-indexed conversion, broadcasting differences, memory layout (row-major to column-major), type system mapping, and launch API differences.
+ +This skill is ready for commercial/non-commercial use.
+ +## Owner: NVIDIA
+ +### License/Terms of Use:
+CC-BY-4.0 AND Apache-2.0
+## Use Case:
+Developers and engineers who need to port cuTile Python GPU kernels to Julia cuTile.jl equivalents, enabling Julia-native GPU kernel development without a Python bridge.
+ +### Deployment Geography for Use:
+Global
+ +## Known Risks and Mitigations:
+Risk: Review before execution as proposals could introduce incorrect or misleading guidance into skills.
+Mitigation: Review and scan skill before deployment.
+ +## Reference(s):
+- [API Mapping (Python to Julia)](references/api-mapping.md)
+- [Critical Rules](references/critical-rules.md)
+- [Debugging Guide](references/debugging.md)
+- [Testing & Verification Guide](references/testing.md)
+- [Conversion Workflow](translations/workflow.md)
+ + +## Skill Output:
+**Output Type(s):** [Code, Files, Shell commands]
+**Output Format:** [Julia source files (.jl) with inline documentation]
+**Output Parameters:** [1D]
+**Other Properties Related to Output:** [None]
+ +## Skill Version(s):
+v1.3.0 (source: git tag)
+ +## Ethical Considerations:
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal team to ensure this skill meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+ +(For Release on NVIDIA Platforms Only)
+Please report quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
diff --git a/skills/converting-cutile-to-julia/skill.oms.sig b/skills/converting-cutile-to-julia/skill.oms.sig new file mode 100644 index 00000000..4a1dca8b --- /dev/null +++ b/skills/converting-cutile-to-julia/skill.oms.sig @@ -0,0 +1 @@ +{"mediaType":"application/vnd.dev.sigstore.bundle.v0.3+json","verificationMaterial":{"x509CertificateChain":{"certificates":[{"rawBytes":"MIICgzCCAgmgAwIBAgIUKIyS7SxNteQIiWzK1dWj85E6520wCgYIKoZIzj0EAwMwVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwHhcNMjYwNDAxMDAwMDAwWhcNMjgwNDIyMTUzMzA5WjBUMQswCQYDVQQGEwJVUzEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMSgwJgYDVQQDDB9OVklESUEgQWdlbnQgU2tpbGxzIFNpZ25pbmcgMDAxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEYoRM9bQl/dGlwSRNi6bTpIJUXH8Nv9GciP6LSflJYYMLCc296kpyuTSsk5ddbAWiDcFX3C/ydX3jwc+qCLYP6uHy9XphyLjOQ27Yb2J6rBLVtRBS1mgGco/Gr7fL6ODco4GaMIGXMB0GA1UdDgQWBBRQ/5ZW3nJ6lmo9SVk7I15o7UGmpTAfBgNVHSMEGDAWgBRPGpILxMBBleJSsBGjrMKsby1CgjAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIHgDA3BggrBgEFBQcBAQQrMCkwJwYIKwYBBQUHMAGGG2h0dHA6Ly9vY3NwLm5kaXMubnZpZGlhLmNvbTAKBggqhkjOPQQDAwNoADBlAjAUygu/GiOCIXrgGr4SmLgeEVDcEitfFUv7ALbvLVGVyMysB3mxmO/uInZfXzWcJZsCMQDxuoxj4ZmO30jhkPIcCxGFCOvnUsnfU3TfGcouYm4M6iRpbKvtVnHPiy4bi6pcKf0="},{"rawBytes":"MIICiDCCAg6gAwIBAgIUZsIuSv9NkpJCNqtYEfCouVv5BzowCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASI72cR3ctKGg4VWnB3bNja6g1Z2PnOmFEopkPof+QeIcPk9rT+g9MjJnq51EQXL93a7C2GJ9J985G4o2V85VD7wJ1RaXhluHW2rf3y8bQGeAYaKMr5s/hUgn+M3/9WlWejgaAwgZ0wHQYDVR0OBBYEFE8akgvEwEGV4lKwEaOswqxvLUKCMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDcGCCsGAQUFBwEBBCswKTAnBggrBgEFBQcwAYYbaHR0cDovL29jc3AubmRpcy5udmlkaWEuY29tMAoGCCqGSM49BAMDA2gAMGUCMQCeIMMfAbyzPDacw2MxG+Yt1cikrJX/DVxiGfXuHmkkXn6VgSzE79+lkqDErpVO2gYCMCNEColOyvUvkzZGUEI1hQ3PfMgi3FIo9tHoBKMw4/wGBLFpu/0ubtmbBXM6/UMOEw=="},{"rawBytes":"MIICRTCCAcygAwIBAgIUeJdY3rV86EdvFmG7L8LJBsyQFYkwCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTB2MBAGByqGSM49AgEGBSuBBAAiA2IABAYpiXCDjJ9NT2eSDhyHJVSw1Tbze18cGG2F/578oWvHxg23eQAhNRYdq88i1iOshZSO6C29doKui5Xpmo/7Ctw9Sx4PP2RzOmIuOLCuTdNtKcTRwi4GEsd5BAFvWj42M6NjMGEwHQYDVR0OBBYEFItnoAjjfuCEUvzyvWyI2vOGvwPjMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cAMGQCMCwtAjWLaNwgGWNCgdyNoTyvNhqWRECRJV2r3+7w8g0PL6NHLOsbkgE09BH95h8XlgIwTaQmbbUh2ChAJ5TA1wRiVDnCcvbzHlZl2jM2FcwQQZlk19LOAbyGMRixbu2Ww/rj"}]},"tlogEntries":[]},"dsseEnvelope":{"payload":"ewogICJfdHlwZSI6ICJodHRwczovL2luLXRvdG8uaW8vU3RhdGVtZW50L3YxIiwKICAic3ViamVjdCI6IFsKICAgIHsKICAgICAgIm5hbWUiOiAiY29udmVydGluZy1jdXRpbGUtdG8tanVsaWEiLAogICAgICAiZGlnZXN0IjogewogICAgICAgICJzaGEyNTYiOiAiYmQxOTI3MTg0ZDVkNzkwMjRmNWQ4Nzg5ZThlNDA0NTk3Mzk4ZWE3NmFmZjg5NjllMjJkM2RhOThmYWE5ODY2OCIKICAgICAgfQogICAgfQogIF0sCiAgInByZWRpY2F0ZVR5cGUiOiAiaHR0cHM6Ly9tb2RlbF9zaWduaW5nL3NpZ25hdHVyZS92MS4wIiwKICAicHJlZGljYXRlIjogewogICAgInJlc291cmNlcyI6IFsKICAgICAgewogICAgICAgICJuYW1lIjogIlNLSUxMLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJlMTkzNDNiYmQ4M2FmZTA2YWQ2OGYyMmU3YmFkOWUwYTAyY2ZkNzE0ZDdiOTk1M2NiZmQ3ODA0OTY5ODYzYjBjIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAiZXhhbXBsZXMvMDFfYWRkL2N1dGlsZV9qdWxpYS5qbCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiZGY4YTU4YzU1MWI2MTQ3YzYwNWRlMGMzZjdhMzJkMDY4ZmQzZmI1YjE1NjNmNDE1MzQ1YjkzMmRkMWEwOTdmNSIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzLzAxX2FkZC9jdXRpbGVfcHl0aG9uLnB5IiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJmZGYwYWFlOTFjYmQzNTlmODM2MmRmNjc2ZjE4MDIwYjJiYWMyZjc0OTQ3N2Y5NjBhZTZjNmEyN2E2NWViYzIyIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAiZXhhbXBsZXMvMDJfbWF0bXVsL2N1dGlsZV9qdWxpYS5qbCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiMzA5ZWI4MzMzM2Y1ZDFkZmNkM2EzNGFmNjdjNmZhNmFlODdkMjg4Y2QwZTg5NTU5YjI4Yjg2MDlkNmYzN2I4NyIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzLzAyX21hdG11bC9jdXRpbGVfcHl0aG9uLnB5IiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI2YWI3MGRhNjM4MTljNzM5MmFkMjkxODVjOGI3NWMxNzQ2Y2NhNTgyNTMxOWYzOTY2MzMzNGYyYjUzYWJjMTcxIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAiZXhhbXBsZXMvMDNfc29mdG1heC9jdXRpbGVfanVsaWEuamwiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImVkZjcxYWZjZThlMDJkZWVlNGQyZDIxZDA1Yzk4OGI2MDFkNWQ2YTJiNDkyNzA4OTM0M2IzZWFjMGFjNmVlMjUiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy8wM19zb2Z0bWF4L2N1dGlsZV9weXRob24ucHkiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImM1ZjRkOTE5MmYwYWUyNGM0MGE5OWQ0ZTM0MmRhNjI2ODMwYTQ4YzYzYjhiMGFhZjg5NzhjZTJiZmE4ZDEwMzUiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJyZWZlcmVuY2VzL2FwaS1tYXBwaW5nLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI5ZGE4Yzg3M2NmODUyOWVhZDRiMzA5M2ZmYjE4NDc5MjVjY2FmOGVkMTBiZDg2NTUyNzgzNzE0MWQ3NjdmZmMzIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9jcml0aWNhbC1ydWxlcy5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiOTNmZDAzOWE4OGQ0M2M5ZjNhY2JhY2I0ZDYxMTI1OGIyMjE5ZDU3NjMwNTg1OTYzMjU4MmNlOWIzZmU1MTAwYyIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvZGVidWdnaW5nLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICIxNzU0Njg5MWViZTNjZjY4NzI2NTE1ZGNiY2EzMWExZWUwMjZjZTdkZGUyOGY0OThlOWY0M2QyMTE1ZjMwMTk0IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy90ZXN0aW5nLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI3ZGJlODczMmNjZjMzOTgwYTAzMDNjMTUyODk3MmYxYzJiYjVmY2E0ZTZjZGY2NTM1NzE5ODA5N2Q2Y2NlMzdiIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAic2NyaXB0cy92YWxpZGF0ZV9jdXRpbGVfamwucHkiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImMyZDEyMGIwOWVhZWZiNWM5MTE4N2U3MGNlNDYxNmZhYzcwOWEzZmNmODJiNDkyYmIxNDA0MmRhMWM3MWI5NTUiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJza2lsbC1jYXJkLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJhNDEzNjI5NmY3ZjMxOWQxMWZlZjlmZDdhM2UxOTE4YTZmNGY1MTFiOWFlZGFlN2M1NDQ2MTFhMmIxMTdkMWU4IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAidHJhbnNsYXRpb25zL3dvcmtmbG93Lm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJiZTRlYTJlOGZiMjRmNGJmZTNmMDczNDk4OGI1NzE4M2ViZWVmNmE1ZjkzYzY1NWZmMjQzMmFlM2YwODZkMTI2IgogICAgICB9CiAgICBdLAogICAgInNlcmlhbGl6YXRpb24iOiB7CiAgICAgICJtZXRob2QiOiAiZmlsZXMiLAogICAgICAiaGFzaF90eXBlIjogInNoYTI1NiIsCiAgICAgICJhbGxvd19zeW1saW5rcyI6IGZhbHNlLAogICAgICAiaWdub3JlX3BhdGhzIjogWwogICAgICAgICIuZ2l0YXR0cmlidXRlcyIsCiAgICAgICAgIi5naXRpZ25vcmUiLAogICAgICAgICIuZ2l0aHViIiwKICAgICAgICAiLmdpdCIKICAgICAgXQogICAgfQogIH0KfQ==","payloadType":"application/vnd.in-toto+json","signatures":[{"sig":"MGYCMQCPiMZh/U+ZhZbb0e6oJtEm6J+Ln4ZH8jsb0yfUbpNmCeDFuzE0f7zznhSSW99TEZcCMQCvlklH5ObRLxFW/GViyM/8qIzDEofPhlT9y2Ssgp2/cnlhKg125m7oT1atGfgFQQY=","keyid":""}]}} \ No newline at end of file diff --git a/skills/converting-cutile-to-triton/skill-card.md b/skills/converting-cutile-to-triton/skill-card.md new file mode 100644 index 00000000..ff04bd8c --- /dev/null +++ b/skills/converting-cutile-to-triton/skill-card.md @@ -0,0 +1,46 @@ +## Description:
+Converts cuTile GPU kernels (@ct.kernel) to Triton (@triton.jit), handling standard in-repo conversion, debugging, and mapping cuTile idioms to Triton equivalents.
+ +This skill is ready for commercial/non-commercial use.
+ +## Owner: NVIDIA
+ +### License/Terms of Use:
+CC-BY-4.0 AND Apache-2.0
+## Use Case:
+Developers and engineers converting cuTile GPU kernels to Triton for GPU kernel development, optimization, and debugging of existing Triton translations.
+ +### Deployment Geography for Use:
+Global
+ +## Known Risks and Mitigations:
+Risk: Review before execution as proposals could introduce incorrect or misleading guidance into skills.
+Mitigation: Review and scan skill before deployment.
+ +## Reference(s):
+- [API Mapping (cuTile to Triton)](references/api-mapping.md)
+- [Debugging Guide](references/debugging.md)
+- [Common Translation Gotchas](references/gotchas.md)
+- [Harness Integration](references/harness-integration.md)
+- [Optimization Strategy](references/optimization-strategy.md)
+- [Optimizing Reference](references/optimizing-reference.md)
+- [Performance Gotchas](references/performance-gotchas.md)
+- [Conversion Workflow](translations/workflow.md)
+- [Advanced Patterns](translations/advanced-patterns.md)
+- [File Structure](translations/file-structure.md)
+ + +## Skill Output:
+**Output Type(s):** [Code, Files, Shell commands]
+**Output Format:** [Python source files with inline Triton kernel code]
+**Output Parameters:** [1D]
+**Other Properties Related to Output:** [None]
+ +## Skill Version(s):
+1.0.0 (source: frontmatter)
+ +## Ethical Considerations:
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal team to ensure this skill meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+ +(For Release on NVIDIA Platforms Only)
+Please report quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
diff --git a/skills/converting-cutile-to-triton/skill.oms.sig b/skills/converting-cutile-to-triton/skill.oms.sig new file mode 100644 index 00000000..a91d949e --- /dev/null +++ b/skills/converting-cutile-to-triton/skill.oms.sig @@ -0,0 +1 @@ +{"mediaType":"application/vnd.dev.sigstore.bundle.v0.3+json","verificationMaterial":{"x509CertificateChain":{"certificates":[{"rawBytes":"MIICgzCCAgmgAwIBAgIUKIyS7SxNteQIiWzK1dWj85E6520wCgYIKoZIzj0EAwMwVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwHhcNMjYwNDAxMDAwMDAwWhcNMjgwNDIyMTUzMzA5WjBUMQswCQYDVQQGEwJVUzEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMSgwJgYDVQQDDB9OVklESUEgQWdlbnQgU2tpbGxzIFNpZ25pbmcgMDAxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEYoRM9bQl/dGlwSRNi6bTpIJUXH8Nv9GciP6LSflJYYMLCc296kpyuTSsk5ddbAWiDcFX3C/ydX3jwc+qCLYP6uHy9XphyLjOQ27Yb2J6rBLVtRBS1mgGco/Gr7fL6ODco4GaMIGXMB0GA1UdDgQWBBRQ/5ZW3nJ6lmo9SVk7I15o7UGmpTAfBgNVHSMEGDAWgBRPGpILxMBBleJSsBGjrMKsby1CgjAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIHgDA3BggrBgEFBQcBAQQrMCkwJwYIKwYBBQUHMAGGG2h0dHA6Ly9vY3NwLm5kaXMubnZpZGlhLmNvbTAKBggqhkjOPQQDAwNoADBlAjAUygu/GiOCIXrgGr4SmLgeEVDcEitfFUv7ALbvLVGVyMysB3mxmO/uInZfXzWcJZsCMQDxuoxj4ZmO30jhkPIcCxGFCOvnUsnfU3TfGcouYm4M6iRpbKvtVnHPiy4bi6pcKf0="},{"rawBytes":"MIICiDCCAg6gAwIBAgIUZsIuSv9NkpJCNqtYEfCouVv5BzowCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASI72cR3ctKGg4VWnB3bNja6g1Z2PnOmFEopkPof+QeIcPk9rT+g9MjJnq51EQXL93a7C2GJ9J985G4o2V85VD7wJ1RaXhluHW2rf3y8bQGeAYaKMr5s/hUgn+M3/9WlWejgaAwgZ0wHQYDVR0OBBYEFE8akgvEwEGV4lKwEaOswqxvLUKCMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDcGCCsGAQUFBwEBBCswKTAnBggrBgEFBQcwAYYbaHR0cDovL29jc3AubmRpcy5udmlkaWEuY29tMAoGCCqGSM49BAMDA2gAMGUCMQCeIMMfAbyzPDacw2MxG+Yt1cikrJX/DVxiGfXuHmkkXn6VgSzE79+lkqDErpVO2gYCMCNEColOyvUvkzZGUEI1hQ3PfMgi3FIo9tHoBKMw4/wGBLFpu/0ubtmbBXM6/UMOEw=="},{"rawBytes":"MIICRTCCAcygAwIBAgIUeJdY3rV86EdvFmG7L8LJBsyQFYkwCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTB2MBAGByqGSM49AgEGBSuBBAAiA2IABAYpiXCDjJ9NT2eSDhyHJVSw1Tbze18cGG2F/578oWvHxg23eQAhNRYdq88i1iOshZSO6C29doKui5Xpmo/7Ctw9Sx4PP2RzOmIuOLCuTdNtKcTRwi4GEsd5BAFvWj42M6NjMGEwHQYDVR0OBBYEFItnoAjjfuCEUvzyvWyI2vOGvwPjMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cAMGQCMCwtAjWLaNwgGWNCgdyNoTyvNhqWRECRJV2r3+7w8g0PL6NHLOsbkgE09BH95h8XlgIwTaQmbbUh2ChAJ5TA1wRiVDnCcvbzHlZl2jM2FcwQQZlk19LOAbyGMRixbu2Ww/rj"}]},"tlogEntries":[]},"dsseEnvelope":{"payload":"ewogICJfdHlwZSI6ICJodHRwczovL2luLXRvdG8uaW8vU3RhdGVtZW50L3YxIiwKICAic3ViamVjdCI6IFsKICAgIHsKICAgICAgIm5hbWUiOiAiY29udmVydGluZy1jdXRpbGUtdG8tdHJpdG9uIiwKICAgICAgImRpZ2VzdCI6IHsKICAgICAgICAic2hhMjU2IjogIjhhZTQ0Zjc1MWQ4YTE5OWE4NTUxMzBiYzgwOTc3MThjYTFmYmU0YTA5NzFhZWRjNGQ1YzVlYTI4YjI0NjI0YjciCiAgICAgIH0KICAgIH0KICBdLAogICJwcmVkaWNhdGVUeXBlIjogImh0dHBzOi8vbW9kZWxfc2lnbmluZy9zaWduYXR1cmUvdjEuMCIsCiAgInByZWRpY2F0ZSI6IHsKICAgICJyZXNvdXJjZXMiOiBbCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogImRmYjZmYmU3YzYzYjNlMTU0ZmRlYTVmZTRmZmUzOThmMWM4OTRiNjViYjYwYWRkNWUzZWUzYTIxODJiMWM4NTMiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAibmFtZSI6ICJTS0lMTC5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiNDIxMWNjYmEyNjJiNmFlNjNkYzI0NDEwMGI0MjhlOWMwYTFjYjE0YjAzZDVlYmMxOGE0OGQyYmY2YjUzMjY4ZiIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogImV4YW1wbGVzLzAxX3ZlY3Rvcl9hZGQvY3V0aWxlX2tlcm5lbC5weSIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiZWUxZjQ4NjYwNzFiMDViYmE3NTk2OWVmMjZiNjFiYmI3YjhiZjQ3NDE0YjZiMmQ0MGM1OTFjMmMxMmIwMTVlZiIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogImV4YW1wbGVzLzAxX3ZlY3Rvcl9hZGQvdHJpdG9uX2tlcm5lbC5weSIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiOTllOTJkOTE1NTZkZWI5YzhmZTIzMTRjMDdjNmE0ODNmNTEwYjAzOTVkYTg5N2Y1MGI0OTc4MWQ4YWU1MzBmYyIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogImV4YW1wbGVzLzAyX3NvZnRtYXgvY3V0aWxlX2tlcm5lbC5weSIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiM2VlYmZmMDJjMzY1NWM5NTgyZDFkODlhMmM1NDNlNDJmNzNkMjE1ZWY1MjI3NWFiOTQ1MjFiNTBkY2ExNjVjMCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogImV4YW1wbGVzLzAyX3NvZnRtYXgvdHJpdG9uX2tlcm5lbC5weSIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiNDVhZTMwNzcyZTBiZDBiNjUwNjRjNWNjYmIwMGE0YWQ3MzA3MjJiNDZjZDJhM2FlYmVmNTFhYjA5NjVmNjE5ZSIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogImV4YW1wbGVzLzAzX2xheWVybm9ybS9jdXRpbGVfa2VybmVsLnB5IgogICAgICB9LAogICAgICB7CiAgICAgICAgImRpZ2VzdCI6ICJhNDQyNzRmOTg4NTBhZTFmYmU2MzZlZmU3NjVlMDlmZTYxNmE4YTA3OTQ2YzMwOGY2MjQyYTI4NmQ0YTJjZjYzIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgIm5hbWUiOiAiZXhhbXBsZXMvMDNfbGF5ZXJub3JtL3RyaXRvbl9rZXJuZWwucHkiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogImU4MzIwYzI1OTk4N2U3NzdjMTc3MjI4N2Q1NzhiNTg4NzJkZDQzMWEzYzA2YjI3OWI3YWQzZWE4ZTJlM2U0ZWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy8wNF9tYXRtdWwvY3V0aWxlX2tlcm5lbC5weSIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiMGY2OGJmOWU2MjBjNjAyMjRjMDdhYmJlYWMyYzg0NmY5ZmY2MDZiM2M1NThkZTNlODRiZTE3ZDQ4MTQyNmY0YyIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogImV4YW1wbGVzLzA0X21hdG11bC90cml0b25fa2VybmVsLnB5IgogICAgICB9LAogICAgICB7CiAgICAgICAgImRpZ2VzdCI6ICJlMGQ2YTY4NDNkZThlMGVlMmFmYmM2MTI2MTk4MzA0MGZkZjU4MjlmODM5Nzg3MGIwZjFkNjE2ZGNmMjUxMWE3IiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgIm5hbWUiOiAiZXhhbXBsZXMvMDVfYXR0ZW50aW9uL2N1dGlsZV9rZXJuZWwucHkiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogImQyMjgwNjI0NmY4YTIwZWYwZTU5MTRlOGViYThlM2Y0ZDNkNzY2ZDI2M2YyNTY0MWY4M2EyMzcwYWIwYTI4YWYiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy8wNV9hdHRlbnRpb24vdHJpdG9uX2tlcm5lbC5weSIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiZDVhMDM5MGQ0ZmE5ZDkzMzA1YWVlNGZjMGFjNjMwYzk3ZWY3ZTJjMjg3ZjEzZGJiZGQwM2Y4MWMxMTE1ZTBiNSIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvYXBpLW1hcHBpbmcubWQiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogIjRhNTM1MzMxYmVhNmRhZDFiMTQxYzRkZTljYjhkZDdjYWQ5ZDFiYTI0ZjA5MWVjNzhhMzNjYmQ4ZDA5NjFhNGMiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAibmFtZSI6ICJyZWZlcmVuY2VzL2RlYnVnZ2luZy5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiN2RiMmUwMzUyNGE2NDkwZmFiZTcyZDExZWFlZDI0MWNkYjliNGZhNjE5Y2IzNDI5MDhjMjg4ODQyMWQ0NmUzOCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvZ290Y2hhcy5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiOWZhYmE0NDY4ZTQ5ODhhYzY4MzAzNTUzNWFhYzE3MGJkYzUwM2IwNmM0NjViNTkyNDI1YmNmMzY2NWE2OTMxMyIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvaGFybmVzcy1pbnRlZ3JhdGlvbi5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiZGQzYzlmMDBjNjhiODRjMDQwMDUzNDBlMzEyMjM0NjgyOTY2NzhiYTc5Njk5OWNhMGVmNTVjMDUzMzk3NDRiMiIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvb3B0aW1pemF0aW9uLXN0cmF0ZWd5Lm1kIgogICAgICB9LAogICAgICB7CiAgICAgICAgImRpZ2VzdCI6ICJjZWUyMDY4OGIzZjc2NTFiMTg4MTNhYTg3ZGViZGY2ZmE1ZTAwMjJjZTFjZDA3ZTYzOTRhMjFhNzBlOGQyMjZhIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9vcHRpbWl6aW5nLXJlZmVyZW5jZS5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiNDFmNjNlYTZjM2NhNTQyY2E4YWJlN2RlYjE3ZmM0ZTM3YmE3MzgxOGY5NTc2MDEzZTQ3ZWU1MzA3ZDVlZTg0MiIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvcGVyZm9ybWFuY2UtZ290Y2hhcy5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiMzRkODBmNGNhMDYxZjYyMzE2MDFlNjFmZjZjYTJkMDMwYzJlNzRjNjRkM2JmNTU2M2FkZWU0NjhiZmY1NmY1MyIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJuYW1lIjogInNraWxsLWNhcmQubWQiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogIjdmODljY2NlMmEwZWE1ZDY1NTc4MmEyYTVkMTAzNmU5OTMyYzgxZGFlM2MyOGVmNTFiM2IwMzFhYmM2N2U3ZjQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAibmFtZSI6ICJ0cmFuc2xhdGlvbnMvYWR2YW5jZWQtcGF0dGVybnMubWQiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogIjRmM2EyYjE3YWE2YTllMTM2NTczZDc3NjdiZGM5YTg1M2FhMGEzOTNmYTNhYzQyODQwMjdlNDU5ZTQ2ODFjY2QiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAibmFtZSI6ICJ0cmFuc2xhdGlvbnMvZmlsZS1zdHJ1Y3R1cmUubWQiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogImQ2MDkzMmU3NTJiOTM5YjdmYzQ2YzE1MzhiNjA4MTU4OTE3ODc3MWY1OTcyMzZhM2M1MGEwZjVhN2VhY2U4ZGYiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAibmFtZSI6ICJ0cmFuc2xhdGlvbnMvd29ya2Zsb3cubWQiCiAgICAgIH0KICAgIF0sCiAgICAic2VyaWFsaXphdGlvbiI6IHsKICAgICAgIm1ldGhvZCI6ICJmaWxlcyIsCiAgICAgICJpZ25vcmVfcGF0aHMiOiBbCiAgICAgICAgIi5naXRodWIiLAogICAgICAgICIuZ2l0IiwKICAgICAgICAiLmdpdGlnbm9yZSIsCiAgICAgICAgIi5naXRhdHRyaWJ1dGVzIgogICAgICBdLAogICAgICAiaGFzaF90eXBlIjogInNoYTI1NiIsCiAgICAgICJhbGxvd19zeW1saW5rcyI6IGZhbHNlCiAgICB9CiAgfQp9","payloadType":"application/vnd.in-toto+json","signatures":[{"sig":"MGUCMQD/CS22USYQLpMou7fAsez3RuUkgX/eMYMqP4DRApqVwo5J1yCsTahU6BIgyg6HMCUCMEv+kRzE3tvDKlxeRhOU79RP4BxO4/ylNixO2qvpRFvup6csWKB+wzugdGvg1h6JgA==","keyid":""}]}} \ No newline at end of file diff --git a/skills/cutile-autotuning/skill-card.md b/skills/cutile-autotuning/skill-card.md new file mode 100644 index 00000000..543d2f2f --- /dev/null +++ b/skills/cutile-autotuning/skill-card.md @@ -0,0 +1,43 @@ +## Description:
+Use when adding, modifying, optimizing, or debugging CuTile autotuning code. Trigger signals: `exhaustive_search` / `replace_hints` / `hints_fn` / `cuda.tile.tune` in code, `autotune` in filenames, or correctness/performance issues in autotuned CuTile kernels. Covers: tune-once/cache/launch pattern, per-architecture configs (sm80–sm120), parameter space design (tile sizes, occupancy, num_ctas), and 7 common pitfalls with solutions.
+ +This skill is ready for commercial/non-commercial use.
+ +## Owner: NVIDIA
+ +### License/Terms of Use:
+CC-BY-4.0 AND Apache-2.0
+## Use Case:
+Developers and engineers working with CuTile GPU kernels use this skill to add, optimize, or debug autotuning configurations for CUDA Tile kernels across NVIDIA GPU architectures (sm80–sm120).
+ +### Deployment Geography for Use:
+Global
+ +## Known Risks and Mitigations:
+Risk: Review before execution as proposals could introduce incorrect or misleading guidance into skills.
+Mitigation: Review and scan skill before deployment.
+ +## Reference(s):
+- [exhaustive_search API Reference](references/api-reference.md)
+- [Hardware Constraints](references/hardware-constraints.md)
+- [Kernel Type Templates](references/kernel-type-templates.md)
+- [Parameter Space Design](references/parameter-space-design.md)
+- [Pitfalls](references/pitfalls.md)
+- [Search Strategies](references/search-strategies.md)
+- [Workflow](references/workflow.md)
+ + +## Skill Output:
+**Output Type(s):** [Code, Configuration instructions]
+**Output Format:** [Markdown with inline Python code blocks]
+**Output Parameters:** [1D]
+**Other Properties Related to Output:** [None]
+ +## Skill Version(s):
+v1.3.0 (source: git tag)
+ +## Ethical Considerations:
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal team to ensure this skill meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+ +(For Release on NVIDIA Platforms Only)
+Please report quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
diff --git a/skills/cutile-autotuning/skill.oms.sig b/skills/cutile-autotuning/skill.oms.sig new file mode 100644 index 00000000..d633630c --- /dev/null +++ b/skills/cutile-autotuning/skill.oms.sig @@ -0,0 +1 @@ +{"mediaType":"application/vnd.dev.sigstore.bundle.v0.3+json","verificationMaterial":{"x509CertificateChain":{"certificates":[{"rawBytes":"MIICgzCCAgmgAwIBAgIUKIyS7SxNteQIiWzK1dWj85E6520wCgYIKoZIzj0EAwMwVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwHhcNMjYwNDAxMDAwMDAwWhcNMjgwNDIyMTUzMzA5WjBUMQswCQYDVQQGEwJVUzEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMSgwJgYDVQQDDB9OVklESUEgQWdlbnQgU2tpbGxzIFNpZ25pbmcgMDAxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEYoRM9bQl/dGlwSRNi6bTpIJUXH8Nv9GciP6LSflJYYMLCc296kpyuTSsk5ddbAWiDcFX3C/ydX3jwc+qCLYP6uHy9XphyLjOQ27Yb2J6rBLVtRBS1mgGco/Gr7fL6ODco4GaMIGXMB0GA1UdDgQWBBRQ/5ZW3nJ6lmo9SVk7I15o7UGmpTAfBgNVHSMEGDAWgBRPGpILxMBBleJSsBGjrMKsby1CgjAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIHgDA3BggrBgEFBQcBAQQrMCkwJwYIKwYBBQUHMAGGG2h0dHA6Ly9vY3NwLm5kaXMubnZpZGlhLmNvbTAKBggqhkjOPQQDAwNoADBlAjAUygu/GiOCIXrgGr4SmLgeEVDcEitfFUv7ALbvLVGVyMysB3mxmO/uInZfXzWcJZsCMQDxuoxj4ZmO30jhkPIcCxGFCOvnUsnfU3TfGcouYm4M6iRpbKvtVnHPiy4bi6pcKf0="},{"rawBytes":"MIICiDCCAg6gAwIBAgIUZsIuSv9NkpJCNqtYEfCouVv5BzowCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASI72cR3ctKGg4VWnB3bNja6g1Z2PnOmFEopkPof+QeIcPk9rT+g9MjJnq51EQXL93a7C2GJ9J985G4o2V85VD7wJ1RaXhluHW2rf3y8bQGeAYaKMr5s/hUgn+M3/9WlWejgaAwgZ0wHQYDVR0OBBYEFE8akgvEwEGV4lKwEaOswqxvLUKCMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDcGCCsGAQUFBwEBBCswKTAnBggrBgEFBQcwAYYbaHR0cDovL29jc3AubmRpcy5udmlkaWEuY29tMAoGCCqGSM49BAMDA2gAMGUCMQCeIMMfAbyzPDacw2MxG+Yt1cikrJX/DVxiGfXuHmkkXn6VgSzE79+lkqDErpVO2gYCMCNEColOyvUvkzZGUEI1hQ3PfMgi3FIo9tHoBKMw4/wGBLFpu/0ubtmbBXM6/UMOEw=="},{"rawBytes":"MIICRTCCAcygAwIBAgIUeJdY3rV86EdvFmG7L8LJBsyQFYkwCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTB2MBAGByqGSM49AgEGBSuBBAAiA2IABAYpiXCDjJ9NT2eSDhyHJVSw1Tbze18cGG2F/578oWvHxg23eQAhNRYdq88i1iOshZSO6C29doKui5Xpmo/7Ctw9Sx4PP2RzOmIuOLCuTdNtKcTRwi4GEsd5BAFvWj42M6NjMGEwHQYDVR0OBBYEFItnoAjjfuCEUvzyvWyI2vOGvwPjMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cAMGQCMCwtAjWLaNwgGWNCgdyNoTyvNhqWRECRJV2r3+7w8g0PL6NHLOsbkgE09BH95h8XlgIwTaQmbbUh2ChAJ5TA1wRiVDnCcvbzHlZl2jM2FcwQQZlk19LOAbyGMRixbu2Ww/rj"}]},"tlogEntries":[]},"dsseEnvelope":{"payload":"ewogICJfdHlwZSI6ICJodHRwczovL2luLXRvdG8uaW8vU3RhdGVtZW50L3YxIiwKICAic3ViamVjdCI6IFsKICAgIHsKICAgICAgIm5hbWUiOiAiY3V0aWxlLWF1dG90dW5pbmciLAogICAgICAiZGlnZXN0IjogewogICAgICAgICJzaGEyNTYiOiAiYzUwYjljZjUxMTVjMWZhMzk4NDhmYjlhNTNjZjZkZTYzNTNhODBmYTQ0MDg0MDNlZTQ0ZWE5ZTg3OTBlOTc4OCIKICAgICAgfQogICAgfQogIF0sCiAgInByZWRpY2F0ZVR5cGUiOiAiaHR0cHM6Ly9tb2RlbF9zaWduaW5nL3NpZ25hdHVyZS92MS4wIiwKICAicHJlZGljYXRlIjogewogICAgInNlcmlhbGl6YXRpb24iOiB7CiAgICAgICJtZXRob2QiOiAiZmlsZXMiLAogICAgICAiYWxsb3dfc3ltbGlua3MiOiBmYWxzZSwKICAgICAgImhhc2hfdHlwZSI6ICJzaGEyNTYiLAogICAgICAiaWdub3JlX3BhdGhzIjogWwogICAgICAgICIuZ2l0IiwKICAgICAgICAiLmdpdGh1YiIsCiAgICAgICAgIi5naXRhdHRyaWJ1dGVzIiwKICAgICAgICAiLmdpdGlnbm9yZSIKICAgICAgXQogICAgfSwKICAgICJyZXNvdXJjZXMiOiBbCiAgICAgIHsKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICIyMDQ5ZmFjYWI2MTIyYTNmNjcxNTc1M2NjMDhhY2QxZTY1ZjBmOWQ4NGQ3NmZkN2QwMTZmNjQ3NTMzZTcxZGRjIiwKICAgICAgICAibmFtZSI6ICJTS0lMTC5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogIjQ5ZmM4YWI3N2Q3Nzk3YmIyY2ZhY2I0NmQyMTViMWJhMmFjZjViM2ZmOGI3NmFjNGJjOGY5NjNjMjdhNGExZTgiLAogICAgICAgICJuYW1lIjogImFzc2V0cy9leGFtcGxlcy8wMV9ybXNub3JtX29jY3VwYW5jeV9vbmx5L2F1dG90dW5lZF9sYXVuY2gucHkiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJlMDU3ODU0YjdiZGQzMmYxZTYyNjkxNTM5OGU0MGNmMGNhZDAwMDJhNzc4ZGQ3OTBlZjQ0MzJkYzI4MzZjOWNhIiwKICAgICAgICAibmFtZSI6ICJhc3NldHMvZXhhbXBsZXMvMDFfcm1zbm9ybV9vY2N1cGFuY3lfb25seS9maXhlZF9sYXVuY2gucHkiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICIxMTA5MjcwZTBhYjQyYjIwODI5ZmNmYTcxYjQxZGI5NzljOWMwMjBmZjBkYzdmMjc1ZWYzZmI0Zjg1NWQxNmI0IiwKICAgICAgICAibmFtZSI6ICJhc3NldHMvZXhhbXBsZXMvMDJfbWF0bXVsX2Z1bGxfc2VhcmNoL2F1dG90dW5lZF9sYXVuY2gucHkiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICIwYjE1NzNlNzY0YWIzYjJmZjUyZWMxMmYzODk4ZDBkYjZhODEyOWI2NDVmYmJlZTk0OGU2NTFjZTM2NDNlNjYzIiwKICAgICAgICAibmFtZSI6ICJhc3NldHMvZXhhbXBsZXMvMDJfbWF0bXVsX2Z1bGxfc2VhcmNoL2ZpeGVkX2xhdW5jaC5weSIKICAgICAgfSwKICAgICAgewogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImI5ZjQzODA2YzIxMzU4MDBiNjA3OWNmN2RkOWM0ZmQ0ZGNhMDg4ZjQ0ZGM2MTkxMDk4ZjA5NTI2MGM5ZjNkOWEiLAogICAgICAgICJuYW1lIjogImFzc2V0cy9leGFtcGxlcy8wM19yb3BlX2lucGxhY2Vfc3BsaXRidWZmZXIvYXV0b3R1bmVkX2xhdW5jaC5weSIKICAgICAgfSwKICAgICAgewogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImNlYWY2NjkyMDEzMmE1MzQ1MTZiNTVjYjBjOWJmOTAyMzI1ZTBmYmMzYzBhMDk3ZTc1ODM1MjVkMTU0MWU0YWMiLAogICAgICAgICJuYW1lIjogImFzc2V0cy9leGFtcGxlcy8wM19yb3BlX2lucGxhY2Vfc3BsaXRidWZmZXIvZml4ZWRfbGF1bmNoLnB5IgogICAgICB9LAogICAgICB7CiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiODNlYmI2NmRlYTYwYzNjYjFlZDMzZjg2YmEzMmFhNmY2YTE0MzdjOWZiODFhMGJiMGNmMTljYjYxZTM0N2FiMiIsCiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9hcGktcmVmZXJlbmNlLm1kIgogICAgICB9LAogICAgICB7CiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiNGUwMzA4MWVhYzQwNTM5YzFkNmE0NTEzN2Y4ZGE0Zjk4ODAwNmVkYTM5YTc3ZWQyN2Q4ZDJkODNhMjkxZjdhNyIsCiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9oYXJkd2FyZS1jb25zdHJhaW50cy5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImVkMGZjN2RiYWQ1YWIwZjRlMGFhODBlN2U4YjU4MWQ2MzcwZDEzNTAzOTM3OTEwOWYyZTU0OTBlNjg3NTgzZjIiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMva2VybmVsLXR5cGUtdGVtcGxhdGVzLm1kIgogICAgICB9LAogICAgICB7CiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiYTNhMjg0ZDE0YWUyMTc1YzlkZmM5MTk1ZTJjYjMyZmI0ZGJjZjRkYTRhNzdlMmE4MWEwM2ZlNWIyM2I0MTJjZCIsCiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9wYXJhbWV0ZXItc3BhY2UtZGVzaWduLm1kIgogICAgICB9LAogICAgICB7CiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiNmVhZWI1ZjU0NWYyZDg3Y2ZiZDdlZTQ3MDFmZmYzZTA5MWEzMWIwYWU3ZTZlZWJiZDY0MTY4ODRiODEyMjBkZiIsCiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9waXRmYWxscy5tZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImZlNDdlMzlkZTNjNzdhNDJjMTJmOTMyM2ZkYzUyODMyNDE2MGRlYjU1N2U3NzQwYTA2NzlhMmYxYzEyODg1YWUiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvc2VhcmNoLXN0cmF0ZWdpZXMubWQiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICIzNzQxZWM2ZDU0OWYyYTFlNDc3OGJjOTc2ZmNmNGQ2ZTUwNmU3Y2NlNzJmN2FiMzE5YTA4ODk0YjQzYzBkMzAyIiwKICAgICAgICAibmFtZSI6ICJyZWZlcmVuY2VzL3dvcmtmbG93Lm1kIgogICAgICB9LAogICAgICB7CiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiMTA3MjMyOWZiYzM5MmYwZDQ4OTIyMGY4NTg2YTk0NzFmM2I1Y2U1ZGFlZTM2OTk1MDQ0NmJjYjYzOTY3M2E3NCIsCiAgICAgICAgIm5hbWUiOiAic2tpbGwtY2FyZC5tZCIKICAgICAgfQogICAgXQogIH0KfQ==","payloadType":"application/vnd.in-toto+json","signatures":[{"sig":"MGUCMCpTXbks9+aAzD4S9o/bRshNtRKh2Ga4LPNQdOjp5lixHP5LTQXCDxxfk7YiDr6A0gIxAMJQenj3ABJeXxiZM32r4LlK6OQQslU9OqI3nYjX13jPS7EGlivzEfCAcUv3aVJ1QA==","keyid":""}]}} \ No newline at end of file diff --git a/skills/cutile-python/skill-card.md b/skills/cutile-python/skill-card.md new file mode 100644 index 00000000..80b4f17b --- /dev/null +++ b/skills/cutile-python/skill-card.md @@ -0,0 +1,43 @@ +## Description:
+Expert cuTile programming assistant that writes high-performance GPU kernels using cuTile's tile-based programming model with proper validation, optimization, and deep agent orchestration for complex multi-kernel tasks.
+ +This skill is ready for commercial/non-commercial use.
+ +## Owner: NVIDIA
+ +### License/Terms of Use:
+MIT
+## Use Case:
+Developers and engineers use this skill to write, debug, and optimize high-performance GPU kernels using cuTile's tile-based programming model, including complex multi-kernel tasks requiring deep agent orchestration.
+ +### Deployment Geography for Use:
+Global
+ +## Known Risks and Mitigations:
+Risk: Review before execution as proposals could introduce incorrect or misleading guidance into skills.
+Mitigation: Review and scan skill before deployment.
+ +## Reference(s):
+- [cuTile Language Specification](https://docs.nvidia.com/cuda/cutile-python)
+- [Implementation Lessons](guidelines/01_implementation_lessons.md)
+- [Code Generation Rules](guidelines/02_code_generation_rules.md)
+- [Core Concepts](guidelines/03_concepts.md)
+- [Orchestration Workflow](orchestration/workflow.md)
+- [Orchestration Overview](orchestration/overview.md)
+- [TileGym and Examples Guide](examples/tilegym_and_examples_guide.md)
+ + +## Skill Output:
+**Output Type(s):** [Code]
+**Output Format:** [Python source code with inline validation]
+**Output Parameters:** [1D]
+**Other Properties Related to Output:** [None]
+ +## Skill Version(s):
+1.3.0 (source: frontmatter, git tag)
+ +## Ethical Considerations:
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal team to ensure this skill meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+ +(For Release on NVIDIA Platforms Only)
+Please report quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
diff --git a/skills/cutile-python/skill.oms.sig b/skills/cutile-python/skill.oms.sig new file mode 100644 index 00000000..83463e7b --- /dev/null +++ b/skills/cutile-python/skill.oms.sig @@ -0,0 +1 @@ +{"mediaType":"application/vnd.dev.sigstore.bundle.v0.3+json","verificationMaterial":{"x509CertificateChain":{"certificates":[{"rawBytes":"MIICgzCCAgmgAwIBAgIUKIyS7SxNteQIiWzK1dWj85E6520wCgYIKoZIzj0EAwMwVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwHhcNMjYwNDAxMDAwMDAwWhcNMjgwNDIyMTUzMzA5WjBUMQswCQYDVQQGEwJVUzEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMSgwJgYDVQQDDB9OVklESUEgQWdlbnQgU2tpbGxzIFNpZ25pbmcgMDAxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEYoRM9bQl/dGlwSRNi6bTpIJUXH8Nv9GciP6LSflJYYMLCc296kpyuTSsk5ddbAWiDcFX3C/ydX3jwc+qCLYP6uHy9XphyLjOQ27Yb2J6rBLVtRBS1mgGco/Gr7fL6ODco4GaMIGXMB0GA1UdDgQWBBRQ/5ZW3nJ6lmo9SVk7I15o7UGmpTAfBgNVHSMEGDAWgBRPGpILxMBBleJSsBGjrMKsby1CgjAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIHgDA3BggrBgEFBQcBAQQrMCkwJwYIKwYBBQUHMAGGG2h0dHA6Ly9vY3NwLm5kaXMubnZpZGlhLmNvbTAKBggqhkjOPQQDAwNoADBlAjAUygu/GiOCIXrgGr4SmLgeEVDcEitfFUv7ALbvLVGVyMysB3mxmO/uInZfXzWcJZsCMQDxuoxj4ZmO30jhkPIcCxGFCOvnUsnfU3TfGcouYm4M6iRpbKvtVnHPiy4bi6pcKf0="},{"rawBytes":"MIICiDCCAg6gAwIBAgIUZsIuSv9NkpJCNqtYEfCouVv5BzowCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASI72cR3ctKGg4VWnB3bNja6g1Z2PnOmFEopkPof+QeIcPk9rT+g9MjJnq51EQXL93a7C2GJ9J985G4o2V85VD7wJ1RaXhluHW2rf3y8bQGeAYaKMr5s/hUgn+M3/9WlWejgaAwgZ0wHQYDVR0OBBYEFE8akgvEwEGV4lKwEaOswqxvLUKCMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDcGCCsGAQUFBwEBBCswKTAnBggrBgEFBQcwAYYbaHR0cDovL29jc3AubmRpcy5udmlkaWEuY29tMAoGCCqGSM49BAMDA2gAMGUCMQCeIMMfAbyzPDacw2MxG+Yt1cikrJX/DVxiGfXuHmkkXn6VgSzE79+lkqDErpVO2gYCMCNEColOyvUvkzZGUEI1hQ3PfMgi3FIo9tHoBKMw4/wGBLFpu/0ubtmbBXM6/UMOEw=="},{"rawBytes":"MIICRTCCAcygAwIBAgIUeJdY3rV86EdvFmG7L8LJBsyQFYkwCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTB2MBAGByqGSM49AgEGBSuBBAAiA2IABAYpiXCDjJ9NT2eSDhyHJVSw1Tbze18cGG2F/578oWvHxg23eQAhNRYdq88i1iOshZSO6C29doKui5Xpmo/7Ctw9Sx4PP2RzOmIuOLCuTdNtKcTRwi4GEsd5BAFvWj42M6NjMGEwHQYDVR0OBBYEFItnoAjjfuCEUvzyvWyI2vOGvwPjMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cAMGQCMCwtAjWLaNwgGWNCgdyNoTyvNhqWRECRJV2r3+7w8g0PL6NHLOsbkgE09BH95h8XlgIwTaQmbbUh2ChAJ5TA1wRiVDnCcvbzHlZl2jM2FcwQQZlk19LOAbyGMRixbu2Ww/rj"}]},"tlogEntries":[]},"dsseEnvelope":{"payload":"ewogICJfdHlwZSI6ICJodHRwczovL2luLXRvdG8uaW8vU3RhdGVtZW50L3YxIiwKICAic3ViamVjdCI6IFsKICAgIHsKICAgICAgIm5hbWUiOiAiY3V0aWxlLXB5dGhvbiIsCiAgICAgICJkaWdlc3QiOiB7CiAgICAgICAgInNoYTI1NiI6ICJhOTRjYTFhMTEyYWRlNmRlZDBlNTEyNWM2MTE2YTViODdiNDI1MTNhY2IwYmNiYTY2M2Q1ZDE4NGUwMGJiZjgxIgogICAgICB9CiAgICB9CiAgXSwKICAicHJlZGljYXRlVHlwZSI6ICJodHRwczovL21vZGVsX3NpZ25pbmcvc2lnbmF0dXJlL3YxLjAiLAogICJwcmVkaWNhdGUiOiB7CiAgICAicmVzb3VyY2VzIjogWwogICAgICB7CiAgICAgICAgIm5hbWUiOiAiU0tJTEwubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogIjk3ZWM3ZWE2MmE0ODUyMTNiOWRmNTc4Zjc3M2ZlN2Y2MjQ4ZWU1NWYzMDI4NTFkZGY3NGIyM2ZjZDkxNDcyODciCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy9jb252b2x1dGlvbi9SRUFETUUubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImFjOWExZjBmMzc1NDI4YjY4Y2ExOGVhOTI5YzIwNGE0ZjU5NjVhM2Y1ZmRkODlhNGM5ODI0NzE1OTY3NWNmYjgiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy9jb252b2x1dGlvbi9jb252MmRfd2l0aF9iaWFzX2RpbGF0aW9uX2dyb3Vwcy5weSIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiZDEwM2M1ZDVmMWI1MzIxZDI0ZjcxMmNlZWI0YzcwN2E3NDAyYzc5ZTAzM2EyYThlNmU3NmYzMjgzM2FiZDFlYyIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzL2NvbnZvbHV0aW9uL2NvbnYzZF93aXRoX2JpYXNfZGlsYXRpb25fZ3JvdXBzLnB5IiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI4N2I2YjM1YTY5MWRhOTM3NDQ0ZjIzYzIwYmE1Y2VhYjdhOTIwYTM1ZmMzYmMxNDQwOTQ2NTllYzNiMzU4ZTY3IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAiZXhhbXBsZXMvY29udm9sdXRpb24vY29udl90cmFuc3Bvc2VfMmQucHkiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImEzMGE4MzFhMGM2MzdhODJmYWRjNjg3ODI0MTA2ZTYxYzMxYmYzMTc3NGZjNDc0NDQ2MjE4ZDAxZjJiMmQzN2QiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy9jb252b2x1dGlvbi9jb252X3RyYW5zcG9zZV8zZC5weSIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiMTQzZTY5NTNhYTZhZmQ3Y2ZjNzViNWNkMGIxNDdmOWFiNzQ2NzI5N2RjMzAwODdhMzZlNjU0NmVhODMzZjlkMiIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzL21hdG11bC9SRUFETUUubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImIwNTliOWNmNTI5NTA0MmUyNGY2YjcwMTFiZDk5ZWQ2ODIxMTRmNTdkMjY3MWU4ZjJkYzQ3NGZjY2ZjMzZiMjQiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy9tYXRtdWwvbWF0bXVsXzRkX3RlbnNvcnMucHkiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogIjcwNTVhZjcxZDFiYmRlMzdhMGVkZjhhNDg3OTgyNDc5ZmQ4MmJhZmM5YmNhMjcyMTFlYmU0NGU0MWRmMGFlNDYiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy9tYXRtdWwvbWF0cml4X3ZlY3Rvcl9tdWx0aXBsaWNhdGlvbi5weSIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiNzliNzIzYTc0Zjc0MGU0NDgzOTBmOGIwMTJiZmVmM2I3NWJlYWQzNDAwMDUwN2ZkYmVkNjA5MGU1ZTFmMjIwZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzL21hdG11bC9zcGxpdF9rX2dlbW0ucHkiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogIjU3MjNmOGIzYmY1NDZiM2M0MGE3OTM4NDMxYzYyYTY2MTk4MzllZDJmMWMwYjM0ZGEyMjU1YWJkYWEwOGVjYzUiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy9ub3JtYWxpemF0aW9uL1JFQURNRS5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiNDlhODJkMGQwMTM5ZWYzYTk0YTJmOGRiZjU0MmQ0ZDQzZjUzZTRhOGQ1NDRiMmUwZWJiYzZlZGU2MzQzYjRiMCIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzL25vcm1hbGl6YXRpb24vZ3JvdXBfbm9ybS5weSIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiYTk1MjhmMDc2MTU4NWU0MThkZWVhODdiYzA0YmFiMzcxNTQ2YjU4OTk2YmViNDc4ZDY5YmQ3OWVjYjEwMzRiYSIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzL3Bvb2xpbmcvUkVBRE1FLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI0MGZkYzgwMmIyZmFkYjg1ZTBhZjNhZTc2OWRlMTAyOTdiOTEwMTc4ZjRjMDVlYWU1YjI5MDg3Mjc4MWJlNjhiIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAiZXhhbXBsZXMvcG9vbGluZy9hdmdwb29sM2QucHkiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogIjgwYmNlY2I2MTJmODllOWM5MTAxZWNjOTcxOWUyMGMxMDVhZTc2NmYyZTcyZDhkMmU2ZTdiNzZhYmQ0Yjk4MjUiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJleGFtcGxlcy9wb29saW5nL21heHBvb2wzZC5weSIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiZmUwMmQ3MGJkYzhmYjFiZWNiYzlkOTcxNDJhMTZiNTM5MGY1ZTI5YzY4ZDZlNmQzODU0Y2UwZTgzZjVjOTY0ZCIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzL3NjYW4vUkVBRE1FLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJlOGFiMDk2Mjk2OTM0OTZmYjNmNGEyZjg2NDJmZmRmYzdmYTRkOTA5Njc2MWNhMTNlZTczMzIyMjBhMjEwNDg4IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAiZXhhbXBsZXMvc2Nhbi9jdW1zdW1fY3VtcHJvZF9ibG9ja2luZy5weSIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiN2VmYTJhZjUwZDVkNDJjMmI0NjJlYjVhN2MyMzI1YTM5NjA0OTljM2RkMTM4ODcwNjNlNzMzYTFiOWQ4OGYxYSIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogImV4YW1wbGVzL3RpbGVneW1fYW5kX2V4YW1wbGVzX2d1aWRlLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICIxZDE0NjI1OTI4MDM0YzU4ZWEzYWUxODRlN2ZkMjdlMzUwZGQ2MTNmYTg4MjM0ZjU4ZGViMDU3MjQ4OWY2MmM3IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAiZ3VpZGVsaW5lcy8wMV9pbXBsZW1lbnRhdGlvbl9sZXNzb25zLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI0MmIwZjMyY2E3OTA2YjM2ODFlYjE5N2FiNzc1ZmJkNWQ5NzE3M2E1NGIyMmFlOTBiMWQ1OTM5NDcxNzA3YjIxIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAiZ3VpZGVsaW5lcy8wMl9jb2RlX2dlbmVyYXRpb25fcnVsZXMubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImQzODMwYzg4ZTNhZWUyN2ZlZjFjYWViOGVmNWU1MWI0NjcxOTdjMDkwNmJiNzEzYjMzYzJkYjcyMjNhZDhiNDYiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJndWlkZWxpbmVzLzAzX2NvbmNlcHRzLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJkNDRkY2UxMmQwMTE5Zjk2MzAwMGExNzMzNzk5ZDBiYWM5YzFkNDQ3ZmYwZTA0MzZjYzU5OWY3Mzg2YjhlZmZjIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAib3JjaGVzdHJhdGlvbi9hbmFseXplcl9hZ2VudC5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiMTEyMWUzZjhhZDU2OGY4MjUyMzczNWVhYzg3NWMyNmNhMGNhNjYxNjFhN2VmZThhYjM5OGE4MDY1YTg5OTA2NSIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogIm9yY2hlc3RyYXRpb24vY29tcG9zZXJfYWdlbnQubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImU5ZWVkOWExYjRlNTg1N2JlOGNiNmQ1N2Y5OWYzZTgyYjM1YTQzMWM4ZDNkMzEwNjBlYzM1ODUxZTVlODg3YjUiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJvcmNoZXN0cmF0aW9uL2tlcm5lbF9hZ2VudC5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiNDA5OTg3ZmI3YTFlM2Q0Nzk0ZmY4OGQzMTk4YzA0YzFlNWYzMjZmNjc0MmJkNzc0ZGY0M2U0NDIyYTllZDcwNCIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogIm9yY2hlc3RyYXRpb24vb3ZlcnZpZXcubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogIjEzYmRmYjEzMDJkOWQ5Yjk5ZjM4NzIxNzEwYWU1YmI0ZjA0NGNkZWMzMzhlMzI0NGQzYmFmMzdlOGZjZGJiYzUiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJvcmNoZXN0cmF0aW9uL3dvcmtmbG93Lm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJmZGEwMDRiZmI2ZjQ4MjliZjJmMjRkMzNlOGRjYWI3MmVlZTNhNTIzZDBlMDQ5ZTJhYTA2ZjZhZGY2NWQyOTNiIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAic2tpbGwtY2FyZC5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiYWI0ZmUyNTYxYTM3MzU1NTg2ZjA2ZmMyZGFmNDhmNWNiNWE5YjNhNzY0M2E3ZDg0OWQ4NmI1NGZlOTNmNzFiOCIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogInRvcmNoLWxlYXJuZXIvZXhhbXBsZXMvbHN0bV90cmFjZS5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiNzU3NzQyMmNlNTRiYzBiMjhmZTVmOTYwNDcwNDUzMjlkMTc5YzI1MDE5NmIzNDQwYTYzNzA4Y2ZiYWRlODhmZSIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogInRvcmNoLWxlYXJuZXIvcmVmZXJlbmNlcy8xX3B5dG9yY2hfY29kZWJhc2VfbWFwLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI2ZjMyMTI0ZjYwM2M5OWFjZDM4NWU0MmViMDY5MDExODZmNjgxOGJkYTIxMjg1MzJmMzBjZWNkYmFhZDgwY2Y1IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAidG9yY2gtbGVhcm5lci9yZWZlcmVuY2VzLzJfZGlzcGF0Y2hfbWVjaGFuaXNtLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJkNWY1NWVhZDRlZjk0ZmI5ZTEzN2NlNjI0ZTJmZjkxMDIzMzkyMWZjZDY1NDQ0M2M4MjBhMGY4NjAzZTZkNjdkIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAidG9yY2gtbGVhcm5lci9yZWZlcmVuY2VzLzNfdHJhY2luZ19zdHJhdGVnaWVzLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJkODg0NmJiYmUwMTQxNjExNDg3MDhmMmI3ZGU1OWZiOTVkMWQ3MTU0ZGU0MjE0NmMxMDdlMTIxZTNjZjE2NDIzIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAidG9yY2gtbGVhcm5lci9yZWZlcmVuY2VzLzRfbGFuZ3VhZ2VfbGF5ZXJzLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICJkN2U5YTBiMWFhZWM2MDU1OTQwYWQ5YjcyMmZlMWI1YzdlNmNhNDQ0YmFhNTAzNjlkYTY3YzQ4YjM1MGM1MzE5IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAidG9yY2gtbGVhcm5lci9yZWZlcmVuY2VzLzVfd2VsbF9rbm93bl9vcHMubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImRjMDhkMmIzYTI5NjQ3MmE0Yzg1YTg0NTJkOTA1MGU1Njc0ZWY3NDU3ODA4ODhkMDgwZmRjNWJlNzg1MDY1NzAiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJ0b3JjaC1sZWFybmVyL3RyYWNpbmdfd29ya2Zsb3cubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImNlYWNjYTllZjA5YjI4MjFkZmUyNzBjZDEwZDBjYTdhOWE3ODI4NWM2ZmM2NjkwNTczZTRjMjdmMTQ5YmFmZGMiCiAgICAgIH0KICAgIF0sCiAgICAic2VyaWFsaXphdGlvbiI6IHsKICAgICAgIm1ldGhvZCI6ICJmaWxlcyIsCiAgICAgICJhbGxvd19zeW1saW5rcyI6IGZhbHNlLAogICAgICAiaGFzaF90eXBlIjogInNoYTI1NiIsCiAgICAgICJpZ25vcmVfcGF0aHMiOiBbCiAgICAgICAgIi5naXRpZ25vcmUiLAogICAgICAgICIuZ2l0YXR0cmlidXRlcyIsCiAgICAgICAgIi5naXQiLAogICAgICAgICIuZ2l0aHViIgogICAgICBdCiAgICB9CiAgfQp9","payloadType":"application/vnd.in-toto+json","signatures":[{"sig":"MGUCMG9mLg6qE166g5oAnb1dRWo6KzaJGAKCnNs7gSB0j041qzNQir485/9qyw5Pp6wNeAIxAP23SPRnMrAHjG6LqZGvNvKiV+MuOh2MIkCLnbB9sBYbzTKdMoC/AUf074w3cZ/C2Q==","keyid":""}]}} \ No newline at end of file diff --git a/skills/improve-cutile-kernel-perf/skill-card.md b/skills/improve-cutile-kernel-perf/skill-card.md new file mode 100644 index 00000000..dabe559a --- /dev/null +++ b/skills/improve-cutile-kernel-perf/skill-card.md @@ -0,0 +1,42 @@ +## Description:
+Iteratively optimize cuTile kernel performance through systematic profiling, bottleneck analysis, IR comparison, and targeted tuning.
+ +This skill is ready for commercial/non-commercial use.
+ +## Owner: NVIDIA
+ +### License/Terms of Use:
+CC-BY-4.0 AND Apache-2.0
+## Use Case:
+Developers and engineers use this skill to systematically benchmark, diagnose bottlenecks, and iteratively tune cuTile GPU kernel performance in the TileGym project.
+ +### Deployment Geography for Use:
+Global
+ +## Known Risks and Mitigations:
+Risk: Review before execution as proposals could introduce incorrect or misleading guidance into skills.
+Mitigation: Review and scan skill before deployment.
+ +## Reference(s):
+- [Optimization Playbook](references/optimization-playbook.md)
+- [Performance Knobs Catalog](references/perf-knobs-catalog.md)
+- [cuTile API Reference](references/cutile-api-reference.md)
+- [GPU Performance Model](references/performance-model.md)
+- [IR Analysis Guide](references/ir-dump-guide.md)
+- [cuTile Patterns Quick-Reference](references/cutile-patterns-reference.md)
+ + +## Skill Output:
+**Output Type(s):** [Code, Shell commands, Analysis]
+**Output Format:** [Markdown with inline code blocks and performance tables]
+**Output Parameters:** [1D]
+**Other Properties Related to Output:** [None]
+ +## Skill Version(s):
+2026.04.11-alpha (source: frontmatter)
+ +## Ethical Considerations:
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal team to ensure this skill meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+ +(For Release on NVIDIA Platforms Only)
+Please report quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
diff --git a/skills/improve-cutile-kernel-perf/skill.oms.sig b/skills/improve-cutile-kernel-perf/skill.oms.sig new file mode 100644 index 00000000..51f7a093 --- /dev/null +++ b/skills/improve-cutile-kernel-perf/skill.oms.sig @@ -0,0 +1 @@ +{"mediaType":"application/vnd.dev.sigstore.bundle.v0.3+json","verificationMaterial":{"x509CertificateChain":{"certificates":[{"rawBytes":"MIICgzCCAgmgAwIBAgIUKIyS7SxNteQIiWzK1dWj85E6520wCgYIKoZIzj0EAwMwVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwHhcNMjYwNDAxMDAwMDAwWhcNMjgwNDIyMTUzMzA5WjBUMQswCQYDVQQGEwJVUzEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMSgwJgYDVQQDDB9OVklESUEgQWdlbnQgU2tpbGxzIFNpZ25pbmcgMDAxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEYoRM9bQl/dGlwSRNi6bTpIJUXH8Nv9GciP6LSflJYYMLCc296kpyuTSsk5ddbAWiDcFX3C/ydX3jwc+qCLYP6uHy9XphyLjOQ27Yb2J6rBLVtRBS1mgGco/Gr7fL6ODco4GaMIGXMB0GA1UdDgQWBBRQ/5ZW3nJ6lmo9SVk7I15o7UGmpTAfBgNVHSMEGDAWgBRPGpILxMBBleJSsBGjrMKsby1CgjAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIHgDA3BggrBgEFBQcBAQQrMCkwJwYIKwYBBQUHMAGGG2h0dHA6Ly9vY3NwLm5kaXMubnZpZGlhLmNvbTAKBggqhkjOPQQDAwNoADBlAjAUygu/GiOCIXrgGr4SmLgeEVDcEitfFUv7ALbvLVGVyMysB3mxmO/uInZfXzWcJZsCMQDxuoxj4ZmO30jhkPIcCxGFCOvnUsnfU3TfGcouYm4M6iRpbKvtVnHPiy4bi6pcKf0="},{"rawBytes":"MIICiDCCAg6gAwIBAgIUZsIuSv9NkpJCNqtYEfCouVv5BzowCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASI72cR3ctKGg4VWnB3bNja6g1Z2PnOmFEopkPof+QeIcPk9rT+g9MjJnq51EQXL93a7C2GJ9J985G4o2V85VD7wJ1RaXhluHW2rf3y8bQGeAYaKMr5s/hUgn+M3/9WlWejgaAwgZ0wHQYDVR0OBBYEFE8akgvEwEGV4lKwEaOswqxvLUKCMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDcGCCsGAQUFBwEBBCswKTAnBggrBgEFBQcwAYYbaHR0cDovL29jc3AubmRpcy5udmlkaWEuY29tMAoGCCqGSM49BAMDA2gAMGUCMQCeIMMfAbyzPDacw2MxG+Yt1cikrJX/DVxiGfXuHmkkXn6VgSzE79+lkqDErpVO2gYCMCNEColOyvUvkzZGUEI1hQ3PfMgi3FIo9tHoBKMw4/wGBLFpu/0ubtmbBXM6/UMOEw=="},{"rawBytes":"MIICRTCCAcygAwIBAgIUeJdY3rV86EdvFmG7L8LJBsyQFYkwCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTB2MBAGByqGSM49AgEGBSuBBAAiA2IABAYpiXCDjJ9NT2eSDhyHJVSw1Tbze18cGG2F/578oWvHxg23eQAhNRYdq88i1iOshZSO6C29doKui5Xpmo/7Ctw9Sx4PP2RzOmIuOLCuTdNtKcTRwi4GEsd5BAFvWj42M6NjMGEwHQYDVR0OBBYEFItnoAjjfuCEUvzyvWyI2vOGvwPjMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cAMGQCMCwtAjWLaNwgGWNCgdyNoTyvNhqWRECRJV2r3+7w8g0PL6NHLOsbkgE09BH95h8XlgIwTaQmbbUh2ChAJ5TA1wRiVDnCcvbzHlZl2jM2FcwQQZlk19LOAbyGMRixbu2Ww/rj"}]},"tlogEntries":[]},"dsseEnvelope":{"payload":"ewogICJfdHlwZSI6ICJodHRwczovL2luLXRvdG8uaW8vU3RhdGVtZW50L3YxIiwKICAic3ViamVjdCI6IFsKICAgIHsKICAgICAgIm5hbWUiOiAiaW1wcm92ZS1jdXRpbGUta2VybmVsLXBlcmYiLAogICAgICAiZGlnZXN0IjogewogICAgICAgICJzaGEyNTYiOiAiMDIzNDM3YTM3NWJjYmYyZmNkOWJiMmQ1OTM1ZDZmM2ZjMmNmMjAxN2Q4Zjc3YTA1YjYzNDJhNmY0MzU5YTcwNCIKICAgICAgfQogICAgfQogIF0sCiAgInByZWRpY2F0ZVR5cGUiOiAiaHR0cHM6Ly9tb2RlbF9zaWduaW5nL3NpZ25hdHVyZS92MS4wIiwKICAicHJlZGljYXRlIjogewogICAgInJlc291cmNlcyI6IFsKICAgICAgewogICAgICAgICJuYW1lIjogIlNLSUxMLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI4MmU1ZTFmMTZkNDE1M2FlOThiMDdhOTFkMjhlYjYyZDRlYTk3NzY1Nzk0MGFhYzY2Y2M2MmRkY2VmOWQ3MmUwIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9jdXRpbGUtYXBpLXJlZmVyZW5jZS5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiM2VjNDA0NGQ2NjIxMTM3NzEwN2UzYTI0MmFkZTM1ODJkYTE2ZGM0ZjExMDc1N2RkOGRiNDc5ZDMyNzM0ZjU1YyIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvY3V0aWxlLXBhdHRlcm5zLXJlZmVyZW5jZS5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiZTg0NzA2ZTcxMDBhY2UwMGFlZGI3Y2M2MmUxNjhlZDMxOTUxNzJlZmQ1NmJiMTJhMThiMWJkZjRlZGE5ZjIxYSIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvaXItZHVtcC1ndWlkZS5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiMjA3Mjc2N2JkYmM3NWExOWQyMmJhMmMzZTAzNGY2ZGIxN2JiODNkM2QzNGZmOGVhNTUzYzBlYWQ1MzVkZjhhZiIKICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvb3B0aW1pemF0aW9uLXBsYXlib29rLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICI0MDE2MzA2YzFkN2YwMTg1NzQyOGNlODcxMzgzMWIwMTVkODNlZjhhZWY1MjEyNWE1MDllYzJmYTk2NWM3MjM4IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9wZXJmLWtub2JzLWNhdGFsb2cubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IiwKICAgICAgICAiZGlnZXN0IjogImFjMTg3ZmRlZmFjZTkyNGE5NDU2NWQ3MDExNTAyODUwYmNjNjkyZGE0Nzk1ODQ5OTg3YzkzYWVlOWViMTQ3NWEiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJyZWZlcmVuY2VzL3BlcmZvcm1hbmNlLW1vZGVsLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIsCiAgICAgICAgImRpZ2VzdCI6ICIxN2Q3OTNjOTQwZDgwYTQ4ZmZmODUzNzUzMzY0YjU1ZGQ1MmY0NDAyMmEzNmMzODYwN2U5ZTUyOWMzOWM0MGI3IgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAic2tpbGwtY2FyZC5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiLAogICAgICAgICJkaWdlc3QiOiAiMGUyNTZlZGE2ZWNiNWE2MGJhYjE4MTJjMzM1N2I2NWY0NmQ1N2M1NThkNzA2MWE2YzZiZDc0MTg1ZjFiY2M4MyIKICAgICAgfQogICAgXSwKICAgICJzZXJpYWxpemF0aW9uIjogewogICAgICAibWV0aG9kIjogImZpbGVzIiwKICAgICAgImhhc2hfdHlwZSI6ICJzaGEyNTYiLAogICAgICAiYWxsb3dfc3ltbGlua3MiOiBmYWxzZSwKICAgICAgImlnbm9yZV9wYXRocyI6IFsKICAgICAgICAiLmdpdGlnbm9yZSIsCiAgICAgICAgIi5naXQiLAogICAgICAgICIuZ2l0YXR0cmlidXRlcyIsCiAgICAgICAgIi5naXRodWIiCiAgICAgIF0KICAgIH0KICB9Cn0=","payloadType":"application/vnd.in-toto+json","signatures":[{"sig":"MGUCMG1W2pwlPEJiMCCtlQdrnZ4K7gmiVaty89Pmgic65+pndvZr6jP39QhNSiZEW1/9jwIxAM6iiW008+xp5k+w6G/Nz2sdrsCqIrPjqeHIpQBI/aj86DDgLynW3Ddq/rlGqVI73w==","keyid":""}]}} \ No newline at end of file diff --git a/skills/monkey-patch-kernels-to-transformers/skill-card.md b/skills/monkey-patch-kernels-to-transformers/skill-card.md new file mode 100644 index 00000000..27fd58ec --- /dev/null +++ b/skills/monkey-patch-kernels-to-transformers/skill-card.md @@ -0,0 +1,41 @@ +## Description:
+Integrate TileGym kernels into Hugging Face `transformers` models by replacing the library's submodule(s) and certain class(es)' implementations, and patching certain class(es)' init/forward/load weight methods prior to instantiating models.
+ +This skill is for research and development only.
+ +## Owner: NVIDIA
+ +### License/Terms of Use:
+CC-BY-4.0 AND Apache-2.0
+## Use Case:
+Developers and engineers who need to integrate TileGym GPU kernels into Hugging Face transformers models using a non-intrusive monkey-patch approach to validate end-to-end functional correctness and improve performance.
+ +### Deployment Geography for Use:
+Global
+ +## Known Risks and Mitigations:
+Risk: Review before execution as proposals could introduce incorrect or misleading guidance into skills.
+Mitigation: Review and scan skill before deployment.
+ +## Reference(s):
+- [Environment Setup](references/environment-setup.md)
+- [Kernel Integration Workflow](references/kernel-integration.md)
+- [Auto Kernelize](references/auto-kernelize.md)
+- [Workflow Diagram](references/workflow-diagram.png)
+- [CUDA Tile IR Supported Architectures](https://docs.nvidia.com/cuda/tile-ir/latest/sections/stability.html#supported-architectures)
+ + +## Skill Output:
+**Output Type(s):** [Code, Shell commands, Configuration instructions]
+**Output Format:** [Markdown with inline bash code blocks]
+**Output Parameters:** [1D]
+**Other Properties Related to Output:** [None]
+ +## Skill Version(s):
+2026.05.05-beta (source: frontmatter)
+ +## Ethical Considerations:
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal team to ensure this skill meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+ +(For Release on NVIDIA Platforms Only)
+Please report quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
diff --git a/skills/monkey-patch-kernels-to-transformers/skill.oms.sig b/skills/monkey-patch-kernels-to-transformers/skill.oms.sig new file mode 100644 index 00000000..bf6369dc --- /dev/null +++ b/skills/monkey-patch-kernels-to-transformers/skill.oms.sig @@ -0,0 +1 @@ +{"mediaType":"application/vnd.dev.sigstore.bundle.v0.3+json","verificationMaterial":{"x509CertificateChain":{"certificates":[{"rawBytes":"MIICgzCCAgmgAwIBAgIUKIyS7SxNteQIiWzK1dWj85E6520wCgYIKoZIzj0EAwMwVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwHhcNMjYwNDAxMDAwMDAwWhcNMjgwNDIyMTUzMzA5WjBUMQswCQYDVQQGEwJVUzEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMSgwJgYDVQQDDB9OVklESUEgQWdlbnQgU2tpbGxzIFNpZ25pbmcgMDAxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEYoRM9bQl/dGlwSRNi6bTpIJUXH8Nv9GciP6LSflJYYMLCc296kpyuTSsk5ddbAWiDcFX3C/ydX3jwc+qCLYP6uHy9XphyLjOQ27Yb2J6rBLVtRBS1mgGco/Gr7fL6ODco4GaMIGXMB0GA1UdDgQWBBRQ/5ZW3nJ6lmo9SVk7I15o7UGmpTAfBgNVHSMEGDAWgBRPGpILxMBBleJSsBGjrMKsby1CgjAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIHgDA3BggrBgEFBQcBAQQrMCkwJwYIKwYBBQUHMAGGG2h0dHA6Ly9vY3NwLm5kaXMubnZpZGlhLmNvbTAKBggqhkjOPQQDAwNoADBlAjAUygu/GiOCIXrgGr4SmLgeEVDcEitfFUv7ALbvLVGVyMysB3mxmO/uInZfXzWcJZsCMQDxuoxj4ZmO30jhkPIcCxGFCOvnUsnfU3TfGcouYm4M6iRpbKvtVnHPiy4bi6pcKf0="},{"rawBytes":"MIICiDCCAg6gAwIBAgIUZsIuSv9NkpJCNqtYEfCouVv5BzowCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASI72cR3ctKGg4VWnB3bNja6g1Z2PnOmFEopkPof+QeIcPk9rT+g9MjJnq51EQXL93a7C2GJ9J985G4o2V85VD7wJ1RaXhluHW2rf3y8bQGeAYaKMr5s/hUgn+M3/9WlWejgaAwgZ0wHQYDVR0OBBYEFE8akgvEwEGV4lKwEaOswqxvLUKCMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDcGCCsGAQUFBwEBBCswKTAnBggrBgEFBQcwAYYbaHR0cDovL29jc3AubmRpcy5udmlkaWEuY29tMAoGCCqGSM49BAMDA2gAMGUCMQCeIMMfAbyzPDacw2MxG+Yt1cikrJX/DVxiGfXuHmkkXn6VgSzE79+lkqDErpVO2gYCMCNEColOyvUvkzZGUEI1hQ3PfMgi3FIo9tHoBKMw4/wGBLFpu/0ubtmbBXM6/UMOEw=="},{"rawBytes":"MIICRTCCAcygAwIBAgIUeJdY3rV86EdvFmG7L8LJBsyQFYkwCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTB2MBAGByqGSM49AgEGBSuBBAAiA2IABAYpiXCDjJ9NT2eSDhyHJVSw1Tbze18cGG2F/578oWvHxg23eQAhNRYdq88i1iOshZSO6C29doKui5Xpmo/7Ctw9Sx4PP2RzOmIuOLCuTdNtKcTRwi4GEsd5BAFvWj42M6NjMGEwHQYDVR0OBBYEFItnoAjjfuCEUvzyvWyI2vOGvwPjMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cAMGQCMCwtAjWLaNwgGWNCgdyNoTyvNhqWRECRJV2r3+7w8g0PL6NHLOsbkgE09BH95h8XlgIwTaQmbbUh2ChAJ5TA1wRiVDnCcvbzHlZl2jM2FcwQQZlk19LOAbyGMRixbu2Ww/rj"}]},"tlogEntries":[]},"dsseEnvelope":{"payload":"ewogICJfdHlwZSI6ICJodHRwczovL2luLXRvdG8uaW8vU3RhdGVtZW50L3YxIiwKICAic3ViamVjdCI6IFsKICAgIHsKICAgICAgIm5hbWUiOiAibW9ua2V5LXBhdGNoLWtlcm5lbHMtdG8tdHJhbnNmb3JtZXJzIiwKICAgICAgImRpZ2VzdCI6IHsKICAgICAgICAic2hhMjU2IjogIjFiZjFjZTllMjBjOWJjMTAzMTM4ZjdkNjE2N2VmZTFmNzZkZjQyNjYwMDQ0NjhmMjc0YWM4OTU4NTUwMTA3ZjIiCiAgICAgIH0KICAgIH0KICBdLAogICJwcmVkaWNhdGVUeXBlIjogImh0dHBzOi8vbW9kZWxfc2lnbmluZy9zaWduYXR1cmUvdjEuMCIsCiAgInByZWRpY2F0ZSI6IHsKICAgICJzZXJpYWxpemF0aW9uIjogewogICAgICAiaWdub3JlX3BhdGhzIjogWwogICAgICAgICIuZ2l0IiwKICAgICAgICAiLmdpdGF0dHJpYnV0ZXMiLAogICAgICAgICIuZ2l0aWdub3JlIiwKICAgICAgICAiLmdpdGh1YiIKICAgICAgXSwKICAgICAgImhhc2hfdHlwZSI6ICJzaGEyNTYiLAogICAgICAiYWxsb3dfc3ltbGlua3MiOiBmYWxzZSwKICAgICAgIm1ldGhvZCI6ICJmaWxlcyIKICAgIH0sCiAgICAicmVzb3VyY2VzIjogWwogICAgICB7CiAgICAgICAgImRpZ2VzdCI6ICIwYTk3MmMwNWJiNWJlNGE0ZTJmYTcyZDZiZDA1MGZlYTQ3ZWUyYjE2NDRjMjRkYmY4Yzg5ZjNiNWUwMDI3YzA0IiwKICAgICAgICAibmFtZSI6ICJTS0lMTC5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogIjQyZmQ4ZmJhMDc0ODUwM2Q5Mjc5ZjI3NGRlMTA4ZjlhOTgzZmNlZTIzNGU3NTViOGVmMzY0NDlhMjg4NTYzODkiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvYXV0by1rZXJuZWxpemUubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IgogICAgICB9LAogICAgICB7CiAgICAgICAgImRpZ2VzdCI6ICJiMzllYzhlNTNlMmQ4NTQxMWQzZmUxNmViNzM4MTMwZWU4YWZhYzQ3ZGQ4OGU5MGE3NjU0Y2NhMDhjYThjYTQ3IiwKICAgICAgICAibmFtZSI6ICJyZWZlcmVuY2VzL2Vudmlyb25tZW50LXNldHVwLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiN2Y5ZjUxNjJhNDQwODI1YjljMjI1NDc5NTE2MjA4MDc5ZGE3OWU4YjQzOWNmMTViNDJmZTY4ZmU0NWNmNTBmMCIsCiAgICAgICAgIm5hbWUiOiAicmVmZXJlbmNlcy9rZXJuZWwtaW50ZWdyYXRpb24ubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IgogICAgICB9LAogICAgICB7CiAgICAgICAgImRpZ2VzdCI6ICIyMmVkZGQzZDgxYjNjMzdkMmI0NzY2NWQ2ZmYxOTcwMTg2NTM2NDQ5Nzk5MWQ2NTA2MmUyZjljYTNlZjZlZjg1IiwKICAgICAgICAibmFtZSI6ICJyZWZlcmVuY2VzL3dvcmtmbG93LWRpYWdyYW0ucG5nIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiMDQ3ZDViMmU4OTZiYzE0NDZiYjhkODU0NWJmZjIzZDM1MTQxMTkyNzM2NGRkYWE4NzA3NjBlMjc1ZTM2N2VhYyIsCiAgICAgICAgIm5hbWUiOiAic2tpbGwtY2FyZC5tZCIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiCiAgICAgIH0KICAgIF0KICB9Cn0=","payloadType":"application/vnd.in-toto+json","signatures":[{"sig":"MGUCMQDZlGuMoidIrFWdXjuaEdzClxAV/X9d5itdivkSorr7nkGD1q08Jw4Kp2F5QqnJfNkCMC25pdb81hjLOIPIaIycfa30xVRL3B67c5y7YbqTDmpJgitQlGvgkIkULeZKBDSIUA==","keyid":""}]}} \ No newline at end of file