Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# serializer version: 1
# name: test_r7_sia0_pgr2_placement_golden
list([
dict({
'basename': 'Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAB_SAV_UserArgsqilRk36jOuWZWGOWS9uiKxEE9gZgpXZAO4oE69gsGs8=',
'err': 0,
'tail_lr_reset_a': True,
'tail_lr_reset_b': True,
}),
])
# ---
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
################################################################################
# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
################################################################################
# DESIGNED config — gfx1250 F32X TN SIA0 (ScheduleIterAlg=0) PGR2 non-TDM GEMM.
#
# Characterizes the SIA0 PrefetchGlobalRead=2 global-read / local-write placement
# and tail local-read reset that the rest of the char matrix never reaches (every
# other designed config pins ScheduleIterAlg: [3]; the subtile configs use the
# LogicalScheduler and bypass the legacy SIA emission entirely).
#
# This is the exact knob #8417 ("Fix SIA0 PGR2 global-read placement") modifies,
# in the legacy (non-subtile) emission path:
#
# Components/SIA.py:noSchedGlobalRead
# PGR2 global-read-inc placement: iter 0 (pre-fix) vs the local-write
# iteration (post-fix), gated on _ScheduleIterAlg == 0 and non-TDM.
#
# KernelWriter.py tail local-read reset
# "Tail: local read reset offsets a/b" emitted for SIA0 non-TDM via the new
# `not (enableTDMA and enableTDMB)` clause. With ScheduleIterAlg=0, StreamK=0
# and TDMInst=0, the pre-fix condition (`_ScheduleIterAlg != 0 or StreamK`)
# is False, so the reset appears ONLY after #8417 — a clean binary flip.
#
# Derived from the F32X TN problem in
# Tests/common/gemm/gfx12/xfp32_gfx1250.yaml, which the #8417 author lists among
# the configs whose codegen changed. Reduced to a single MI shape / size so emit
# stays cheap; ScheduleIterAlg pinned to [0] (the xfp32 source forks [0, 1]).

GlobalParameters:
SyncsPerBenchmark: 0
SleepPercent: 0
NumElementsToValidate: 0
NumWarmups: 0
NumBenchmarks: 1
EnqueuesPerSync: 1
DataInitTypeBeta: 0
DataInitTypeAlpha: 1
NewClient: 2
Device: 0
PrintSolutionRejectionReason: False

BenchmarkProblems:
########################################
# F32X TN, SIA0 PGR2 non-TDM
# Bias + Activation(hipblaslt_all) + ScaleAB + ScaleAlphaVec
########################################
-
- # ProblemType
OperationType: GEMM
DataType: S
DestDataType: S
ComputeDataType: S
F32XdlMathOp: X
TransposeA: True
TransposeB: False
UseBeta: True
Batched: True
SupportUserArgs: True
UseBias: 1
Activation: True
UseScaleAB: Scalar
UseScaleAlphaVec: 1
BiasDataTypeList: ['s']
ActivationType: hipblaslt_all
- # BenchmarkProblemSizeGroup
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
ForkParameters:
- MatrixInstruction:
- [16, 16, 32, 1, 1, 2, 2, 2, 2]
- ScheduleIterAlg: [0]
- TDMInst: [0]
- StreamK: [0]
- DepthU: [32]
- WavefrontSize: [32]
- TransposeLDS: [-1]
- LdsPadA: [-1]
- LdsPadB: [-1]
- StaggerU: [0]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
- GlobalReadVectorWidthA: [1]
- GlobalReadVectorWidthB: [1]
- 1LDSBuffer: [0]
- SourceSwap: [true]
- DirectToVgprA: [False]
- StoreVectorWidth: [-1]
BenchmarkFinalParameters:
- ProblemSizes:
- Exact: [127, 127, 1, 127]
- BiasTypeArgs: ['s']
- ActivationArgs:
- [Enum: none]
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
################################################################################
# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
################################################################################
"""R7 — SIA0 (ScheduleIterAlg=0) PGR2 non-TDM global-read / tail-reset placement.

CPU-only characterization. Closes the gap that let a SIA0-only codegen change
reach develop without any CPU-PR-CI signal: every other designed config pins
``ScheduleIterAlg: [3]`` and the subtile configs use the LogicalScheduler, so
the legacy SIA0 emission path was never exercised by a content-sensitive test.

Drives the designed config
``data/_designed/gfx1250/sia0_pgr2_xf32_tn.yaml`` (F32X TN, ScheduleIterAlg=0,
PrefetchGlobalRead=2, TDMInst=0/non-TDM, StreamK=0) through the config-driven
emit harness. That kernel routes through:

Components/SIA.py:noSchedGlobalRead
PGR2 global-read placement (the ``_ScheduleIterAlg == 0`` arm).

KernelWriter.py tail local-read reset
"Tail: local read reset offsets a/b" — emitted for SIA0 non-TDM via the
``not (enableTDMA and enableTDMB)`` clause. With ScheduleIterAlg=0,
StreamK=0 and non-TDM, this reset appears ONLY when that clause is present,
so its presence is a precise, toolchain-independent (Tensile-emitted
comment) characterization of the SIA0 non-TDM tail-reset behavior.

The projection snapshot below pins that behavior. It is derived from
``Tests/common/gemm/gfx12/xfp32_gfx1250.yaml`` (the F32X TN problem), reduced to
a single MI shape / size for a cheap emit.

CPU-only. No GPU, no compile, no hardware access.
"""

import os

import pytest

from config_harness import emit_kernels_from_config

pytestmark = pytest.mark.unit

_ARCH = "gfx1250"
_LIMIT = 8

_CONFIG = os.path.join(
os.path.dirname(__file__),
"data",
"test_data",
"_designed",
"gfx1250",
"sia0_pgr2_xf32_tn.yaml",
)


def _tail_lr_reset(src):
"""Return the SIA0 tail local-read reset markers emitted for this kernel.

These ``addComment1`` strings come from KernelWriter.localReadResetOffsets
in the tail-loop preamble; they are emitted by Tensile (not the assembler),
so their presence is independent of the amdclang/hipcc version.
"""
return {
"tail_lr_reset_a": "Tail: local read reset offsets a" in src,
"tail_lr_reset_b": "Tail: local read reset offsets b" in src,
}


def test_r7_sia0_pgr2_emits_assembly():
"""SIA0 PGR2 non-TDM F32X config emits real gfx1250 assembly, all err==0."""
results = emit_kernels_from_config(_CONFIG, limit=_LIMIT, arch=_ARCH)
assert len(results) >= 1, f"expected >=1 kernel, got 0 (config: {_CONFIG})"
assert all(err == 0 for (_b, _s, err) in results), (
f"some kernels failed: {[(b, e) for b, _s, e in results if e != 0]}"
)
for base, src, _err in results:
assert src and len(src.splitlines()) > 100, (
f"kernel {base!r}: assembly unexpectedly short"
)
assert ".amdgcn_target" in src, f"kernel {base!r}: missing .amdgcn_target"
assert "gfx1250" in src, f"kernel {base!r}: wrong arch in assembly"
assert base.startswith("Cijk_"), f"kernel {base!r}: unexpected basename prefix"


def test_r7_sia0_pgr2_placement_golden(snapshot):
"""Golden: SIA0 tail local-read reset markers per kernel.

Pins the SIA0 non-TDM tail-reset behavior. A change to the SIA0 PGR2
placement / tail-reset logic (the class of change made in #8417) flips these
markers and surfaces a snapshot diff in CPU PR CI — the signal that was
previously absent for the SIA0 path.
"""
results = emit_kernels_from_config(_CONFIG, limit=_LIMIT, arch=_ARCH)
digest = sorted(
({"basename": b, "err": e, **_tail_lr_reset(s)} for (b, s, e) in results),
key=lambda d: d["basename"],
)
assert digest == snapshot
Loading