diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/__snapshots__/test_r7_sia0_pgr2_placement_char.ambr b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/__snapshots__/test_r7_sia0_pgr2_placement_char.ambr new file mode 100644 index 000000000000..754ab05a8c12 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/__snapshots__/test_r7_sia0_pgr2_placement_char.ambr @@ -0,0 +1,11 @@ +# serializer version: 1 +# name: test_r7_sia0_pgr2_placement_golden + list([ + dict({ + 'basename': 'Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAB_SAV_UserArgsqilRk36jOuWZWGOWS9uiKxEE9gZgpXZAO4oE69gsGs8=', + 'err': 0, + 'tail_lr_reset_a': True, + 'tail_lr_reset_b': True, + }), + ]) +# --- diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/data/test_data/_designed/gfx1250/sia0_pgr2_xf32_tn.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/data/test_data/_designed/gfx1250/sia0_pgr2_xf32_tn.yaml new file mode 100644 index 000000000000..052a10d8fe82 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/data/test_data/_designed/gfx1250/sia0_pgr2_xf32_tn.yaml @@ -0,0 +1,94 @@ +################################################################################ +# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +################################################################################ +# DESIGNED config — gfx1250 F32X TN SIA0 (ScheduleIterAlg=0) PGR2 non-TDM GEMM. +# +# Characterizes the SIA0 PrefetchGlobalRead=2 global-read / local-write placement +# and tail local-read reset that the rest of the char matrix never reaches (every +# other designed config pins ScheduleIterAlg: [3]; the subtile configs use the +# LogicalScheduler and bypass the legacy SIA emission entirely). +# +# This is the exact knob #8417 ("Fix SIA0 PGR2 global-read placement") modifies, +# in the legacy (non-subtile) emission path: +# +# Components/SIA.py:noSchedGlobalRead +# PGR2 global-read-inc placement: iter 0 (pre-fix) vs the local-write +# iteration (post-fix), gated on _ScheduleIterAlg == 0 and non-TDM. +# +# KernelWriter.py tail local-read reset +# "Tail: local read reset offsets a/b" emitted for SIA0 non-TDM via the new +# `not (enableTDMA and enableTDMB)` clause. With ScheduleIterAlg=0, StreamK=0 +# and TDMInst=0, the pre-fix condition (`_ScheduleIterAlg != 0 or StreamK`) +# is False, so the reset appears ONLY after #8417 — a clean binary flip. +# +# Derived from the F32X TN problem in +# Tests/common/gemm/gfx12/xfp32_gfx1250.yaml, which the #8417 author lists among +# the configs whose codegen changed. Reduced to a single MI shape / size so emit +# stays cheap; ScheduleIterAlg pinned to [0] (the xfp32 source forks [0, 1]). + +GlobalParameters: + SyncsPerBenchmark: 0 + SleepPercent: 0 + NumElementsToValidate: 0 + NumWarmups: 0 + NumBenchmarks: 1 + EnqueuesPerSync: 1 + DataInitTypeBeta: 0 + DataInitTypeAlpha: 1 + NewClient: 2 + Device: 0 + PrintSolutionRejectionReason: False + +BenchmarkProblems: + ######################################## + # F32X TN, SIA0 PGR2 non-TDM + # Bias + Activation(hipblaslt_all) + ScaleAB + ScaleAlphaVec + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: S + DestDataType: S + ComputeDataType: S + F32XdlMathOp: X + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + SupportUserArgs: True + UseBias: 1 + Activation: True + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + BiasDataTypeList: ['s'] + ActivationType: hipblaslt_all + - # BenchmarkProblemSizeGroup + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 2, 2, 2, 2] + - ScheduleIterAlg: [0] + - TDMInst: [0] + - StreamK: [0] + - DepthU: [32] + - WavefrontSize: [32] + - TransposeLDS: [-1] + - LdsPadA: [-1] + - LdsPadB: [-1] + - StaggerU: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - GlobalReadVectorWidthA: [1] + - GlobalReadVectorWidthB: [1] + - 1LDSBuffer: [0] + - SourceSwap: [true] + - DirectToVgprA: [False] + - StoreVectorWidth: [-1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [127, 127, 1, 127] + - BiasTypeArgs: ['s'] + - ActivationArgs: + - [Enum: none] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/test_r7_sia0_pgr2_placement_char.py b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/test_r7_sia0_pgr2_placement_char.py new file mode 100644 index 000000000000..af9f2be51982 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/_codegen/test_r7_sia0_pgr2_placement_char.py @@ -0,0 +1,97 @@ +################################################################################ +# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +################################################################################ +"""R7 — SIA0 (ScheduleIterAlg=0) PGR2 non-TDM global-read / tail-reset placement. + +CPU-only characterization. Closes the gap that let a SIA0-only codegen change +reach develop without any CPU-PR-CI signal: every other designed config pins +``ScheduleIterAlg: [3]`` and the subtile configs use the LogicalScheduler, so +the legacy SIA0 emission path was never exercised by a content-sensitive test. + +Drives the designed config +``data/_designed/gfx1250/sia0_pgr2_xf32_tn.yaml`` (F32X TN, ScheduleIterAlg=0, +PrefetchGlobalRead=2, TDMInst=0/non-TDM, StreamK=0) through the config-driven +emit harness. That kernel routes through: + + Components/SIA.py:noSchedGlobalRead + PGR2 global-read placement (the ``_ScheduleIterAlg == 0`` arm). + + KernelWriter.py tail local-read reset + "Tail: local read reset offsets a/b" — emitted for SIA0 non-TDM via the + ``not (enableTDMA and enableTDMB)`` clause. With ScheduleIterAlg=0, + StreamK=0 and non-TDM, this reset appears ONLY when that clause is present, + so its presence is a precise, toolchain-independent (Tensile-emitted + comment) characterization of the SIA0 non-TDM tail-reset behavior. + +The projection snapshot below pins that behavior. It is derived from +``Tests/common/gemm/gfx12/xfp32_gfx1250.yaml`` (the F32X TN problem), reduced to +a single MI shape / size for a cheap emit. + +CPU-only. No GPU, no compile, no hardware access. +""" + +import os + +import pytest + +from config_harness import emit_kernels_from_config + +pytestmark = pytest.mark.unit + +_ARCH = "gfx1250" +_LIMIT = 8 + +_CONFIG = os.path.join( + os.path.dirname(__file__), + "data", + "test_data", + "_designed", + "gfx1250", + "sia0_pgr2_xf32_tn.yaml", +) + + +def _tail_lr_reset(src): + """Return the SIA0 tail local-read reset markers emitted for this kernel. + + These ``addComment1`` strings come from KernelWriter.localReadResetOffsets + in the tail-loop preamble; they are emitted by Tensile (not the assembler), + so their presence is independent of the amdclang/hipcc version. + """ + return { + "tail_lr_reset_a": "Tail: local read reset offsets a" in src, + "tail_lr_reset_b": "Tail: local read reset offsets b" in src, + } + + +def test_r7_sia0_pgr2_emits_assembly(): + """SIA0 PGR2 non-TDM F32X config emits real gfx1250 assembly, all err==0.""" + results = emit_kernels_from_config(_CONFIG, limit=_LIMIT, arch=_ARCH) + assert len(results) >= 1, f"expected >=1 kernel, got 0 (config: {_CONFIG})" + assert all(err == 0 for (_b, _s, err) in results), ( + f"some kernels failed: {[(b, e) for b, _s, e in results if e != 0]}" + ) + for base, src, _err in results: + assert src and len(src.splitlines()) > 100, ( + f"kernel {base!r}: assembly unexpectedly short" + ) + assert ".amdgcn_target" in src, f"kernel {base!r}: missing .amdgcn_target" + assert "gfx1250" in src, f"kernel {base!r}: wrong arch in assembly" + assert base.startswith("Cijk_"), f"kernel {base!r}: unexpected basename prefix" + + +def test_r7_sia0_pgr2_placement_golden(snapshot): + """Golden: SIA0 tail local-read reset markers per kernel. + + Pins the SIA0 non-TDM tail-reset behavior. A change to the SIA0 PGR2 + placement / tail-reset logic (the class of change made in #8417) flips these + markers and surfaces a snapshot diff in CPU PR CI — the signal that was + previously absent for the SIA0 path. + """ + results = emit_kernels_from_config(_CONFIG, limit=_LIMIT, arch=_ARCH) + digest = sorted( + ({"basename": b, "err": e, **_tail_lr_reset(s)} for (b, s, e) in results), + key=lambda d: d["basename"], + ) + assert digest == snapshot