From 906fe49a65919a246c18cde9cdc01e1caea6fec7 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 13:22:50 -0400
Subject: [PATCH 01/18] Add golden test infrastructure: bundle discovery,
 registration, and loading

Bundle discovery scans integration_test_bundles/ for .meta.json files,
derives GTest names from folder structure, and eagerly loads graph +
tensor data. IntegrationTestBundle models bundles as facets with optional
golden outputs. BundleRegistration wires discovered bundles into GTest.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../harness/golden/IntegrationTestBundle.hpp  | 175 +++++++++++++-----
 1 file changed, 128 insertions(+), 47 deletions(-)
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
index a05f33612437..1cd93dc80c4a 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
@@ -8,6 +8,7 @@
 #include <fstream>
 #include <memory>
 #include <optional>
+#include <set>
 #include <unordered_map>
 #include <variant>
 #include <vector>
@@ -42,16 +43,25 @@ using TensorMap = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::u
 //   outputTensorUids — UIDs of the graph's output tensors, derived from the
 //                      graph. Always available (even for a graph-only bundle),
 //                      so the harness knows which tensors to compare / allocate.
-//   tensors          — the loaded tensor data. Absent (nullopt) for a graph-only
-//                      bundle (no "tensors" in the graph, or .bin data not pulled
-//                      via DVC); such a bundle cannot be executed/compared and
-//                      the harness SKIPs it.
+//   tensors          — the loaded tensor data, keyed by uid. Holds the INPUT
+//                      tensors (with their data) whenever they are present on
+//                      disk, plus the OUTPUT tensors carrying their golden values
+//                      iff every output blob is present (see hasGoldenOutputs).
+//                      Absent (nullopt) only when the input blobs themselves are
+//                      not on disk — a true graph-only bundle. The harness may
+//                      still synthesize inputs for such a bundle (tier 3); if it
+//                      cannot, it SKIPs.
+//   hasGoldenOutputs — true iff every output tensor's .bin blob was present and
+//                      loaded into `tensors`. When false, `tensors` (if present)
+//                      carries inputs only — the engine output must be verified
+//                      against a reference executor, not golden data.
 struct IntegrationTestBundle
 {
     flatbuffers::DetachedBuffer graphBuffer;
     hipdnn_test_sdk::utilities::BundleMetadata metadata;
     std::vector<int64_t> outputTensorUids;
     std::optional<TensorMap> tensors;
+    bool hasGoldenOutputs = false;
 
     // View over the graph flatbuffer, valid as long as this bundle lives.
     hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper graphWrapper() const
@@ -97,35 +107,48 @@ inline const char* toString(LoadError error)
 namespace detail
 {
 
-// True iff every tensor declared in the graph has its companion .bin blob on
-// disk. The blob path is "{stem}.tensor{uid}.bin", matching the loader's own
-// derivation. A graph with no "tensors" array is graph-only -> returns false.
-inline bool allTensorBlobsPresent(const nlohmann::json& graphJson,
-                                  const std::filesystem::path& jsonPath)
+// The on-disk blob path for a tensor: "{stem}.tensor{uid}.bin", matching the
+// loader's own derivation.
+inline std::filesystem::path tensorBlobPath(const std::filesystem::path& jsonPath, int64_t uid)
 {
-    if(!graphJson.contains("tensors") || !graphJson.at("tensors").is_array()
-       || graphJson.at("tensors").empty())
-    {
-        return false;
-    }
-
     auto basePath = jsonPath;
     basePath.replace_extension();
-    for(const auto& tensor : graphJson.at("tensors"))
+    return {basePath.string() + ".tensor" + std::to_string(uid) + ".bin"};
+}
+
+// True iff every uid in `uids` has its companion .bin blob on disk. An empty
+// `uids` set returns true (vacuously) — callers handle "no such tensors"
+// separately (e.g. a graph with no inputs, or no outputs).
+inline bool blobsPresentFor(const std::vector<int64_t>& uids, const std::filesystem::path& jsonPath)
+{
+    for(const int64_t uid : uids)
     {
-        if(!tensor.contains("uid"))
+        if(!std::filesystem::exists(tensorBlobPath(jsonPath, uid)))
         {
             return false;
         }
-        const auto uid = tensor.at("uid").get<int64_t>();
-        const auto binPath
-            = std::filesystem::path(basePath.string() + ".tensor" + std::to_string(uid) + ".bin");
-        if(!std::filesystem::exists(binPath))
+    }
+    return true;
+}
+
+// The uids of every tensor declared in the graph's "tensors" array. Empty if the
+// array is absent/empty (a graph-only bundle). Tensors without a "uid" are
+// skipped (malformed entries are caught later when building the flatbuffer).
+inline std::vector<int64_t> allTensorUids(const nlohmann::json& graphJson)
+{
+    std::vector<int64_t> uids;
+    if(!graphJson.contains("tensors") || !graphJson.at("tensors").is_array())
+    {
+        return uids;
+    }
+    for(const auto& tensor : graphJson.at("tensors"))
+    {
+        if(tensor.contains("uid"))
         {
-            return false;
+            uids.push_back(tensor.at("uid").get<int64_t>());
         }
     }
-    return true;
+    return uids;
 }
 
 } // namespace detail
@@ -141,9 +164,19 @@ inline bool allTensorBlobsPresent(const nlohmann::json& graphJson,
 //   * graph .json not parseable           -> LoadError::MALFORMED_JSON      (FAIL)
 //   * parseable but not a valid graph     -> LoadError::INVALID_GRAPH_SCHEMA(FAIL)
 //   * valid graph, no .meta.json companion-> LoadError::MISSING_METADATA    (FAIL)
-//   * valid graph, tensor .bin data absent-> bundle with tensors == nullopt (SKIP)
+//   * valid graph, input .bin data absent -> bundle, tensors == nullopt     (tier-3:
+//                                            harness may synthesize, else SKIP)
 //   * valid graph, .bin present but broken-> LoadError::TENSOR_LOAD_FAILED  (FAIL)
-//   * valid graph, all .bin present       -> fully loaded bundle            (RUN)
+//   * valid graph, inputs present,
+//       outputs absent                    -> bundle, tensors set,
+//                                            hasGoldenOutputs == false (verify via ref)
+//   * valid graph, inputs + outputs present-> bundle, hasGoldenOutputs == true (golden)
+//
+// Inputs and outputs are loaded INDEPENDENTLY (partial loading): a bundle that
+// ships input blobs but no output (golden) blobs is legitimate — its engine
+// output is verified against a reference executor instead of golden data. Output
+// uids come from getOutputTensorUidsFromGraph; everything else declared in the
+// graph is treated as an input.
 //
 // The function is total: it never lets an exception escape. Every outcome is
 // either a loaded bundle or a classified LoadError.
@@ -193,35 +226,83 @@ inline LoadResult loadIntegrationTestBundle(const std::filesystem::path& jsonPat
     bundle.metadata = std::move(*metadata);
     bundle.outputTensorUids = hipdnn_test_sdk::utilities::getOutputTensorUidsFromGraph(graphJson);
 
-    // 5. Load tensor .bin data if every blob is present; otherwise leave
-    //    tensors == nullopt (graph-only bundle -> harness SKIPs). A blob that is
-    //    present but fails to load (wrong size, unreadable, unsupported dtype)
-    //    throws inside tensorFromFileAndAttributes; we catch it here and classify
-    //    it as TENSOR_LOAD_FAILED so the loader is total (every outcome is either
-    //    a bundle or a named LoadError, never a raw escaping exception).
-    if(detail::allTensorBlobsPresent(graphJson, jsonPath))
+    // 5. Load tensor .bin data, inputs and outputs INDEPENDENTLY.
+    //
+    //    Output uids are the graph's outputs; every other declared tensor is an
+    //    input. We load inputs only if all input blobs are present, and outputs
+    //    (golden) only if all output blobs are present:
+    //
+    //      * all input blobs present  -> tensors gets the inputs
+    //      * all output blobs present -> tensors also gets the golden outputs and
+    //                                    hasGoldenOutputs = true
+    //      * input blobs absent       -> tensors stays nullopt (tier-3: harness
+    //                                    may synthesize inputs, else SKIP)
+    //
+    //    A blob that is present but fails to load (wrong size, unreadable,
+    //    unsupported dtype) throws inside tensorFromFileAndAttributes; we catch it
+    //    and classify it as TENSOR_LOAD_FAILED so the loader stays total.
     {
-        const auto& graph
-            = *hipdnn_flatbuffers_sdk::data_objects::GetGraph(bundle.graphBuffer.data());
-        auto basePath = jsonPath;
-        basePath.replace_extension();
+        const std::vector<int64_t> allUids = detail::allTensorUids(graphJson);
 
-        try
+        const std::set<int64_t> outputUidSet(bundle.outputTensorUids.begin(),
+                                             bundle.outputTensorUids.end());
+        std::vector<int64_t> inputUids;
+        for(const int64_t uid : allUids)
         {
-            TensorMap tensorMap;
-            for(const auto* attributes : *graph.tensors())
+            if(outputUidSet.count(uid) == 0)
             {
-                const auto tensorPath
-                    = basePath.string() + ".tensor" + std::to_string(attributes->uid()) + ".bin";
-                tensorMap[attributes->uid()]
-                    = hipdnn_test_sdk::utilities::tensorFromFileAndAttributes(tensorPath,
-                                                                              *attributes);
+                inputUids.push_back(uid);
             }
-            bundle.tensors = std::move(tensorMap);
         }
-        catch(const std::exception&)
+
+        // A graph with no declared inputs cannot be fed; treat as graph-only.
+        const bool inputsPresent
+            = !inputUids.empty() && detail::blobsPresentFor(inputUids, jsonPath);
+        const bool outputsPresent = !bundle.outputTensorUids.empty()
+                                    && detail::blobsPresentFor(bundle.outputTensorUids, jsonPath);
+
+        if(inputsPresent)
         {
-            return LoadError::TENSOR_LOAD_FAILED;
+            const auto& graph
+                = *hipdnn_flatbuffers_sdk::data_objects::GetGraph(bundle.graphBuffer.data());
+
+            // uid -> attributes, so we can load a chosen subset of tensors.
+            std::unordered_map<int64_t,
+                               const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes*>
+                attrByUid;
+            for(const auto* attributes : *graph.tensors())
+            {
+                attrByUid[attributes->uid()] = attributes;
+            }
+
+            const auto loadUids = [&](const std::vector<int64_t>& uids, TensorMap& into) {
+                for(const int64_t uid : uids)
+                {
+                    const auto it = attrByUid.find(uid);
+                    if(it == attrByUid.end())
+                    {
+                        continue;
+                    }
+                    into[uid] = hipdnn_test_sdk::utilities::tensorFromFileAndAttributes(
+                        detail::tensorBlobPath(jsonPath, uid), *it->second);
+                }
+            };
+
+            try
+            {
+                TensorMap tensorMap;
+                loadUids(inputUids, tensorMap);
+                if(outputsPresent)
+                {
+                    loadUids(bundle.outputTensorUids, tensorMap);
+                    bundle.hasGoldenOutputs = true;
+                }
+                bundle.tensors = std::move(tensorMap);
+            }
+            catch(const std::exception&)
+            {
+                return LoadError::TENSOR_LOAD_FAILED;
+            }
         }
     }
 

From 2ec76764eb59d79a782c9564b6ceb951a885d020 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 13:23:12 -0400
Subject: [PATCH 02/18] Add --verification-mode flag with harness refactor and
 A/B/C outcome model

Verification modes (auto|golden|gpu|cpu) control what engine output is
compared against. The harness extracts golden outputs before execution,
runs references via ReferenceCapabilityError-aware adapters, and reports
unverifiable bundles. Includes CLI flags, env fallbacks, and unit tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../CpuReferenceGraphExecutorAdapter.hpp      |  24 +-
 .../src/harness/ReferenceCapabilityError.hpp  |  40 ++
 .../src/harness/TestConfig.hpp                |  69 ++-
 ...raphGoldenReferenceVerificationHarness.hpp | 557 ++++++++++++++----
 .../golden/UnverifiableBundleReport.hpp       | 125 ++++
 .../GpuReferenceGraphExecutor.hpp             |  15 +-
 dnn-providers/integration-tests/src/main.cpp  |  33 +-
 .../tests/TestTestConfig.cpp                  |  30 +
 8 files changed, 781 insertions(+), 112 deletions(-)
 create mode 100644 dnn-providers/integration-tests/src/harness/ReferenceCapabilityError.hpp
 create mode 100644 dnn-providers/integration-tests/src/harness/golden/UnverifiableBundleReport.hpp

diff --git a/dnn-providers/integration-tests/src/harness/CpuReferenceGraphExecutorAdapter.hpp b/dnn-providers/integration-tests/src/harness/CpuReferenceGraphExecutorAdapter.hpp
index ff80def310de..3c16ff9f04c0 100644
--- a/dnn-providers/integration-tests/src/harness/CpuReferenceGraphExecutorAdapter.hpp
+++ b/dnn-providers/integration-tests/src/harness/CpuReferenceGraphExecutorAdapter.hpp
@@ -3,9 +3,13 @@
 
 #pragma once
 
+#include <stdexcept>
+#include <string>
+
 #include <hipdnn_test_sdk/utilities/cpu_graph_executor/CpuReferenceGraphExecutor.hpp>
 
 #include "IReferenceGraphExecutor.hpp"
+#include "ReferenceCapabilityError.hpp"
 
 namespace hipdnn_integration_tests
 {
@@ -17,7 +21,25 @@ class CpuReferenceGraphExecutorAdapter : public IReferenceGraphExecutor
                  size_t size,
                  const std::unordered_map<int64_t, void*>& variantPack) override
     {
-        _executor.execute(graphBuffer, size, variantPack);
+        // The shared test_sdk CPU executor throws a plain std::runtime_error for
+        // BOTH "no plan for this op" (capability miss, case A) and a genuine
+        // runtime failure (case C) — it does not distinguish them by type. We
+        // cannot tell them apart here, so we conservatively translate every throw
+        // into a ReferenceCapabilityError (case A), carrying the original message
+        // so a real failure still surfaces in the unverifiable report. Net effect:
+        // a CPU-ref crash routes as "couldn't run" rather than a hard FAIL. The
+        // GPU executor (our code) keeps full A-vs-C fidelity by throwing the right
+        // type at the source.
+        try
+        {
+            _executor.execute(graphBuffer, size, variantPack);
+        }
+        catch(const std::exception& e)
+        {
+            throw ReferenceCapabilityError(std::string("CPU reference executor could not run "
+                                                       "this graph: ")
+                                           + e.what());
+        }
     }
 
     bool requiresDeviceMemory() const override
diff --git a/dnn-providers/integration-tests/src/harness/ReferenceCapabilityError.hpp b/dnn-providers/integration-tests/src/harness/ReferenceCapabilityError.hpp
new file mode 100644
index 000000000000..869664ba843e
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/ReferenceCapabilityError.hpp
@@ -0,0 +1,40 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+namespace hipdnn_integration_tests
+{
+
+// Signals "this reference executor has no plan for this op" — a CAPABILITY MISS,
+// not a runtime failure. The golden-verification harness distinguishes three
+// reference outcomes:
+//
+//   A  capability miss  — ref cannot run this op   -> ReferenceCapabilityError
+//   B  disagreement     — ref ran, output != engine-> mismatch at compare time
+//   C  runtime error    — ref CAN run it but threw -> any other std::exception
+//
+// In `auto` mode a case-A miss falls through to the next reference; in explicit
+// gpu/cpu mode it SKIPs. A case-C error is loud (auto: fall through + loud
+// report; explicit / end-of-auto: FAIL). Throwing the right type at the source
+// is what lets the harness tell A from C.
+//
+// Deriving from std::runtime_error keeps existing `catch(const std::exception&)`
+// / `catch(const std::runtime_error&)` call sites working unchanged.
+//
+// NOTE: the GPU reference executor (our code) throws this directly at its
+// capability-miss sites. The CPU reference executor lives in the shared test_sdk
+// library and throws a plain std::runtime_error for BOTH A and C; the
+// CpuReferenceGraphExecutorAdapter translates that into a ReferenceCapabilityError
+// (it cannot tell A from C, so it conservatively treats every test_sdk throw as a
+// capability miss and carries the original message for the report).
+class ReferenceCapabilityError : public std::runtime_error
+{
+public:
+    using std::runtime_error::runtime_error;
+};
+
+} // namespace hipdnn_integration_tests
diff --git a/dnn-providers/integration-tests/src/harness/TestConfig.hpp b/dnn-providers/integration-tests/src/harness/TestConfig.hpp
index e9ed9b27c106..11ebdfc7acaa 100644
--- a/dnn-providers/integration-tests/src/harness/TestConfig.hpp
+++ b/dnn-providers/integration-tests/src/harness/TestConfig.hpp
@@ -35,6 +35,50 @@ enum class ReferenceExecutorType
     GPU,
 };
 
+// How a bundle's engine output is verified (RFC 0010 §4.4). This governs the
+// BUNDLE tests only and is independent of ReferenceExecutorType (which governs
+// the parameterized tests' choice of which ref executor to exercise).
+//
+//   AUTO   — per-test fallback: golden -> GPU ref -> CPU ref -> SKIP+report
+//   GOLDEN — golden data only; SKIP if a bundle has no golden outputs
+//   GPU    — ignore golden; compare engine against the GPU reference executor
+//   CPU    — ignore golden; compare engine against the CPU reference executor
+enum class VerificationMode
+{
+    AUTO,
+    GOLDEN,
+    GPU,
+    CPU,
+};
+
+// Parse a verification-mode string (case-insensitive) into the enum. Throws
+// std::runtime_error on an unrecognized value. Shared by the CLI flag parser and
+// the env-var fallback so both accept exactly the same spellings.
+inline VerificationMode parseVerificationMode(std::string value)
+{
+    std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) {
+        return static_cast<char>(std::tolower(c));
+    });
+    if(value == "auto")
+    {
+        return VerificationMode::AUTO;
+    }
+    if(value == "golden")
+    {
+        return VerificationMode::GOLDEN;
+    }
+    if(value == "gpu")
+    {
+        return VerificationMode::GPU;
+    }
+    if(value == "cpu")
+    {
+        return VerificationMode::CPU;
+    }
+    throw std::runtime_error("Invalid verification mode '" + value
+                             + "'; expected 'auto', 'golden', 'gpu', or 'cpu'");
+}
+
 // Singleton class for storing CLI-based test configuration.
 // All arguments are independently optional:
 //   - articlePath: omit to use hipDNN's default plugin discovery
@@ -64,7 +108,8 @@ class TestConfig
                            std::optional<ReferenceExecutorType> referenceExecutorType
                            = std::nullopt,
                            bool allowBundles = false,
-                           std::optional<std::filesystem::path> goldenDataDir = std::nullopt)
+                           std::optional<std::filesystem::path> goldenDataDir = std::nullopt,
+                           std::optional<VerificationMode> verificationMode = std::nullopt)
     {
         TestConfig& instance = get();
         if(instance._initialized)
@@ -128,6 +173,19 @@ class TestConfig
             }
         }
 
+        // Verification mode: CLI flag wins; else HIPDNN_TEST_VERIFICATION_MODE env
+        // var; else default AUTO (resolved at the accessor). An invalid value
+        // (CLI or env) throws — parseVerificationMode reports the offending value.
+        instance._verificationMode = verificationMode;
+        if(!instance._verificationMode.has_value())
+        {
+            auto envVal = hipdnn_data_sdk::utilities::getEnv("HIPDNN_TEST_VERIFICATION_MODE");
+            if(!envVal.empty())
+            {
+                instance._verificationMode = parseVerificationMode(envVal);
+            }
+        }
+
         // Detect device 0's gfx arch and VRAM once at startup. Used by
         // [[test_skips]] and golden-ref metadata guards (arch/VRAM checks).
         // todo: In future allow the test runner to use any specified device.
@@ -291,6 +349,14 @@ class TestConfig
         return _goldenDataDir.value();
     }
 
+    // Bundle verification mode. Resolved once at init: CLI flag >
+    // HIPDNN_TEST_VERIFICATION_MODE env var > AUTO default.
+    VerificationMode getVerificationMode() const
+    {
+        throwIfNotInitialized();
+        return _verificationMode.value_or(VerificationMode::AUTO);
+    }
+
 private:
     TestConfig() = default;
 
@@ -307,6 +373,7 @@ class TestConfig
     std::optional<TestSettings> _testSettings;
     std::optional<ReferenceExecutorType> _referenceExecutorType;
     std::optional<std::filesystem::path> _goldenDataDir;
+    std::optional<VerificationMode> _verificationMode;
     std::string _currentArch;
     std::size_t _currentDeviceVramMb = 0;
     std::string _currentPlatform;
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
index db50296cacc3..4d5d8a75013b 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
@@ -7,7 +7,10 @@
 #include <cstdint>
 #include <filesystem>
 #include <memory>
+#include <optional>
 #include <ostream>
+#include <random>
+#include <set>
 #include <sstream>
 #include <string>
 #include <unordered_map>
@@ -25,22 +28,45 @@
 #include <hipdnn_test_sdk/utilities/TensorDiff.hpp>
 #include <hipdnn_test_sdk/utilities/TestTolerances.hpp>
 #include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
+#include <hipdnn_test_sdk/utilities/detail/FlatbufferTensorAttributesUtils.hpp>
 
+#include "harness/CpuReferenceGraphExecutorAdapter.hpp"
+#include "harness/IReferenceGraphExecutor.hpp"
+#include "harness/ReferenceCapabilityError.hpp"
 #include "harness/SharedHandle.hpp"
 #include "harness/TestConfig.hpp"
 #include "harness/golden/BundleDiscovery.hpp"
 #include "harness/golden/IntegrationTestBundle.hpp"
+#include "harness/golden/UnverifiableBundleReport.hpp"
+#include "harness/golden/input_init/SynthesizeInputs.hpp"
+#include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
 
 namespace hipdnn_integration_tests::golden
 {
 
-// Saved expected output tensors, keyed by output tensor UID. Extracted from a
-// loaded bundle's output tensors just before execution: the harness keeps these
-// as the golden reference and zeroes the live tensors so the runner computes
-// into clean buffers.
-using GoldenOutputs
+// Output tensors, keyed by uid. Used both for the engine's computed "actual"
+// outputs and for an expected source (golden from disk, or a reference executor's
+// output). Each set is a distinct allocation so engine and reference never write
+// the same buffers.
+using OutputTensors
     = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
 
+// Verifies a bundle's engine output against an expected source chosen by the
+// verification mode (RFC 0010 §4.4):
+//
+//   actual   = the engine (the system under test), run once into fresh buffers.
+//   expected = golden data from disk, OR a reference executor's output.
+//
+// Memory invariants for running engine + a reference off the same inputs:
+//   * INPUT tensors are read-only by both executors and are NEVER mark*Modified().
+//     The engine's rawDeviceData() uploads host->device (state becomes BOTH
+//     valid); a later CPU-ref rawHostData() therefore sees the host copy still
+//     valid and does NOT download — inputs stay intact across both runs.
+//   * OUTPUT buffers are separate ITensor objects per executor (engineOutputs vs
+//     refOutputs), so the two runs cannot stomp each other. Only output buffers
+//     are mark*Modified().
+//   * Virtual (inter-node) tensors are allocated internally by each executor; the
+//     variant packs we build carry only real (input + output) tensors.
 class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Test
 {
 public:
@@ -51,7 +77,7 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
 
     // The bundle is loaded once at registration time and shared into the test's
     // factory; the harness does not load from disk. The path is kept only for
-    // diagnostic messages.
+    // diagnostic messages and the unverifiable report.
     void setBundle(std::shared_ptr<IntegrationTestBundle> bundle, std::filesystem::path path)
     {
         _bundle = std::move(bundle);
@@ -72,58 +98,19 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
             GTEST_SKIP() << "No bundle set";
         }
 
-        // A graph-only bundle (no tensor data on disk, or .bin not pulled via
-        // DVC) cannot be executed or compared -> SKIP.
-        if(!_bundle->tensors.has_value())
-        {
-            GTEST_SKIP() << "Tensor data not available (graph-only bundle or DVC not pulled?): "
-                         << _bundlePath;
-        }
-
         applyMetadataGuards();
     }
 
-    // Save each output tensor's loaded data as the golden reference, then zero
-    // the live tensor so the runner computes into a clean buffer. Returns the
-    // golden map keyed by output UID.
-    GoldenOutputs extractGolden(TensorMap& tensorMap) const
-    {
-        GoldenOutputs golden;
-        const auto wrapper = _bundle->graphWrapper();
-        const auto& tensorAttrMap = wrapper.getTensorMap();
-
-        for(const int64_t uid : _bundle->outputTensorUids)
-        {
-            const auto dataType = tensorAttrMap.at(uid)->data_type();
-            auto& livePtr = tensorMap.at(uid);
-
-            auto zeroed = std::visit(
-                [&](auto nativeType) {
-                    using DataType = decltype(nativeType);
-                    auto tensorPtr = std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>(
-                        new hipdnn_data_sdk::utilities::Tensor<DataType>(livePtr->dims(),
-                                                                         livePtr->strides()));
-                    tensorPtr->fillTensorWithValue(0.f);
-                    return tensorPtr;
-                },
-                hipdnn_test_sdk::utilities::datatypeToNativeVariant(dataType));
-
-            std::swap(zeroed, livePtr); // live map now holds the zero buffer
-            golden[uid] = std::move(zeroed); // golden holds the original data
-        }
-        return golden;
-    }
-
     // NOLINTNEXTLINE(readability-identifier-naming)
     void TestBody() override
     {
-        runGoldenComparison();
+        runComparison();
     }
 
-    // Builds the graph from its serialized bytes, selects an engine (honouring
-    // an explicit --engine if given), builds plans, and executes into the
-    // variant pack. "Unsupported graph" is signalled by throwing (the harness
-    // translates that into a SKIP). Genuine build/execute errors use ASSERT_*.
+    // Builds the graph from its serialized bytes, selects an engine (honouring an
+    // explicit --engine if given), builds plans, and executes into the variant
+    // pack. "Unsupported graph" is signalled by throwing (the harness translates
+    // that into a SKIP). Genuine build/execute errors use ASSERT_*.
     virtual void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack)
     {
         auto handle = getSharedHandle();
@@ -179,75 +166,434 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         ASSERT_TRUE(result.is_good()) << result.get_message();
     }
 
+    // Runs a reference executor (the chosen expected-output source) into the given
+    // variant pack. Throws ReferenceCapabilityError when the executor has no plan
+    // for the op (capability miss, case A); throws any other exception for a
+    // genuine runtime failure (case C). Virtual so unit tests can stub it the same
+    // way they stub executeGraphThroughEngine.
+    virtual void runReferenceExecutor(ReferenceExecutorType type,
+                                      std::unordered_map<int64_t, void*>& variantPack)
+    {
+        auto executor = makeReferenceExecutor(type);
+        executor->execute(_bundle->graphBuffer.data(), _bundle->graphBuffer.size(), variantPack);
+    }
+
+    // Factory split out so a stub harness can short-circuit it. Default: the real
+    // CPU / GPU reference executors.
+    virtual std::unique_ptr<IReferenceGraphExecutor>
+        makeReferenceExecutor(ReferenceExecutorType type)
+    {
+        switch(type)
+        {
+        case ReferenceExecutorType::CPU:
+            return std::make_unique<CpuReferenceGraphExecutorAdapter>();
+        case ReferenceExecutorType::GPU:
+            return std::make_unique<gpu_graph_executor::GpuReferenceGraphExecutor>();
+        default:
+            throw std::runtime_error("Unknown reference executor type");
+        }
+    }
+
 private:
     bool _requiresDevice;
     std::filesystem::path _bundlePath;
     std::shared_ptr<IntegrationTestBundle> _bundle;
 
-    void runGoldenComparison()
-    {
-        auto& tensorMap = *_bundle->tensors;
+    // ---- top-level dispatch -------------------------------------------------
 
+    void runComparison()
+    {
         if(_bundle->outputTensorUids.empty())
         {
-            GTEST_SKIP() << "Bundle has no output tensors to compare: " << _bundlePath;
+            skipUnverifiable("bundle has no output tensors to compare");
+            return;
+        }
+
+        if(!ensureInputsAvailable())
+        {
+            return; // skipUnverifiable already recorded + GTEST_SKIP issued
+        }
+
+        switch(TestConfig::get().getVerificationMode())
+        {
+        case VerificationMode::GOLDEN:
+            runGoldenMode();
+            return;
+        case VerificationMode::GPU:
+            runExplicitRefMode(ReferenceExecutorType::GPU);
+            return;
+        case VerificationMode::CPU:
+            runExplicitRefMode(ReferenceExecutorType::CPU);
+            return;
+        case VerificationMode::AUTO:
+            runAutoMode();
+            return;
+        default:
+            FAIL() << "Unknown verification mode";
+            return;
+        }
+    }
+
+    // golden mode: golden data only.
+    void runGoldenMode()
+    {
+        if(!_bundle->hasGoldenOutputs)
+        {
+            skipUnverifiable("no golden data (verification-mode=golden)");
+            return;
+        }
+        auto engineOutputs = runEngineCapturingOutputs();
+        if(!engineOutputs)
+        {
+            if(!::testing::Test::HasFatalFailure())
+            {
+                GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
+            }
+            return;
+        }
+        compareAgainstGolden(*engineOutputs);
+    }
+
+    // explicit gpu / cpu mode: ignore golden; compare against the named reference.
+    //   A (capability miss) -> SKIP+report
+    //   C (runtime error)   -> FAIL (the user named this reference)
+    //   B (mismatch)        -> FAIL
+    void runExplicitRefMode(ReferenceExecutorType type)
+    {
+        auto engineOutputs = runEngineCapturingOutputs();
+        if(!engineOutputs)
+        {
+            if(!::testing::Test::HasFatalFailure())
+            {
+                GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
+            }
+            return;
+        }
+
+        OutputTensors refOutputs;
+        const RefRunResult result = runReferenceCapturingOutputs(type, refOutputs);
+        switch(result.status)
+        {
+        case RefStatus::CAPABILITY_MISS:
+            skipUnverifiable(refLabel(type) + " cannot run this op: " + result.message);
+            return;
+        case RefStatus::RUNTIME_ERROR:
+            recordRefError(refLabel(type) + " errored: " + result.message);
+            FAIL() << refLabel(type) << " errored (verification-mode=" << refLabel(type)
+                   << "): " << result.message;
+            return;
+        case RefStatus::RAN:
+            compareOutputs(*engineOutputs, refOutputs);
+            return;
+        default:
+            FAIL() << "Unknown RefStatus";
+            return;
+        }
+    }
+
+    // auto mode: golden -> GPU ref -> CPU ref -> SKIP+report.
+    //   capability miss falls through; a runtime error in a non-final ref is loud
+    //   but still falls through (keep verifying the engine); a runtime error in the
+    //   final ref (CPU) is a FAIL; a mismatch anywhere is a FAIL (never a second
+    //   opinion).
+    void runAutoMode()
+    {
+        auto engineOutputs = runEngineCapturingOutputs();
+        if(!engineOutputs)
+        {
+            if(!::testing::Test::HasFatalFailure())
+            {
+                GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
+            }
+            return;
+        }
+
+        if(_bundle->hasGoldenOutputs)
+        {
+            compareAgainstGolden(*engineOutputs);
+            return;
+        }
+
+        // GPU ref (non-final): capability miss or runtime error -> fall through.
+        {
+            OutputTensors refOutputs;
+            const RefRunResult gpu
+                = runReferenceCapturingOutputs(ReferenceExecutorType::GPU, refOutputs);
+            if(gpu.status == RefStatus::RAN)
+            {
+                compareOutputs(*engineOutputs, refOutputs);
+                return;
+            }
+            if(gpu.status == RefStatus::RUNTIME_ERROR)
+            {
+                // A reference that CAN run the op but failed is a reference bug:
+                // loud, but we still fall through to keep verifying the engine.
+                recordRefError("GPU reference errored (auto mode, falling through to CPU): "
+                               + gpu.message);
+            }
+        }
+
+        // CPU ref (final): capability miss -> unverifiable; runtime error -> FAIL.
+        {
+            OutputTensors refOutputs;
+            const RefRunResult cpu
+                = runReferenceCapturingOutputs(ReferenceExecutorType::CPU, refOutputs);
+            switch(cpu.status)
+            {
+            case RefStatus::CAPABILITY_MISS:
+                skipUnverifiable("no reference available (golden absent; GPU and CPU ref "
+                                 "cannot run this op): "
+                                 + cpu.message);
+                return;
+            case RefStatus::RUNTIME_ERROR:
+                recordRefError("CPU reference errored (auto mode, last resort): " + cpu.message);
+                FAIL() << "CPU reference errored (auto mode, last resort): " << cpu.message;
+                return;
+            case RefStatus::RAN:
+                compareOutputs(*engineOutputs, refOutputs);
+                return;
+            default:
+                FAIL() << "Unknown RefStatus";
+                return;
+            }
+        }
+    }
+
+    // ---- inputs -------------------------------------------------------------
+
+    // Ensures _bundle->tensors holds usable input data. tier 1/2: already loaded
+    // from disk. tier 3 (tensors == nullopt): try to synthesize inputs from the
+    // graph. Returns false (after recording + SKIP) when neither is possible.
+    bool ensureInputsAvailable()
+    {
+        if(_bundle->tensors.has_value())
+        {
+            return true; // inputs (and maybe golden outputs) loaded from disk
         }
+        return synthesizeInputs();
+    }
+
+    // tier-3 synthesis: single-node graph whose op has a registered initializer.
+    // Builds zeroed input tensors from graph attributes, routes each leaf input to
+    // its owning node's initializer, and fills them. Any refusal -> SKIP+report.
+    bool synthesizeInputs()
+    {
+        const auto wrapper = _bundle->graphWrapper();
 
-        const auto golden = extractGolden(tensorMap);
+        if(wrapper.nodeCount() != 1)
+        {
+            skipUnverifiable("graph-only bundle with no input data: input synthesis supports "
+                             "single-node graphs only (this graph has "
+                             + std::to_string(wrapper.nodeCount()) + " nodes)");
+            return false;
+        }
 
-        // Build the variant pack from the tensor map. Device tests use GPU
-        // pointers (rawDeviceData); CPU-only unit tests use host pointers so
-        // they can run on CI without a GPU.
+        const auto& node = wrapper.getNode(0);
+
+        // Leaf inputs = non-virtual tensors that are not graph outputs. (For a
+        // single-node graph every such tensor is an input to that node.)
+        const auto& tensorAttrMap = wrapper.getTensorMap();
+        const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
+                                           _bundle->outputTensorUids.end());
+
+        InputTensorMap inputs;
+        std::vector<int64_t> leafInputUids;
+        for(const auto& [uid, attrs] : tensorAttrMap)
+        {
+            if(attrs->virtual_() || outputUids.count(uid) != 0)
+            {
+                continue;
+            }
+            inputs[uid] = hipdnn_test_sdk::detail::createTensorFromAttribute(*attrs);
+            inputs[uid]->fillTensorWithValue(0.f);
+            leafInputUids.push_back(uid);
+        }
+
+        std::mt19937 rng(static_cast<std::mt19937::result_type>(
+            _bundle->metadata.seed.value_or(K_DEFAULT_SEED)));
+
+        const FillOutcome outcome = synthesizeNodeInputs(node, leafInputUids, inputs, rng);
+        if(!outcome.filled)
+        {
+            skipUnverifiable(outcome.reason);
+            return false;
+        }
+
+        _bundle->tensors = std::move(inputs);
+        return true;
+    }
+
+    // ---- engine + reference runs -------------------------------------------
+
+    // Allocate fresh zeroed output buffers (one ITensor per output uid) from the
+    // graph's tensor attributes — no .bin needed.
+    OutputTensors allocateZeroedOutputs() const
+    {
+        const auto wrapper = _bundle->graphWrapper();
+        const auto& tensorAttrMap = wrapper.getTensorMap();
+
+        OutputTensors outputs;
+        for(const int64_t uid : _bundle->outputTensorUids)
+        {
+            outputs[uid]
+                = hipdnn_test_sdk::detail::createTensorFromAttribute(*tensorAttrMap.at(uid));
+            outputs[uid]->fillTensorWithValue(0.f);
+        }
+        return outputs;
+    }
+
+    // Build a variant pack: inputs from _bundle->tensors, outputs from `outputs`.
+    // useDevice selects device vs host pointers (engine/GPU-ref use device; CPU-ref
+    // uses host). Inputs are read but never mark*Modified() (see class invariants).
+    std::unordered_map<int64_t, void*> buildVariantPack(OutputTensors& outputs,
+                                                        bool useDevice) const
+    {
         std::unordered_map<int64_t, void*> variantPack;
-        for(auto& [uid, tensor] : tensorMap)
+        const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
+                                           _bundle->outputTensorUids.end());
+
+        for(auto& [uid, tensor] : *_bundle->tensors)
         {
-            variantPack[uid] = _requiresDevice ? tensor->rawDeviceData() : tensor->rawHostData();
+            if(outputUids.count(uid) != 0)
+            {
+                continue; // golden output from disk; use the fresh buffer below instead
+            }
+            variantPack[uid] = useDevice ? tensor->rawDeviceData() : tensor->rawHostData();
         }
+        for(auto& [uid, tensor] : outputs)
+        {
+            variantPack[uid] = useDevice ? tensor->rawDeviceData() : tensor->rawHostData();
+        }
+        return variantPack;
+    }
 
-        // executeGraphThroughEngine signals "unsupported graph" by throwing;
-        // the harness translates that into a SKIP. ASSERT_NO_FATAL_FAILURE
-        // still wraps the call so that a genuine GTest assertion inside the
-        // executor FAILs rather than falling through to the comparison.
-        bool executorThrew = false;
-        std::string executorError;
+    // Run the engine into fresh output buffers. Returns nullopt if the engine
+    // signalled "unsupported graph" (SKIP already issued) or a fatal assertion
+    // fired inside the executor.
+    std::optional<OutputTensors> runEngineCapturingOutputs()
+    {
+        OutputTensors engineOutputs = allocateZeroedOutputs();
+        auto variantPack = buildVariantPack(engineOutputs, /*useDevice=*/_requiresDevice);
+
+        // Call the executor directly (not via ASSERT_NO_FATAL_FAILURE, which would
+        // `return;` and cannot compile in this value-returning function). A fatal
+        // ASSERT_* inside the executor returns from it and sets the fatal-failure
+        // flag, which we detect below and surface as nullopt.
+        bool threw = false;
+        std::string error;
         try
         {
-            ASSERT_NO_FATAL_FAILURE(executeGraphThroughEngine(variantPack));
+            executeGraphThroughEngine(variantPack);
         }
         catch(const std::exception& e)
         {
-            executorThrew = true;
-            executorError = e.what();
+            threw = true;
+            error = e.what();
         }
 
-        if(executorThrew)
+        if(::testing::Test::HasFatalFailure())
         {
-            GTEST_SKIP() << "Executor could not run bundle " << _bundlePath << ": "
-                         << executorError;
+            return std::nullopt;
+        }
+        if(threw)
+        {
+            // GTEST_SKIP contains `return;` which cannot compile in a non-void
+            // function. Callers detect nullopt and issue the skip themselves.
+            return std::nullopt;
+        }
+
+        markOutputsModified(engineOutputs);
+        return engineOutputs;
+    }
+
+    enum class RefStatus
+    {
+        RAN,
+        CAPABILITY_MISS,
+        RUNTIME_ERROR,
+    };
+    struct RefRunResult
+    {
+        RefStatus status;
+        std::string message;
+    };
+
+    // Run a reference executor into fresh output buffers `refOutputs`.
+    //   ReferenceCapabilityError -> CapabilityMiss (case A)
+    //   any other std::exception -> RuntimeError   (case C)
+    RefRunResult runReferenceCapturingOutputs(ReferenceExecutorType type, OutputTensors& refOutputs)
+    {
+        refOutputs = allocateZeroedOutputs();
+        const bool useDevice = (type == ReferenceExecutorType::GPU);
+        auto variantPack = buildVariantPack(refOutputs, useDevice);
+
+        try
+        {
+            runReferenceExecutor(type, variantPack);
+        }
+        catch(const ReferenceCapabilityError& e)
+        {
+            return {RefStatus::CAPABILITY_MISS, e.what()};
+        }
+        catch(const std::exception& e)
+        {
+            return {RefStatus::RUNTIME_ERROR, e.what()};
         }
 
-        for(auto uid : _bundle->outputTensorUids)
+        markOutputsModifiedFor(refOutputs, useDevice);
+        return {RefStatus::RAN, {}};
+    }
+
+    void markOutputsModified(OutputTensors& outputs) const
+    {
+        markOutputsModifiedFor(outputs, _requiresDevice);
+    }
+
+    static void markOutputsModifiedFor(OutputTensors& outputs, bool device)
+    {
+        for(auto& [uid, tensor] : outputs)
         {
-            if(_requiresDevice)
+            if(device)
             {
-                tensorMap.at(uid)->markDeviceModified();
+                tensor->markDeviceModified();
             }
             else
             {
-                tensorMap.at(uid)->markHostModified();
+                tensor->markHostModified();
             }
         }
+    }
+
+    // ---- comparison ---------------------------------------------------------
+
+    // Compare engine output against the golden outputs stored in _bundle->tensors.
+    void compareAgainstGolden(OutputTensors& engineOutputs)
+    {
+        compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
+            return *_bundle->tensors->at(uid);
+        });
+    }
 
+    void compareOutputs(OutputTensors& engineOutputs, OutputTensors& expected)
+    {
+        compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
+            return *expected.at(uid);
+        });
+    }
+
+    template <typename ExpectedLookup>
+    void compareEach(OutputTensors& engineOutputs, ExpectedLookup expectedFor)
+    {
         auto wrapper = _bundle->graphWrapper();
         const auto& tensorAttrMap = wrapper.getTensorMap();
 
-        for(auto uid : _bundle->outputTensorUids)
+        for(const int64_t uid : _bundle->outputTensorUids)
         {
-            auto& actualTensor = *tensorMap.at(uid);
-            auto& expectedTensor = *golden.at(uid);
+            auto& actualTensor = *engineOutputs.at(uid);
+            auto& expectedTensor = expectedFor(uid);
 
             auto* attrs = tensorAttrMap.at(uid);
-            auto dataType = attrs->data_type();
+            const auto dataType = attrs->data_type();
 
             float atol = 0.0f;
             float rtol = 0.0f;
@@ -257,9 +603,32 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         }
     }
 
-    // Compare one output tensor against its golden reference via the allClose
-    // validator (which covers both CPU and GPU validation paths). Only on failure
-    // do we compute and report the element-wise tensor diff for diagnostics.
+    // ---- reporting helpers --------------------------------------------------
+
+    void skipUnverifiable(const std::string& reason)
+    {
+        UnverifiableBundleReport::get().record(
+            _bundlePath.string(), reason, UnverifiableSeverity::UNVERIFIABLE);
+        GTEST_SKIP() << "Unverifiable: " << reason << " (" << _bundlePath << ")";
+    }
+
+    void recordRefError(const std::string& reason)
+    {
+        UnverifiableBundleReport::get().record(
+            _bundlePath.string(), reason, UnverifiableSeverity::REF_ERROR);
+    }
+
+    static std::string refLabel(ReferenceExecutorType type)
+    {
+        return type == ReferenceExecutorType::GPU ? "GPU reference" : "CPU reference";
+    }
+
+    static constexpr int64_t K_DEFAULT_SEED = 42;
+
+    // ---- comparison + tolerance machinery (unchanged behaviour) -------------
+
+    // Compare one output tensor against its expected reference via the allClose
+    // validator. Only on failure do we compute and report the element-wise diff.
     void compareOutputTensor(int64_t uid,
                              const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
                              hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
@@ -280,8 +649,6 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         }
     }
 
-    // Appends an element-wise diff summary for FP types; non-FP types get a
-    // generic note (computeTensorDiff has no integer specialization).
     static void
         appendTensorDiff(std::ostream& os,
                          int64_t uid,
@@ -329,8 +696,6 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         hipdnn_test_sdk::utilities::printTensorDiffSummary(os, labelFor(uid, attrs), summary);
     }
 
-    // The human-readable label for an output tensor: its name if it has one,
-    // otherwise "uid=N".
     static std::string labelFor(int64_t uid,
                                 const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs)
     {
@@ -338,10 +703,6 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         return (name != nullptr && !name->empty()) ? name->str() : ("uid=" + std::to_string(uid));
     }
 
-    // Common header for a failed comparison (RFC 0011 §4.3 "What a failure looks
-    // like"): bundle path, tensor UID/name, shape + dtype, and tolerance. The
-    // per-element diff (worst index, expected/actual/abs-diff, mismatch count) is
-    // appended by the caller from the TensorDiffSummary it already computed.
     std::string reportHeader(int64_t uid,
                              const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
                              hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
@@ -413,12 +774,9 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         }
     }
 
-    // A bundle graph may fuse several ops (e.g. Convolution + Pointwise
-    // activation). Each op type has its own numerical tolerance, so the only
-    // tolerance that holds for the fused output is the loosest one across all
-    // nodes: a tolerance tight enough for Conv (e.g. 1e-3) would wrongly fail an
-    // activation output that legitimately needs 1e-2. We therefore take the max
-    // tolerance over every node rather than picking a single "root" node.
+    // A bundle graph may fuse several ops; each op type has its own tolerance, so
+    // the only tolerance that holds for the fused output is the loosest one across
+    // all nodes. We therefore take the max over every node.
     static float deriveDefaultTolerance(
         const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
         hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
@@ -438,7 +796,6 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         return found ? maxTolerance : 1e-3f;
     }
 
-    // Dispatch a single node's tolerance lookup on the bundle's data type.
     static float toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
                                       hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
     {
@@ -461,10 +818,6 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
 
     void applyMetadataGuards() const
     {
-        // metadata is mandatory, so a loaded bundle always has it (a bundle with
-        // no .meta.json fails to load and never reaches here). Individual fields
-        // (VRAM, arch) are still optional within BundleMetadata; the guards below
-        // no-op when their field is absent, so they can be called unconditionally.
         if(auto reason = hipdnn_test_sdk::utilities::checkVramRequirement(
                _bundle->metadata, TestConfig::get().getCurrentDeviceVramMb()))
         {
diff --git a/dnn-providers/integration-tests/src/harness/golden/UnverifiableBundleReport.hpp b/dnn-providers/integration-tests/src/harness/golden/UnverifiableBundleReport.hpp
new file mode 100644
index 000000000000..d4f7ed908ad3
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/golden/UnverifiableBundleReport.hpp
@@ -0,0 +1,125 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <iostream>
+#include <mutex>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace hipdnn_integration_tests::golden
+{
+
+// Why a bundle could not be verified. The two severities are printed in separate
+// sections so a genuine reference bug is never lost among expected coverage gaps.
+//
+//   Unverifiable — expected coverage gap (no golden data, no reference can run
+//                  the op, inputs could not be synthesized, ...). The engine was
+//                  not accused; we simply had no oracle. Quiet but listed.
+//   RefError     — a reference executor that CAN run the op threw at runtime
+//                  (case C) and the harness fell through to keep verifying the
+//                  engine. This is a reference bug and must be loud.
+enum class UnverifiableSeverity
+{
+    UNVERIFIABLE,
+    REF_ERROR,
+};
+
+// Process-wide collector of bundles that ended a run without a verdict. Mirrors
+// SupportMatrixCollector: a thread-safe singleton populated during test execution
+// and printed once after RUN_ALL_TESTS(). Records do not affect the GTest exit
+// code — an unverifiable bundle SKIPs; this report is the visible trail.
+class UnverifiableBundleReport
+{
+public:
+    struct Record
+    {
+        std::string bundle; // bundle path / identifier
+        std::string reason; // human-readable explanation
+        UnverifiableSeverity severity;
+    };
+
+    static UnverifiableBundleReport& get()
+    {
+        static UnverifiableBundleReport s_instance;
+        return s_instance;
+    }
+
+    UnverifiableBundleReport(const UnverifiableBundleReport&) = delete;
+    UnverifiableBundleReport& operator=(const UnverifiableBundleReport&) = delete;
+    UnverifiableBundleReport(UnverifiableBundleReport&&) = delete;
+    UnverifiableBundleReport& operator=(UnverifiableBundleReport&&) = delete;
+
+    void record(std::string bundle, std::string reason, UnverifiableSeverity severity)
+    {
+        const std::lock_guard<std::mutex> lock(_mutex);
+        _records.push_back({std::move(bundle), std::move(reason), severity});
+    }
+
+    std::vector<Record> getRecords() const
+    {
+        const std::lock_guard<std::mutex> lock(_mutex);
+        return _records;
+    }
+
+    void reset()
+    {
+        const std::lock_guard<std::mutex> lock(_mutex);
+        _records.clear();
+    }
+
+    // Print both severity sections to `os`. No-op when nothing was recorded.
+    void print(std::ostream& os = std::cout) const
+    {
+        std::vector<Record> records;
+        {
+            const std::lock_guard<std::mutex> lock(_mutex);
+            records = _records;
+        }
+        if(records.empty())
+        {
+            return;
+        }
+
+        printSection(os, records, UnverifiableSeverity::REF_ERROR, "REFERENCE EXECUTOR ERRORS");
+        printSection(os, records, UnverifiableSeverity::UNVERIFIABLE, "UNVERIFIABLE BUNDLES");
+    }
+
+private:
+    UnverifiableBundleReport() = default;
+
+    static void printSection(std::ostream& os,
+                             const std::vector<Record>& records,
+                             UnverifiableSeverity severity,
+                             const char* heading)
+    {
+        size_t count = 0;
+        for(const auto& r : records)
+        {
+            if(r.severity == severity)
+            {
+                ++count;
+            }
+        }
+        if(count == 0)
+        {
+            return;
+        }
+
+        os << "\n==== " << heading << " (" << count << ") ====\n";
+        for(const auto& r : records)
+        {
+            if(r.severity == severity)
+            {
+                os << "  - " << r.bundle << ": " << r.reason << "\n";
+            }
+        }
+    }
+
+    mutable std::mutex _mutex;
+    std::vector<Record> _records;
+};
+
+} // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp b/dnn-providers/integration-tests/src/harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp
index 671dcb248d19..ab5aea01933f 100644
--- a/dnn-providers/integration-tests/src/harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp
+++ b/dnn-providers/integration-tests/src/harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp
@@ -8,6 +8,7 @@
 
 #include "detail/GpuPlanBuilderRegistry.hpp"
 #include "harness/IReferenceGraphExecutor.hpp"
+#include "harness/ReferenceCapabilityError.hpp"
 
 namespace hipdnn_integration_tests::gpu_graph_executor
 {
@@ -83,8 +84,8 @@ class GpuReferenceGraphExecutor : public IReferenceGraphExecutor
         {
             const std::string nodeName
                 = node.name() == nullptr ? " unknown" : " " + node.name()->str();
-            throw std::runtime_error("GPU plan builder is not applicable for the given node:"
-                                     + nodeName);
+            throw ReferenceCapabilityError("GPU plan builder is not applicable for the given node:"
+                                           + nodeName);
         }
 
         return planBuilder.buildNodePlan(graph, node);
@@ -123,15 +124,17 @@ class GpuReferenceGraphExecutor : public IReferenceGraphExecutor
         case NodeAttrs::BlockScaleQuantizeAttributes:
         {
             const std::string nodeName = node.name() == nullptr ? "unknown" : node.name()->str();
-            throw std::runtime_error("GPU plan not yet implemented for node '" + nodeName
-                                     + "'. Register a GPU plan for this operation type.");
+            throw ReferenceCapabilityError("GPU plan not yet implemented for node '" + nodeName
+                                           + "'. Register a GPU plan for this operation type.");
         }
 
         case NodeAttrs::CustomOpAttributes:
-            throw std::runtime_error("GPU reference executor does not support custom operations");
+            throw ReferenceCapabilityError(
+                "GPU reference executor does not support custom operations");
 
         default:
-            throw std::runtime_error("Unsupported node type for GPU signature key generation");
+            throw ReferenceCapabilityError(
+                "Unsupported node type for GPU signature key generation");
         }
     }
 
diff --git a/dnn-providers/integration-tests/src/main.cpp b/dnn-providers/integration-tests/src/main.cpp
index 47645cdbd9eb..9cb995f90a93 100644
--- a/dnn-providers/integration-tests/src/main.cpp
+++ b/dnn-providers/integration-tests/src/main.cpp
@@ -22,6 +22,7 @@
 #include "harness/SupportMatrixCollector.hpp"
 #include "harness/TestConfig.hpp"
 #include "harness/golden/BundleRegistration.hpp"
+#include "harness/golden/UnverifiableBundleReport.hpp"
 
 namespace
 {
@@ -96,10 +97,17 @@ int main(int argc, char** argv) noexcept
             .implicit_value(true)
             .help("Enable golden reference bundle test registration. "
                   "Can also be set via HIPDNN_TEST_ALLOW_BUNDLES=1 env var.");
-        parser.add_argument("--golden-data-dir")
+        parser.add_argument("--gd", "--golden-data-dir")
             .help("Path to the integration test bundle data directory. "
                   "Defaults to <exe>/../lib/integration_test_bundles/. "
                   "Can also be set via HIPDNN_TEST_GOLDEN_DATA_DIR env var.");
+        // --verification-mode governs BUNDLE tests (how the engine's output is
+        // verified). It is independent of --reference-executor, which governs the
+        // parameterized tests (which ref executor is exercised as the SUT).
+        parser.add_argument("--vm", "--verification-mode")
+            .help("How bundle engine output is verified: 'auto' (default; golden -> "
+                  "GPU ref -> CPU ref -> skip), 'golden', 'gpu', or 'cpu'. "
+                  "Can also be set via HIPDNN_TEST_VERIFICATION_MODE env var.");
 
         std::vector<std::string> remainingArgs;
         try
@@ -169,6 +177,22 @@ int main(int argc, char** argv) noexcept
             goldenDataDir = parser.get<std::string>("--golden-data-dir");
         }
 
+        // Parse --verification-mode (case-insensitive); invalid value -> exit 1.
+        std::optional<hipdnn_integration_tests::VerificationMode> verificationMode;
+        if(parser.is_used("--verification-mode"))
+        {
+            try
+            {
+                verificationMode = hipdnn_integration_tests::parseVerificationMode(
+                    parser.get<std::string>("--verification-mode"));
+            }
+            catch(const std::exception& e)
+            {
+                std::cerr << "Error: " << e.what() << '\n';
+                return 1;
+            }
+        }
+
         // Parse --test-article argument and load explicit plugin if provided
         std::optional<std::filesystem::path> articlePath;
         if(parser.is_used("--test-article"))
@@ -211,7 +235,8 @@ int main(int argc, char** argv) noexcept
                                                          std::move(configPath),
                                                          refExecType,
                                                          allowBundles,
-                                                         std::move(goldenDataDir));
+                                                         std::move(goldenDataDir),
+                                                         verificationMode);
 
         // Reconstruct argc/argv for GTest from remaining (unknown) args.
         // argv[0] (program name) must be first — GTest requires it.
@@ -271,6 +296,10 @@ int main(int argc, char** argv) noexcept
 
         const int result = RUN_ALL_TESTS();
 
+        // Print bundles that ended without a verdict (no oracle / reference bug).
+        // Informational only — these SKIP, so they do not affect `result`.
+        hipdnn_integration_tests::golden::UnverifiableBundleReport::get().print();
+
         // Generate support matrix if requested
         if(hipdnn_integration_tests::SupportMatrixCollector::get().isEnabled())
         {
diff --git a/dnn-providers/integration-tests/tests/TestTestConfig.cpp b/dnn-providers/integration-tests/tests/TestTestConfig.cpp
index 4f45e39c103a..a8674b3125b0 100644
--- a/dnn-providers/integration-tests/tests/TestTestConfig.cpp
+++ b/dnn-providers/integration-tests/tests/TestTestConfig.cpp
@@ -67,6 +67,29 @@ TEST(TestConfigUninitialized, GetReferenceExecutorTypeThrowsWhenUninitialized)
     EXPECT_THROW(TestConfig::get().getReferenceExecutorType(), std::runtime_error);
 }
 
+// parseVerificationMode is a free function (no singleton state), so it can be
+// exercised regardless of initialization.
+TEST(ParseVerificationMode, AcceptsAllValidValuesCaseInsensitive)
+{
+    using hipdnn_integration_tests::parseVerificationMode;
+    using hipdnn_integration_tests::VerificationMode;
+
+    EXPECT_EQ(parseVerificationMode("auto"), VerificationMode::AUTO);
+    EXPECT_EQ(parseVerificationMode("golden"), VerificationMode::GOLDEN);
+    EXPECT_EQ(parseVerificationMode("gpu"), VerificationMode::GPU);
+    EXPECT_EQ(parseVerificationMode("cpu"), VerificationMode::CPU);
+
+    EXPECT_EQ(parseVerificationMode("AUTO"), VerificationMode::AUTO);
+    EXPECT_EQ(parseVerificationMode("Golden"), VerificationMode::GOLDEN);
+    EXPECT_EQ(parseVerificationMode("GPU"), VerificationMode::GPU);
+}
+
+TEST(ParseVerificationMode, ThrowsOnInvalidValue)
+{
+    EXPECT_THROW(hipdnn_integration_tests::parseVerificationMode("bogus"), std::runtime_error);
+    EXPECT_THROW(hipdnn_integration_tests::parseVerificationMode(""), std::runtime_error);
+}
+
 // ---------------------------------------------------------------------------
 // Suite 2 – initialized singleton (all args provided)
 // ---------------------------------------------------------------------------
@@ -129,6 +152,13 @@ TEST_F(TestConfigInitialized, GetReferenceExecutorTypeDefaultsToCpu)
               hipdnn_integration_tests::ReferenceExecutorType::CPU);
 }
 
+TEST_F(TestConfigInitialized, GetVerificationModeDefaultsToAuto)
+{
+    // No CLI flag and (assuming) no env var -> AUTO.
+    EXPECT_EQ(TestConfig::get().getVerificationMode(),
+              hipdnn_integration_tests::VerificationMode::AUTO);
+}
+
 TEST_F(TestConfigInitialized, DoubleInitializeThrows)
 {
     EXPECT_THROW(TestConfig::initialize(std::nullopt, std::nullopt), std::runtime_error);

From 392e6285377647724f60291639c8ffe97504dbe5 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 13:23:55 -0400
Subject: [PATCH 03/18] Add tier-3 input synthesis with deny-by-default
 RoleAccounting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Graph-only bundles without input data get their inputs synthesized
per-op via free functions dispatched by node attribute type.
RoleAccounting enforces that every owned leaf input is accounted for
(FREE/STRUCTURED/DERIVED) — unaccounted inputs refuse the bundle.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../golden/input_init/RoleAccounting.hpp      | 149 ++++++++++++++++++
 .../golden/input_init/SynthesizeInputs.hpp    | 129 +++++++++++++++
 2 files changed, 278 insertions(+)
 create mode 100644 dnn-providers/integration-tests/src/harness/golden/input_init/RoleAccounting.hpp
 create mode 100644 dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp

diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/RoleAccounting.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/RoleAccounting.hpp
new file mode 100644
index 000000000000..c3ad564e4cc5
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/RoleAccounting.hpp
@@ -0,0 +1,149 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <set>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
+
+namespace hipdnn_integration_tests::golden
+{
+
+// Pre-allocated input tensors keyed by uid, handed to a fill function to populate.
+using InputTensorMap
+    = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
+
+// The outcome of trying to synthesize a node's inputs (tier-3 graph-only path).
+//
+//   filled == true  : every leaf input the node owns was given valid data.
+//   filled == false : at least one owned input is STRUCTURED, DERIVED, or
+//                     unrecognized. `reason` explains which — the harness SKIPs.
+struct FillOutcome
+{
+    bool filled = false;
+    std::string reason;
+
+    static FillOutcome ok()
+    {
+        return {true, {}};
+    }
+    static FillOutcome unsupported(std::string why)
+    {
+        return {false, std::move(why)};
+    }
+};
+
+// Drives a fill function's per-role declarations and enforces deny-by-default.
+//
+// An initializer declares, for each role it knows, whether that input is FREE
+// (fill from a numeric range), STRUCTURED (needs internal structure we cannot
+// synthesize), or DERIVED (must satisfy a relation to another computation). After
+// all declarations, finish() returns Filled only if EVERY owned leaf input was
+// accounted for — any owned uid that no declaration claimed is itself a refusal
+// (a role the initializer forgot, or a tensor it does not understand). This is the
+// safety net that prevents a half-filled input map from reaching an executor.
+//
+// Absent optional inputs are passed as uid 0 (the flatbuffer default) or simply
+// not present in `inputs`; such uids are ignored — only uids that are actually
+// owned leaf inputs need accounting.
+class RoleAccounting
+{
+public:
+    RoleAccounting(const std::vector<int64_t>& ownedLeafInputUids, InputTensorMap& inputs)
+        : _inputs(inputs)
+        , _owned(ownedLeafInputUids.begin(), ownedLeafInputUids.end())
+    {
+    }
+
+    // FREE role: if `uid` is an owned leaf input, fill it with uniform values in
+    // [lo, hi] and mark it accounted. A uid of 0 or one not in the owned set is
+    // ignored (an absent optional input). Uses the tensor's own dtype-aware random
+    // fill, so no std::visit on dtype is needed here.
+    void fillFree(int64_t uid, float lo, float hi, std::mt19937& rng)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        const auto seed = static_cast<unsigned int>(rng());
+        _inputs.at(uid)->fillTensorWithRandomValues(lo, hi, seed);
+        _accounted.insert(uid);
+    }
+
+    // STRUCTURED role: declares that `uid`, if owned, cannot be synthesized
+    // because it needs internal structure (sequence lengths, page tables, block
+    // masks, dropout seeds, ...). Records a refusal reason.
+    void markStructured(int64_t uid, const char* role)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        _accounted.insert(uid);
+        _refusals.push_back(std::string(role) + " (structured input)");
+    }
+
+    // DERIVED role: declares that `uid`, if owned, cannot be synthesized standalone
+    // because it must equal the output of another computation (e.g. SDPA-backward
+    // consumes the forward's O and softmax stats). Records a refusal reason.
+    void markDerived(int64_t uid, const char* role)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        _accounted.insert(uid);
+        _refusals.push_back(std::string(role) + " (derived from another computation)");
+    }
+
+    // Filled iff every owned leaf input was accounted AND none were refused.
+    // Otherwise Unsupported, listing the refused roles plus any owned uid no
+    // declaration claimed (the deny-by-default catch).
+    FillOutcome finish(const char* opName) const
+    {
+        std::vector<std::string> reasons = _refusals;
+        for(const int64_t uid : _owned)
+        {
+            if(_accounted.count(uid) == 0)
+            {
+                reasons.push_back("tensor uid=" + std::to_string(uid)
+                                  + " (no role declared by initializer)");
+            }
+        }
+
+        if(reasons.empty())
+        {
+            return FillOutcome::ok();
+        }
+
+        std::ostringstream os;
+        os << opName << " inputs cannot be synthesized: ";
+        for(size_t i = 0; i < reasons.size(); ++i)
+        {
+            os << (i == 0 ? "" : ", ") << reasons[i];
+        }
+        return FillOutcome::unsupported(os.str());
+    }
+
+private:
+    bool isOwned(int64_t uid) const
+    {
+        return uid != 0 && _owned.count(uid) != 0;
+    }
+
+    InputTensorMap& _inputs;
+    std::set<int64_t> _owned;
+    std::set<int64_t> _accounted;
+    std::vector<std::string> _refusals;
+};
+
+} // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
new file mode 100644
index 000000000000..2eaa27ff6e3a
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
@@ -0,0 +1,129 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "harness/golden/input_init/RoleAccounting.hpp"
+
+namespace hipdnn_integration_tests::golden
+{
+
+// ── Per-op fill functions ─────────────────────────────────────────────────────
+// To add an op: copy fillBatchnormInputs, adapt for your op's attributes, and
+// add one case to the switch in synthesizeNodeInputs() below. Each function
+// fills EVERY leaf input its node owns. The fill must be deterministic given
+// `rng` (seeded from BundleMetadata::seed) so a graph-only bundle reproduces
+// the same inputs across runs.
+
+// Batchnorm-inference: every input is FREE (fillable from a numeric range).
+// Ranges keep the op numerically well-behaved — inv_variance in [0.5, 1.5]
+// avoids the blow-up in y = (x-mean)*inv_var*scale+bias.
+inline FillOutcome fillBatchnormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                       const std::vector<int64_t>& ownedLeafInputUids,
+                                       InputTensorMap& inputs,
+                                       std::mt19937& rng)
+{
+    const auto* attrs = node.attributes_as_BatchnormInferenceAttributes();
+    if(attrs == nullptr)
+    {
+        return FillOutcome::unsupported(
+            "node is not BatchnormInferenceAttributes (initializer mis-registered)");
+    }
+
+    RoleAccounting acct(ownedLeafInputUids, inputs);
+    acct.fillFree(attrs->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->mean_tensor_uid(), -0.1f, 0.1f, rng);
+    acct.fillFree(attrs->inv_variance_tensor_uid(), 0.5f, 1.5f, rng);
+    acct.fillFree(attrs->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("BatchnormInference");
+}
+
+// SDPA-forward: Q/K/V/mask/scale are FREE; sequence lengths, page tables, block
+// masks, and dropout state are STRUCTURED (refused if present as leaf inputs).
+// A plain Q/K/V graph fills fine; the moment a STRUCTURED input is actually
+// present the bundle is refused (SKIP).
+inline FillOutcome fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                         const std::vector<int64_t>& ownedLeafInputUids,
+                                         InputTensorMap& inputs,
+                                         std::mt19937& rng)
+{
+    const auto* attrs = node.attributes_as_SdpaAttributes();
+    if(attrs == nullptr)
+    {
+        return FillOutcome::unsupported("node is not SdpaAttributes (initializer mis-registered)");
+    }
+
+    RoleAccounting acct(ownedLeafInputUids, inputs);
+
+    acct.fillFree(attrs->q_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->k_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->v_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+
+    acct.markStructured(attrs->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
+    acct.markStructured(attrs->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
+    acct.markStructured(attrs->page_table_k_tensor_uid().value_or(0), "page_table_k");
+    acct.markStructured(attrs->page_table_v_tensor_uid().value_or(0), "page_table_v");
+    acct.markStructured(attrs->block_mask_tensor_uid().value_or(0), "block_mask");
+    acct.markStructured(attrs->seed_tensor_uid().value_or(0), "dropout_seed");
+    acct.markStructured(attrs->offset_tensor_uid().value_or(0), "dropout_offset");
+
+    return acct.finish("Sdpa");
+}
+
+// SDPA-backward: Q/K/V/dO are FREE; O and stats are DERIVED (must match a
+// forward pass). A standalone backward graph-only bundle is refused; when
+// forward+backward are fused in one graph, O/stats are virtual inter-node edges
+// and never reach this function.
+inline FillOutcome fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                          const std::vector<int64_t>& ownedLeafInputUids,
+                                          InputTensorMap& inputs,
+                                          std::mt19937& rng)
+{
+    const auto* attrs = node.attributes_as_SdpaBackwardAttributes();
+    if(attrs == nullptr)
+    {
+        return FillOutcome::unsupported(
+            "node is not SdpaBackwardAttributes (initializer mis-registered)");
+    }
+
+    RoleAccounting acct(ownedLeafInputUids, inputs);
+
+    acct.fillFree(attrs->q_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->k_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->v_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(attrs->do_tensor_uid(), -1.0f, 1.0f, rng);
+
+    acct.markDerived(attrs->o_tensor_uid(), "o (forward output)");
+    acct.markDerived(attrs->stats_tensor_uid(), "stats (forward softmax stats)");
+
+    return acct.finish("SdpaBackward");
+}
+
+// ── Dispatch ──────────────────────────────────────────────────────────────────
+// Maps a node's attribute type to its fill function. Unknown ops return
+// unsupported — the harness SKIPs and records it in the unverifiable report.
+
+inline FillOutcome synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                        const std::vector<int64_t>& ownedLeafInputUids,
+                                        InputTensorMap& inputs,
+                                        std::mt19937& rng)
+{
+    using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
+
+    switch(node.attributes_type())
+    {
+    case NA::BatchnormInferenceAttributes:
+        return fillBatchnormInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::SdpaAttributes:
+        return fillSdpaForwardInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::SdpaBackwardAttributes:
+        return fillSdpaBackwardInputs(node, ownedLeafInputUids, inputs, rng);
+    default:
+        return FillOutcome::unsupported("no input synthesis registered for this op");
+    }
+}
+
+} // namespace hipdnn_integration_tests::golden

From 6552b4105cdf48ec88b740ed47e8169281c4d6fd Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 16:39:13 -0400
Subject: [PATCH 04/18] =?UTF-8?q?Rename=20RoleAccounting=20=E2=86=92=20Syn?=
 =?UTF-8?q?thesisTracker,=20expand=20synthesis=20to=20all=2019=20ops?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Rename FillOutcome → SynthesisResult and RoleAccounting → SynthesisTracker
  for clarity: SynthesisTracker tracks per-node input role declarations,
  SynthesisResult reports whether synthesis succeeded.
- Rename RoleAccounting.hpp → SynthesisTracker.hpp to match the class name.
- Expand input synthesis from 3 ops to all 19 supported attribute types
  (conv fwd/bwd/wrw, batchnorm inference/variance/training/backward,
  matmul, pointwise, reduction, layernorm fwd/bwd, rmsnorm fwd/bwd,
  resample, block-scale dequantize/quantize, sdpa fwd/bwd).
- Remove single-node graph restriction — harness now loops over all nodes
  in a graph, supporting fused graphs (e.g. conv+bias+relu).
- Improve comments throughout for reviewer clarity: explain what "owns"
  means, document the 4-step fill pattern, add per-function comments
  explaining why specific inputs are STRUCTURED or DERIVED.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ...raphGoldenReferenceVerificationHarness.hpp |  40 +-
 .../golden/input_init/RoleAccounting.hpp      | 149 ------
 .../golden/input_init/SynthesisTracker.hpp    | 170 ++++++
 .../golden/input_init/SynthesizeInputs.hpp    | 504 +++++++++++++++---
 4 files changed, 628 insertions(+), 235 deletions(-)
 delete mode 100644 dnn-providers/integration-tests/src/harness/golden/input_init/RoleAccounting.hpp
 create mode 100644 dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp

diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
index 4d5d8a75013b..3b5cd243219e 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
@@ -373,31 +373,19 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         return synthesizeInputs();
     }
 
-    // tier-3 synthesis: single-node graph whose op has a registered initializer.
-    // Builds zeroed input tensors from graph attributes, routes each leaf input to
-    // its owning node's initializer, and fills them. Any refusal -> SKIP+report.
+    // tier-3 synthesis: builds zeroed input tensors from graph attributes, walks
+    // every node (single-node or fused), and fills each node's owned leaf inputs.
+    // If any node's synthesis fails the whole graph is skipped.
     bool synthesizeInputs()
     {
         const auto wrapper = _bundle->graphWrapper();
-
-        if(wrapper.nodeCount() != 1)
-        {
-            skipUnverifiable("graph-only bundle with no input data: input synthesis supports "
-                             "single-node graphs only (this graph has "
-                             + std::to_string(wrapper.nodeCount()) + " nodes)");
-            return false;
-        }
-
-        const auto& node = wrapper.getNode(0);
-
-        // Leaf inputs = non-virtual tensors that are not graph outputs. (For a
-        // single-node graph every such tensor is an input to that node.)
         const auto& tensorAttrMap = wrapper.getTensorMap();
         const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
                                            _bundle->outputTensorUids.end());
 
+        // Leaf inputs = non-virtual tensors that are not graph outputs.
         InputTensorMap inputs;
-        std::vector<int64_t> leafInputUids;
+        std::vector<int64_t> allLeafInputUids;
         for(const auto& [uid, attrs] : tensorAttrMap)
         {
             if(attrs->virtual_() || outputUids.count(uid) != 0)
@@ -406,17 +394,25 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
             }
             inputs[uid] = hipdnn_test_sdk::detail::createTensorFromAttribute(*attrs);
             inputs[uid]->fillTensorWithValue(0.f);
-            leafInputUids.push_back(uid);
+            allLeafInputUids.push_back(uid);
         }
 
         std::mt19937 rng(static_cast<std::mt19937::result_type>(
             _bundle->metadata.seed.value_or(K_DEFAULT_SEED)));
 
-        const FillOutcome outcome = synthesizeNodeInputs(node, leafInputUids, inputs, rng);
-        if(!outcome.filled)
+        // Synthesize per node. In a fused graph (e.g. conv+bias+relu) each node
+        // owns a disjoint subset of the leaf inputs; virtual inter-node tensors
+        // are excluded above and handled by the engine at execution time.
+        for(uint32_t i = 0; i < wrapper.nodeCount(); ++i)
         {
-            skipUnverifiable(outcome.reason);
-            return false;
+            const auto& node = wrapper.getNode(i);
+            const SynthesisResult outcome
+                = synthesizeNodeInputs(node, allLeafInputUids, inputs, rng);
+            if(!outcome.filled)
+            {
+                skipUnverifiable(outcome.reason);
+                return false;
+            }
         }
 
         _bundle->tensors = std::move(inputs);
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/RoleAccounting.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/RoleAccounting.hpp
deleted file mode 100644
index c3ad564e4cc5..000000000000
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/RoleAccounting.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <random>
-#include <set>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include <hipdnn_data_sdk/utilities/Tensor.hpp>
-#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
-
-namespace hipdnn_integration_tests::golden
-{
-
-// Pre-allocated input tensors keyed by uid, handed to a fill function to populate.
-using InputTensorMap
-    = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
-
-// The outcome of trying to synthesize a node's inputs (tier-3 graph-only path).
-//
-//   filled == true  : every leaf input the node owns was given valid data.
-//   filled == false : at least one owned input is STRUCTURED, DERIVED, or
-//                     unrecognized. `reason` explains which — the harness SKIPs.
-struct FillOutcome
-{
-    bool filled = false;
-    std::string reason;
-
-    static FillOutcome ok()
-    {
-        return {true, {}};
-    }
-    static FillOutcome unsupported(std::string why)
-    {
-        return {false, std::move(why)};
-    }
-};
-
-// Drives a fill function's per-role declarations and enforces deny-by-default.
-//
-// An initializer declares, for each role it knows, whether that input is FREE
-// (fill from a numeric range), STRUCTURED (needs internal structure we cannot
-// synthesize), or DERIVED (must satisfy a relation to another computation). After
-// all declarations, finish() returns Filled only if EVERY owned leaf input was
-// accounted for — any owned uid that no declaration claimed is itself a refusal
-// (a role the initializer forgot, or a tensor it does not understand). This is the
-// safety net that prevents a half-filled input map from reaching an executor.
-//
-// Absent optional inputs are passed as uid 0 (the flatbuffer default) or simply
-// not present in `inputs`; such uids are ignored — only uids that are actually
-// owned leaf inputs need accounting.
-class RoleAccounting
-{
-public:
-    RoleAccounting(const std::vector<int64_t>& ownedLeafInputUids, InputTensorMap& inputs)
-        : _inputs(inputs)
-        , _owned(ownedLeafInputUids.begin(), ownedLeafInputUids.end())
-    {
-    }
-
-    // FREE role: if `uid` is an owned leaf input, fill it with uniform values in
-    // [lo, hi] and mark it accounted. A uid of 0 or one not in the owned set is
-    // ignored (an absent optional input). Uses the tensor's own dtype-aware random
-    // fill, so no std::visit on dtype is needed here.
-    void fillFree(int64_t uid, float lo, float hi, std::mt19937& rng)
-    {
-        if(!isOwned(uid))
-        {
-            return;
-        }
-        const auto seed = static_cast<unsigned int>(rng());
-        _inputs.at(uid)->fillTensorWithRandomValues(lo, hi, seed);
-        _accounted.insert(uid);
-    }
-
-    // STRUCTURED role: declares that `uid`, if owned, cannot be synthesized
-    // because it needs internal structure (sequence lengths, page tables, block
-    // masks, dropout seeds, ...). Records a refusal reason.
-    void markStructured(int64_t uid, const char* role)
-    {
-        if(!isOwned(uid))
-        {
-            return;
-        }
-        _accounted.insert(uid);
-        _refusals.push_back(std::string(role) + " (structured input)");
-    }
-
-    // DERIVED role: declares that `uid`, if owned, cannot be synthesized standalone
-    // because it must equal the output of another computation (e.g. SDPA-backward
-    // consumes the forward's O and softmax stats). Records a refusal reason.
-    void markDerived(int64_t uid, const char* role)
-    {
-        if(!isOwned(uid))
-        {
-            return;
-        }
-        _accounted.insert(uid);
-        _refusals.push_back(std::string(role) + " (derived from another computation)");
-    }
-
-    // Filled iff every owned leaf input was accounted AND none were refused.
-    // Otherwise Unsupported, listing the refused roles plus any owned uid no
-    // declaration claimed (the deny-by-default catch).
-    FillOutcome finish(const char* opName) const
-    {
-        std::vector<std::string> reasons = _refusals;
-        for(const int64_t uid : _owned)
-        {
-            if(_accounted.count(uid) == 0)
-            {
-                reasons.push_back("tensor uid=" + std::to_string(uid)
-                                  + " (no role declared by initializer)");
-            }
-        }
-
-        if(reasons.empty())
-        {
-            return FillOutcome::ok();
-        }
-
-        std::ostringstream os;
-        os << opName << " inputs cannot be synthesized: ";
-        for(size_t i = 0; i < reasons.size(); ++i)
-        {
-            os << (i == 0 ? "" : ", ") << reasons[i];
-        }
-        return FillOutcome::unsupported(os.str());
-    }
-
-private:
-    bool isOwned(int64_t uid) const
-    {
-        return uid != 0 && _owned.count(uid) != 0;
-    }
-
-    InputTensorMap& _inputs;
-    std::set<int64_t> _owned;
-    std::set<int64_t> _accounted;
-    std::vector<std::string> _refusals;
-};
-
-} // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
new file mode 100644
index 000000000000..e1ad75a80bb6
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
@@ -0,0 +1,170 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <set>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
+
+namespace hipdnn_integration_tests::golden
+{
+
+// Pre-allocated input tensors keyed by uid, handed to a fill function to populate.
+using InputTensorMap
+    = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
+
+// Result of synthesizeNodeInputs() for one node. filled==true means every
+// input the node owns got valid data. filled==false means at least one could
+// not be synthesized — reason says which and why.
+struct SynthesisResult
+{
+    bool filled = false;
+    std::string reason;
+
+    static SynthesisResult ok()
+    {
+        return {true, {}};
+    }
+    static SynthesisResult unsupported(std::string why)
+    {
+        return {false, std::move(why)};
+    }
+};
+
+// Tracks which inputs a node's fill function has accounted for. Each input must be
+// declared as one of three roles:
+//
+//   FREE       — random values in a range work. The range can be tight (e.g.
+//                variance in [0.5, 1.5] to stay positive) or wide (e.g. x in
+//                [-1, 1]). What matters is that any value in the range is valid.
+//   STRUCTURED — random values in any range won't work. The data needs to be
+//                consistent with other state or follow a specific format.
+//
+//                Example 1: dropout seeds — forward and backward must use the
+//                same seed so they generate the same drop pattern. A randomly
+//                synthesized seed for a standalone backward won't match any
+//                forward pass, producing wrong gradients.
+//
+//                Example 2: page table indices (paged attention) — when serving
+//                multiple users, each user's K and V data grows at different
+//                rates. Instead of pre-allocating a large contiguous block per
+//                user, GPU memory is pooled into equal-size chunks handed out
+//                on demand. A user's data ends up scattered across
+//                non-contiguous chunks. The page table tensor holds chunk
+//                indices telling the kernel where each user's data lives.
+//                Randomly generated indices would not correspond to valid
+//                allocated chunks, producing incorrect reads or crashes.
+//
+//                Example 3: peer_stats (multi-GPU batchnorm) — when a batch
+//                is split across multiple GPUs, each GPU computes local
+//                statistics (mean, variance) for its chunk. To produce
+//                correct global statistics, each GPU must read the others'
+//                partial results. The peer_stats tensor holds references to
+//                other GPUs' memory regions. Randomly generated values would
+//                point to invalid cross-device memory.
+// 
+//   DERIVED    — the value must come from another op's output, not from random
+//                generation (e.g. a backward pass needs the forward pass's output
+//                tensor and intermediate statistics to compute correct gradients).
+//
+// finish() succeeds only when every owned input was declared as some role AND
+// none were STRUCTURED or DERIVED. Undeclared inputs and refused inputs both
+// produce a diagnostic message so the caller knows what went wrong.
+class SynthesisTracker
+{
+public:
+    SynthesisTracker(const std::vector<int64_t>& ownedLeafInputUids, InputTensorMap& inputs)
+        : _inputs(inputs)
+        , _owned(ownedLeafInputUids.begin(), ownedLeafInputUids.end())
+    {
+    }
+
+    // Declares `uid` as FREE — fills it with random values in [lo, hi] and accounts for it.
+    void fillFree(int64_t uid, float lo, float hi, std::mt19937& rng)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        const auto seed = static_cast<unsigned int>(rng());
+        _inputs.at(uid)->fillTensorWithRandomValues(lo, hi, seed);
+        _accounted.insert(uid);
+    }
+
+    // Declares `uid` as STRUCTURED — accounts for it but records a refusal.
+    void markStructured(int64_t uid, const char* role)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        _accounted.insert(uid);
+        _refusals.push_back(std::string(role) + " (structured input)");
+    }
+
+    // Declares `uid` as DERIVED — accounts for it but records a refusal.
+    void markDerived(int64_t uid, const char* role)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        _accounted.insert(uid);
+        _refusals.push_back(std::string(role) + " (derived from another computation)");
+    }
+
+    // Returns ok() when all owned inputs were filled with random data.
+    // Returns unsupported() when synthesis cannot produce valid data for
+    // this node — either because an owned input is STRUCTURED/DERIVED
+    // (we know about it but can't fill it), or because an owned input was
+    // never declared (the fill function forgot about it).
+    // Note: absent optional tensors (uid 0) and virtual tensors are not
+    // owned, so STRUCTURED/DERIVED calls on them are silently ignored.
+    SynthesisResult finish(const char* opName) const
+    {
+        std::vector<std::string> reasons = _refusals;
+        for(const int64_t uid : _owned)
+        {
+            if(_accounted.count(uid) == 0)
+            {
+                reasons.push_back("tensor uid=" + std::to_string(uid)
+                                  + " (no role declared by initializer)");
+            }
+        }
+
+        if(reasons.empty())
+        {
+            return SynthesisResult::ok();
+        }
+
+        std::ostringstream os;
+        os << opName << " inputs cannot be synthesized: ";
+        for(size_t i = 0; i < reasons.size(); ++i)
+        {
+            os << (i == 0 ? "" : ", ") << reasons[i];
+        }
+        return SynthesisResult::unsupported(os.str());
+    }
+
+private:
+    bool isOwned(int64_t uid) const
+    {
+        return uid != 0 && _owned.count(uid) != 0;
+    }
+
+    InputTensorMap& _inputs; // leaf inputs only (non-virtual, non-output tensors)
+    std::set<int64_t> _owned;
+    std::set<int64_t> _accounted;
+    std::vector<std::string> _refusals;
+};
+
+} // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
index 2eaa27ff6e3a..6cfcc0ec0c35 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
@@ -3,110 +3,454 @@
 
 #pragma once
 
-#include "harness/golden/input_init/RoleAccounting.hpp"
+#include "harness/golden/input_init/SynthesisTracker.hpp"
 
 namespace hipdnn_integration_tests::golden
 {
 
 // ── Per-op fill functions ─────────────────────────────────────────────────────
-// To add an op: copy fillBatchnormInputs, adapt for your op's attributes, and
-// add one case to the switch in synthesizeNodeInputs() below. Each function
-// fills EVERY leaf input its node owns. The fill must be deterministic given
-// `rng` (seeded from BundleMetadata::seed) so a graph-only bundle reproduces
-// the same inputs across runs.
-
-// Batchnorm-inference: every input is FREE (fillable from a numeric range).
-// Ranges keep the op numerically well-behaved — inv_variance in [0.5, 1.5]
-// avoids the blow-up in y = (x-mean)*inv_var*scale+bias.
-inline FillOutcome fillBatchnormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+// Each function synthesizes inputs for one node in the graph. A node "owns" the
+// leaf input tensors declared in its flatbuffer attributes — virtual tensors
+// (inter-node edges in a fused graph) and output tensors are excluded.
+//
+// Every function follows the same pattern:
+//   1. Cast the node to its concrete attribute type.
+//   2. Create a SynthesisTracker with the node's owned uids.
+//   3. Declare each input as FREE (fill with random values), STRUCTURED (can't
+//      synthesize — needs specific format), or DERIVED (must come from another
+//      op's output). See SynthesisTracker.hpp for role definitions.
+//   4. Call finish() — returns ok() if all owned inputs were filled, or
+//      unsupported() with a diagnostic listing what couldn't be synthesized.
+//
+// Fills must be deterministic given `rng` so re-running the same graph produces
+// identical inputs for reproducible comparisons.
+//
+// To add a new op: copy fillConvFwdInputs (simplest example), adapt for your
+// op's attributes, and add one case to the switch in synthesizeNodeInputs().
+// Function names follow the pattern fill<AttributeName>Inputs.
+
+// ── Convolution ───────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillConvFwdInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                     const std::vector<int64_t>& ownedLeafInputUids,
+                                     InputTensorMap& inputs,
+                                     std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ConvolutionFwdAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not ConvolutionFwdAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->w_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("ConvolutionFwd");
+}
+
+inline SynthesisResult fillConvBwdDataInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                         const std::vector<int64_t>& ownedLeafInputUids,
+                                         InputTensorMap& inputs,
+                                         std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ConvolutionBwdAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not ConvolutionBwdAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->w_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("ConvolutionBwdData");
+}
+
+inline SynthesisResult fillConvBwdWeightsInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                            const std::vector<int64_t>& ownedLeafInputUids,
+                                            InputTensorMap& inputs,
+                                            std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ConvolutionWrwAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not ConvolutionWrwAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("ConvolutionBwdWeights");
+}
+
+// ── Batchnorm ─────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillBatchnormInferenceInputs(
+    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+    const std::vector<int64_t>& ownedLeafInputUids,
+    InputTensorMap& inputs,
+    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BatchnormInferenceAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not BatchnormInferenceAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->mean_tensor_uid(), -0.1f, 0.1f, rng);
+    acct.fillFree(a->inv_variance_tensor_uid(), 0.5f, 1.5f, rng);
+    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("BatchnormInference");
+}
+
+inline SynthesisResult fillBatchnormInferenceVarianceInputs(
+    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+    const std::vector<int64_t>& ownedLeafInputUids,
+    InputTensorMap& inputs,
+    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BatchnormInferenceAttributesVarianceExt();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not BatchnormInferenceAttributesVarianceExt");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->mean_tensor_uid(), -0.1f, 0.1f, rng);
+    acct.fillFree(a->variance_tensor_uid(), 0.5f, 1.5f, rng);
+    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    return acct.finish("BatchnormInferenceVarianceExt");
+}
+
+// peer_stats holds references to other GPUs' memory for multi-GPU batchnorm —
+// randomly generated values would point to invalid cross-device memory.
+inline SynthesisResult fillBatchnormTrainingInputs(
+    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+    const std::vector<int64_t>& ownedLeafInputUids,
+    InputTensorMap& inputs,
+    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BatchnormAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not BatchnormAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    acct.fillFree(a->prev_running_mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
+    acct.fillFree(a->prev_running_variance_tensor_uid().value_or(0), 0.5f, 1.5f, rng);
+    acct.fillFree(a->momentum_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
+
+    if(a->peer_stats_tensor_uid() != nullptr)
+    {
+        for(const int64_t uid : *a->peer_stats_tensor_uid())
+        {
+            acct.markStructured(uid, "peer_stats");
+        }
+    }
+
+    return acct.finish("BatchnormTraining");
+}
+
+// mean/inv_variance are optional (may come from forward). peer_stats: see above.
+inline SynthesisResult fillBatchnormBackwardInputs(
+    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+    const std::vector<int64_t>& ownedLeafInputUids,
+    InputTensorMap& inputs,
+    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BatchnormBackwardAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not BatchnormBackwardAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
+    acct.fillFree(a->inv_variance_tensor_uid().value_or(0), 0.5f, 1.5f, rng);
+    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+
+    if(a->peer_stats_tensor_uid() != nullptr)
+    {
+        for(const int64_t uid : *a->peer_stats_tensor_uid())
+        {
+            acct.markStructured(uid, "peer_stats");
+        }
+    }
+
+    return acct.finish("BatchnormBackward");
+}
+
+// ── Matmul ────────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillMatmulInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                    const std::vector<int64_t>& ownedLeafInputUids,
+                                    InputTensorMap& inputs,
+                                    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_MatmulAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not MatmulAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->a_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->b_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("Matmul");
+}
+
+// ── Pointwise ─────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillPointwiseInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
                                        const std::vector<int64_t>& ownedLeafInputUids,
                                        InputTensorMap& inputs,
                                        std::mt19937& rng)
 {
-    const auto* attrs = node.attributes_as_BatchnormInferenceAttributes();
-    if(attrs == nullptr)
+    const auto* a = node.attributes_as_PointwiseAttributes();
+    if(!a)
     {
-        return FillOutcome::unsupported(
-            "node is not BatchnormInferenceAttributes (initializer mis-registered)");
+        return SynthesisResult::unsupported("not PointwiseAttributes");
     }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->in_0_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->in_1_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    acct.fillFree(a->in_2_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    acct.fillFree(a->axis_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    return acct.finish("Pointwise");
+}
 
-    RoleAccounting acct(ownedLeafInputUids, inputs);
-    acct.fillFree(attrs->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->mean_tensor_uid(), -0.1f, 0.1f, rng);
-    acct.fillFree(attrs->inv_variance_tensor_uid(), 0.5f, 1.5f, rng);
-    acct.fillFree(attrs->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->bias_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("BatchnormInference");
+// ── Reduction ─────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillReductionInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                       const std::vector<int64_t>& ownedLeafInputUids,
+                                       InputTensorMap& inputs,
+                                       std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ReductionAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not ReductionAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->in_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("Reduction");
+}
+
+// ── LayerNorm ─────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillLayernormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                       const std::vector<int64_t>& ownedLeafInputUids,
+                                       InputTensorMap& inputs,
+                                       std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_LayernormAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not LayernormAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    return acct.finish("Layernorm");
+}
+
+// mean and inv_variance are computed by the forward pass — a standalone backward
+// can't produce correct gradients without them.
+inline SynthesisResult fillLayernormBackwardInputs(
+    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+    const std::vector<int64_t>& ownedLeafInputUids,
+    InputTensorMap& inputs,
+    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_LayernormBackwardAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not LayernormBackwardAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.markDerived(a->mean_tensor_uid().value_or(0), "mean (forward output)");
+    acct.markDerived(a->inv_variance_tensor_uid().value_or(0), "inv_variance (forward output)");
+    acct.fillFree(a->epsilon_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
+    return acct.finish("LayernormBackward");
+}
+
+// ── RMSNorm ───────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillRmsnormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                     const std::vector<int64_t>& ownedLeafInputUids,
+                                     InputTensorMap& inputs,
+                                     std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_RMSNormAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not RMSNormAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    acct.fillFree(a->bias_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    return acct.finish("RMSNorm");
+}
+
+// inv_rms is computed by the forward pass.
+inline SynthesisResult fillRmsnormBackwardInputs(
+    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+    const std::vector<int64_t>& ownedLeafInputUids,
+    InputTensorMap& inputs,
+    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_RMSNormBackwardAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not RMSNormBackwardAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.markDerived(a->inv_rms_tensor_uid(), "inv_rms (forward output)");
+    return acct.finish("RMSNormBackward");
+}
+
+// ── Resample ──────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillResampleFwdInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                         const std::vector<int64_t>& ownedLeafInputUids,
+                                         InputTensorMap& inputs,
+                                         std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ResampleFwdAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not ResampleFwdAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("ResampleFwd");
+}
+
+// ── Block-scale quantization ──────────────────────────────────────────────────
+
+// Scale tensor holds per-block quantization factors that must match the
+// quantized data — random scales would produce garbage dequantized values.
+inline SynthesisResult fillBlockScaleDequantizeInputs(
+    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+    const std::vector<int64_t>& ownedLeafInputUids,
+    InputTensorMap& inputs,
+    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BlockScaleDequantizeAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not BlockScaleDequantizeAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.markStructured(a->scale_tensor_uid(), "scale (block quantization scales)");
+    return acct.finish("BlockScaleDequantize");
 }
 
-// SDPA-forward: Q/K/V/mask/scale are FREE; sequence lengths, page tables, block
-// masks, and dropout state are STRUCTURED (refused if present as leaf inputs).
-// A plain Q/K/V graph fills fine; the moment a STRUCTURED input is actually
-// present the bundle is refused (SKIP).
-inline FillOutcome fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+inline SynthesisResult fillBlockScaleQuantizeInputs(
+    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+    const std::vector<int64_t>& ownedLeafInputUids,
+    InputTensorMap& inputs,
+    std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BlockScaleQuantizeAttributes();
+    if(!a)
+    {
+        return SynthesisResult::unsupported("not BlockScaleQuantizeAttributes");
+    }
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
+    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    return acct.finish("BlockScaleQuantize");
+}
+
+// ── SDPA ──────────────────────────────────────────────────────────────────────
+
+// Q/K/V/mask/scale accept random values. The remaining inputs are STRUCTURED:
+// seq lengths encode actual sequence boundaries, page tables map to allocated
+// GPU memory chunks, block masks define sparse attention patterns, and dropout
+// seed/offset must match between forward and backward passes.
+// Most of these are optional — absent ones (uid 0) are silently ignored.
+inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
                                          const std::vector<int64_t>& ownedLeafInputUids,
                                          InputTensorMap& inputs,
                                          std::mt19937& rng)
 {
-    const auto* attrs = node.attributes_as_SdpaAttributes();
-    if(attrs == nullptr)
+    const auto* a = node.attributes_as_SdpaAttributes();
+    if(!a)
     {
-        return FillOutcome::unsupported("node is not SdpaAttributes (initializer mis-registered)");
+        return SynthesisResult::unsupported("not SdpaAttributes");
     }
 
-    RoleAccounting acct(ownedLeafInputUids, inputs);
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
 
-    acct.fillFree(attrs->q_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->k_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->v_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+    acct.fillFree(a->q_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    acct.fillFree(a->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
 
-    acct.markStructured(attrs->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
-    acct.markStructured(attrs->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
-    acct.markStructured(attrs->page_table_k_tensor_uid().value_or(0), "page_table_k");
-    acct.markStructured(attrs->page_table_v_tensor_uid().value_or(0), "page_table_v");
-    acct.markStructured(attrs->block_mask_tensor_uid().value_or(0), "block_mask");
-    acct.markStructured(attrs->seed_tensor_uid().value_or(0), "dropout_seed");
-    acct.markStructured(attrs->offset_tensor_uid().value_or(0), "dropout_offset");
+    acct.markStructured(a->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
+    acct.markStructured(a->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
+    acct.markStructured(a->page_table_k_tensor_uid().value_or(0), "page_table_k");
+    acct.markStructured(a->page_table_v_tensor_uid().value_or(0), "page_table_v");
+    acct.markStructured(a->block_mask_tensor_uid().value_or(0), "block_mask");
+    acct.markStructured(a->seed_tensor_uid().value_or(0), "dropout_seed");
+    acct.markStructured(a->offset_tensor_uid().value_or(0), "dropout_offset");
 
     return acct.finish("Sdpa");
 }
 
-// SDPA-backward: Q/K/V/dO are FREE; O and stats are DERIVED (must match a
-// forward pass). A standalone backward graph-only bundle is refused; when
-// forward+backward are fused in one graph, O/stats are virtual inter-node edges
-// and never reach this function.
-inline FillOutcome fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+// Q/K/V/dO accept random values. O (the forward output) and stats (softmax
+// statistics) are DERIVED — they must come from a forward pass to produce
+// correct gradients. In a fused forward+backward graph these are virtual
+// inter-node tensors (not owned, so silently skipped). A standalone backward
+// without a forward is refused.
+inline SynthesisResult fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
                                           const std::vector<int64_t>& ownedLeafInputUids,
                                           InputTensorMap& inputs,
                                           std::mt19937& rng)
 {
-    const auto* attrs = node.attributes_as_SdpaBackwardAttributes();
-    if(attrs == nullptr)
+    const auto* a = node.attributes_as_SdpaBackwardAttributes();
+    if(!a)
     {
-        return FillOutcome::unsupported(
-            "node is not SdpaBackwardAttributes (initializer mis-registered)");
+        return SynthesisResult::unsupported("not SdpaBackwardAttributes");
     }
 
-    RoleAccounting acct(ownedLeafInputUids, inputs);
+    SynthesisTracker acct(ownedLeafInputUids, inputs);
 
-    acct.fillFree(attrs->q_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->k_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->v_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(attrs->do_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->q_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
+    acct.fillFree(a->do_tensor_uid(), -1.0f, 1.0f, rng);
 
-    acct.markDerived(attrs->o_tensor_uid(), "o (forward output)");
-    acct.markDerived(attrs->stats_tensor_uid(), "stats (forward softmax stats)");
+    acct.markDerived(a->o_tensor_uid(), "o (forward output)");
+    acct.markDerived(a->stats_tensor_uid(), "stats (forward softmax stats)");
 
     return acct.finish("SdpaBackward");
 }
 
 // ── Dispatch ──────────────────────────────────────────────────────────────────
-// Maps a node's attribute type to its fill function. Unknown ops return
-// unsupported — the harness SKIPs and records it in the unverifiable report.
+// Routes a node to its fill function based on the flatbuffer attribute type.
+// The harness calls this once per node in the graph — for a fused graph like
+// conv+bias+relu, each node is dispatched separately with only its own inputs.
+// Returns ok() when all of the node's inputs were filled, or unsupported() with
+// a diagnostic when the op is unrecognized or an input can't be synthesized.
 
-inline FillOutcome synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+inline SynthesisResult synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
                                         const std::vector<int64_t>& ownedLeafInputUids,
                                         InputTensorMap& inputs,
                                         std::mt19937& rng)
@@ -115,14 +459,46 @@ inline FillOutcome synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_objec
 
     switch(node.attributes_type())
     {
+    case NA::ConvolutionFwdAttributes:
+        return fillConvFwdInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::ConvolutionBwdAttributes:
+        return fillConvBwdDataInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::ConvolutionWrwAttributes:
+        return fillConvBwdWeightsInputs(node, ownedLeafInputUids, inputs, rng);
     case NA::BatchnormInferenceAttributes:
-        return fillBatchnormInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillBatchnormInferenceInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::BatchnormInferenceAttributesVarianceExt:
+        return fillBatchnormInferenceVarianceInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::BatchnormAttributes:
+        return fillBatchnormTrainingInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::BatchnormBackwardAttributes:
+        return fillBatchnormBackwardInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::MatmulAttributes:
+        return fillMatmulInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::PointwiseAttributes:
+        return fillPointwiseInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::ReductionAttributes:
+        return fillReductionInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::LayernormAttributes:
+        return fillLayernormInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::LayernormBackwardAttributes:
+        return fillLayernormBackwardInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::RMSNormAttributes:
+        return fillRmsnormInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::RMSNormBackwardAttributes:
+        return fillRmsnormBackwardInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::ResampleFwdAttributes:
+        return fillResampleFwdInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::BlockScaleDequantizeAttributes:
+        return fillBlockScaleDequantizeInputs(node, ownedLeafInputUids, inputs, rng);
+    case NA::BlockScaleQuantizeAttributes:
+        return fillBlockScaleQuantizeInputs(node, ownedLeafInputUids, inputs, rng);
     case NA::SdpaAttributes:
         return fillSdpaForwardInputs(node, ownedLeafInputUids, inputs, rng);
     case NA::SdpaBackwardAttributes:
         return fillSdpaBackwardInputs(node, ownedLeafInputUids, inputs, rng);
     default:
-        return FillOutcome::unsupported("no input synthesis registered for this op");
+        return SynthesisResult::unsupported("no input synthesis registered for this op");
     }
 }
 

From 967e995e182ac8f104324d217aec1b0a8fcaffd4 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 17:39:46 -0400
Subject: [PATCH 05/18] Split harness into .hpp + .cpp, add SynthesisTracker
 unit tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move all method implementations out of the 831-line header-only harness
into a .cpp file (~700 lines), leaving the header as declarations only
(~210 lines). The harness is an internal test executable with virtual
methods — no reason for it to be header-only.

Add 9 unit tests for SynthesisTracker covering: all-FREE success,
undeclared inputs, STRUCTURED/DERIVED refusals, uid-0 skip, non-owned
uid skip, empty owned set, mixed failures, and factory methods.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../integration-tests/CMakeLists.txt          |   1 +
 ...raphGoldenReferenceVerificationHarness.cpp | 698 +++++++++++++++++
 ...raphGoldenReferenceVerificationHarness.hpp | 729 ++----------------
 .../integration-tests/tests/CMakeLists.txt    |   2 +
 .../tests/TestSynthesisTracker.cpp            | 175 +++++
 5 files changed, 936 insertions(+), 669 deletions(-)
 create mode 100644 dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
 create mode 100644 dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp

diff --git a/dnn-providers/integration-tests/CMakeLists.txt b/dnn-providers/integration-tests/CMakeLists.txt
index c47dcbdd529f..1f2dd08974af 100644
--- a/dnn-providers/integration-tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/CMakeLists.txt
@@ -128,6 +128,7 @@ set(INTEGRATION_TESTS_EXE hipdnn_integration_tests)
 
 add_executable(${INTEGRATION_TESTS_EXE}
     src/main.cpp
+    src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
 )
 
 add_subdirectory(src/integration_tests)
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
new file mode 100644
index 000000000000..a9f5cdbd4d2d
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -0,0 +1,698 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
+
+#include <algorithm>
+#include <ostream>
+#include <random>
+#include <set>
+#include <sstream>
+
+#include <hipdnn_data_sdk/utilities/Workspace.hpp>
+#include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
+#include <hipdnn_frontend/Graph.hpp>
+#include <hipdnn_test_sdk/utilities/BundleMetadata.hpp>
+#include <hipdnn_test_sdk/utilities/CpuFpReferenceValidation.hpp>
+#include <hipdnn_test_sdk/utilities/FlatbufferDatatypeMapping.hpp>
+#include <hipdnn_test_sdk/utilities/TensorDiff.hpp>
+#include <hipdnn_test_sdk/utilities/TestTolerances.hpp>
+#include <hipdnn_test_sdk/utilities/detail/FlatbufferTensorAttributesUtils.hpp>
+
+#include "harness/CpuReferenceGraphExecutorAdapter.hpp"
+#include "harness/ReferenceCapabilityError.hpp"
+#include "harness/SharedHandle.hpp"
+#include "harness/TestConfig.hpp"
+#include "harness/golden/UnverifiableBundleReport.hpp"
+#include "harness/golden/input_init/SynthesizeInputs.hpp"
+#include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
+
+namespace hipdnn_integration_tests::golden
+{
+
+// ---- virtual defaults ------------------------------------------------------
+
+void IntegrationGraphGoldenReferenceVerificationHarness::executeGraphThroughEngine(
+    std::unordered_map<int64_t, void*>& variantPack)
+{
+    auto handle = getSharedHandle();
+
+    const std::vector<uint8_t> graphBytes(
+        _bundle->graphBuffer.data(), _bundle->graphBuffer.data() + _bundle->graphBuffer.size());
+
+    hipdnn_frontend::graph::Graph graph;
+    auto err = graph.from_binary(handle, graphBytes);
+    ASSERT_TRUE(err.is_good()) << "from_binary failed: " << err.get_message();
+
+    std::vector<int64_t> engineIds;
+    auto status = graph.get_ranked_engine_ids(engineIds);
+
+    const auto graphSummary = [&] {
+        return std::to_string(_bundle->outputTensorUids.size()) + " output tensor(s), "
+               + std::to_string(engineIds.size()) + " ranked engine(s)";
+    };
+
+    if(TestConfig::get().hasEngineName())
+    {
+        int64_t targetEngineId = TestConfig::get().getEngineId();
+        if(status.is_bad()
+           || std::find(engineIds.begin(), engineIds.end(), targetEngineId) == engineIds.end())
+        {
+            throw std::runtime_error("Engine " + std::string(TestConfig::get().getEngineName())
+                                     + " does not support this graph (" + graphSummary() + ")");
+        }
+        graph.set_preferred_engine_id_ext(targetEngineId);
+    }
+    else
+    {
+        if(status.is_bad() || engineIds.empty())
+        {
+            throw std::runtime_error("No engine supports this graph (" + graphSummary() + ")");
+        }
+    }
+
+    auto result = graph.create_execution_plans();
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+    result = graph.check_support();
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+    result = graph.build_plans();
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+
+    int64_t workspaceSize = 0;
+    result = graph.get_workspace_size(workspaceSize);
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+    ASSERT_GE(workspaceSize, 0);
+    const hipdnn_data_sdk::utilities::Workspace workspace(static_cast<size_t>(workspaceSize));
+
+    result = graph.execute(handle, variantPack, workspace.get());
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::runReferenceExecutor(
+    ReferenceExecutorType type, std::unordered_map<int64_t, void*>& variantPack)
+{
+    auto executor = makeReferenceExecutor(type);
+    executor->execute(_bundle->graphBuffer.data(), _bundle->graphBuffer.size(), variantPack);
+}
+
+std::unique_ptr<IReferenceGraphExecutor>
+    IntegrationGraphGoldenReferenceVerificationHarness::makeReferenceExecutor(
+        ReferenceExecutorType type)
+{
+    switch(type)
+    {
+    case ReferenceExecutorType::CPU:
+        return std::make_unique<CpuReferenceGraphExecutorAdapter>();
+    case ReferenceExecutorType::GPU:
+        return std::make_unique<gpu_graph_executor::GpuReferenceGraphExecutor>();
+    default:
+        throw std::runtime_error("Unknown reference executor type");
+    }
+}
+
+// ---- top-level dispatch ----------------------------------------------------
+
+void IntegrationGraphGoldenReferenceVerificationHarness::runComparison()
+{
+    if(_bundle->outputTensorUids.empty())
+    {
+        skipUnverifiable("bundle has no output tensors to compare");
+        return;
+    }
+
+    if(!ensureInputsAvailable())
+    {
+        return;
+    }
+
+    switch(TestConfig::get().getVerificationMode())
+    {
+    case VerificationMode::GOLDEN:
+        runGoldenMode();
+        return;
+    case VerificationMode::GPU:
+        runExplicitRefMode(ReferenceExecutorType::GPU);
+        return;
+    case VerificationMode::CPU:
+        runExplicitRefMode(ReferenceExecutorType::CPU);
+        return;
+    case VerificationMode::AUTO:
+        runAutoMode();
+        return;
+    default:
+        FAIL() << "Unknown verification mode";
+        return;
+    }
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::runGoldenMode()
+{
+    if(!_bundle->hasGoldenOutputs)
+    {
+        skipUnverifiable("no golden data (verification-mode=golden)");
+        return;
+    }
+    auto engineOutputs = runEngineCapturingOutputs();
+    if(!engineOutputs)
+    {
+        if(!::testing::Test::HasFatalFailure())
+        {
+            GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
+        }
+        return;
+    }
+    compareAgainstGolden(*engineOutputs);
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::runExplicitRefMode(
+    ReferenceExecutorType type)
+{
+    auto engineOutputs = runEngineCapturingOutputs();
+    if(!engineOutputs)
+    {
+        if(!::testing::Test::HasFatalFailure())
+        {
+            GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
+        }
+        return;
+    }
+
+    OutputTensors refOutputs;
+    const RefRunResult result = runReferenceCapturingOutputs(type, refOutputs);
+    switch(result.status)
+    {
+    case RefStatus::CAPABILITY_MISS:
+        skipUnverifiable(refLabel(type) + " cannot run this op: " + result.message);
+        return;
+    case RefStatus::RUNTIME_ERROR:
+        recordRefError(refLabel(type) + " errored: " + result.message);
+        FAIL() << refLabel(type) << " errored (verification-mode=" << refLabel(type)
+               << "): " << result.message;
+        return;
+    case RefStatus::RAN:
+        compareOutputs(*engineOutputs, refOutputs);
+        return;
+    default:
+        FAIL() << "Unknown RefStatus";
+        return;
+    }
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::runAutoMode()
+{
+    auto engineOutputs = runEngineCapturingOutputs();
+    if(!engineOutputs)
+    {
+        if(!::testing::Test::HasFatalFailure())
+        {
+            GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
+        }
+        return;
+    }
+
+    if(_bundle->hasGoldenOutputs)
+    {
+        compareAgainstGolden(*engineOutputs);
+        return;
+    }
+
+    // GPU ref (non-final): capability miss or runtime error -> fall through.
+    {
+        OutputTensors refOutputs;
+        const RefRunResult gpu
+            = runReferenceCapturingOutputs(ReferenceExecutorType::GPU, refOutputs);
+        if(gpu.status == RefStatus::RAN)
+        {
+            compareOutputs(*engineOutputs, refOutputs);
+            return;
+        }
+        if(gpu.status == RefStatus::RUNTIME_ERROR)
+        {
+            recordRefError("GPU reference errored (auto mode, falling through to CPU): "
+                           + gpu.message);
+        }
+    }
+
+    // CPU ref (final): capability miss -> unverifiable; runtime error -> FAIL.
+    {
+        OutputTensors refOutputs;
+        const RefRunResult cpu
+            = runReferenceCapturingOutputs(ReferenceExecutorType::CPU, refOutputs);
+        switch(cpu.status)
+        {
+        case RefStatus::CAPABILITY_MISS:
+            skipUnverifiable("no reference available (golden absent; GPU and CPU ref "
+                             "cannot run this op): "
+                             + cpu.message);
+            return;
+        case RefStatus::RUNTIME_ERROR:
+            recordRefError("CPU reference errored (auto mode, last resort): " + cpu.message);
+            FAIL() << "CPU reference errored (auto mode, last resort): " << cpu.message;
+            return;
+        case RefStatus::RAN:
+            compareOutputs(*engineOutputs, refOutputs);
+            return;
+        default:
+            FAIL() << "Unknown RefStatus";
+            return;
+        }
+    }
+}
+
+// ---- inputs ----------------------------------------------------------------
+
+bool IntegrationGraphGoldenReferenceVerificationHarness::ensureInputsAvailable()
+{
+    if(_bundle->tensors.has_value())
+    {
+        return true;
+    }
+    return synthesizeInputs();
+}
+
+bool IntegrationGraphGoldenReferenceVerificationHarness::synthesizeInputs()
+{
+    const auto wrapper = _bundle->graphWrapper();
+    const auto& tensorAttrMap = wrapper.getTensorMap();
+    const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
+                                       _bundle->outputTensorUids.end());
+
+    InputTensorMap inputs;
+    std::vector<int64_t> allLeafInputUids;
+    for(const auto& [uid, attrs] : tensorAttrMap)
+    {
+        if(attrs->virtual_() || outputUids.count(uid) != 0)
+        {
+            continue;
+        }
+        inputs[uid] = hipdnn_test_sdk::detail::createTensorFromAttribute(*attrs);
+        inputs[uid]->fillTensorWithValue(0.f);
+        allLeafInputUids.push_back(uid);
+    }
+
+    std::mt19937 rng(static_cast<std::mt19937::result_type>(
+        _bundle->metadata.seed.value_or(K_DEFAULT_SEED)));
+
+    for(uint32_t i = 0; i < wrapper.nodeCount(); ++i)
+    {
+        const auto& node = wrapper.getNode(i);
+        const SynthesisResult outcome
+            = synthesizeNodeInputs(node, allLeafInputUids, inputs, rng);
+        if(!outcome.filled)
+        {
+            skipUnverifiable(outcome.reason);
+            return false;
+        }
+    }
+
+    _bundle->tensors = std::move(inputs);
+    return true;
+}
+
+// ---- engine + reference runs -----------------------------------------------
+
+OutputTensors IntegrationGraphGoldenReferenceVerificationHarness::allocateZeroedOutputs() const
+{
+    const auto wrapper = _bundle->graphWrapper();
+    const auto& tensorAttrMap = wrapper.getTensorMap();
+
+    OutputTensors outputs;
+    for(const int64_t uid : _bundle->outputTensorUids)
+    {
+        outputs[uid]
+            = hipdnn_test_sdk::detail::createTensorFromAttribute(*tensorAttrMap.at(uid));
+        outputs[uid]->fillTensorWithValue(0.f);
+    }
+    return outputs;
+}
+
+std::unordered_map<int64_t, void*>
+    IntegrationGraphGoldenReferenceVerificationHarness::buildVariantPack(OutputTensors& outputs,
+                                                                         bool useDevice) const
+{
+    std::unordered_map<int64_t, void*> variantPack;
+    const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
+                                       _bundle->outputTensorUids.end());
+
+    for(auto& [uid, tensor] : *_bundle->tensors)
+    {
+        if(outputUids.count(uid) != 0)
+        {
+            continue;
+        }
+        variantPack[uid] = useDevice ? tensor->rawDeviceData() : tensor->rawHostData();
+    }
+    for(auto& [uid, tensor] : outputs)
+    {
+        variantPack[uid] = useDevice ? tensor->rawDeviceData() : tensor->rawHostData();
+    }
+    return variantPack;
+}
+
+std::optional<OutputTensors>
+    IntegrationGraphGoldenReferenceVerificationHarness::runEngineCapturingOutputs()
+{
+    OutputTensors engineOutputs = allocateZeroedOutputs();
+    auto variantPack = buildVariantPack(engineOutputs, /*useDevice=*/_requiresDevice);
+
+    bool threw = false;
+    std::string error;
+    try
+    {
+        executeGraphThroughEngine(variantPack);
+    }
+    catch(const std::exception& e)
+    {
+        threw = true;
+        error = e.what();
+    }
+
+    if(::testing::Test::HasFatalFailure())
+    {
+        return std::nullopt;
+    }
+    if(threw)
+    {
+        return std::nullopt;
+    }
+
+    markOutputsModified(engineOutputs);
+    return engineOutputs;
+}
+
+IntegrationGraphGoldenReferenceVerificationHarness::RefRunResult
+    IntegrationGraphGoldenReferenceVerificationHarness::runReferenceCapturingOutputs(
+        ReferenceExecutorType type, OutputTensors& refOutputs)
+{
+    refOutputs = allocateZeroedOutputs();
+    const bool useDevice = (type == ReferenceExecutorType::GPU);
+    auto variantPack = buildVariantPack(refOutputs, useDevice);
+
+    try
+    {
+        runReferenceExecutor(type, variantPack);
+    }
+    catch(const ReferenceCapabilityError& e)
+    {
+        return {RefStatus::CAPABILITY_MISS, e.what()};
+    }
+    catch(const std::exception& e)
+    {
+        return {RefStatus::RUNTIME_ERROR, e.what()};
+    }
+
+    markOutputsModifiedFor(refOutputs, useDevice);
+    return {RefStatus::RAN, {}};
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::markOutputsModified(
+    OutputTensors& outputs) const
+{
+    markOutputsModifiedFor(outputs, _requiresDevice);
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::markOutputsModifiedFor(
+    OutputTensors& outputs, bool device)
+{
+    for(auto& [uid, tensor] : outputs)
+    {
+        if(device)
+        {
+            tensor->markDeviceModified();
+        }
+        else
+        {
+            tensor->markHostModified();
+        }
+    }
+}
+
+// ---- comparison ------------------------------------------------------------
+
+void IntegrationGraphGoldenReferenceVerificationHarness::compareAgainstGolden(
+    OutputTensors& engineOutputs)
+{
+    compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
+        return *_bundle->tensors->at(uid);
+    });
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::compareOutputs(
+    OutputTensors& engineOutputs, OutputTensors& expected)
+{
+    compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
+        return *expected.at(uid);
+    });
+}
+
+template <typename ExpectedLookup>
+void IntegrationGraphGoldenReferenceVerificationHarness::compareEach(
+    OutputTensors& engineOutputs, ExpectedLookup expectedFor)
+{
+    auto wrapper = _bundle->graphWrapper();
+    const auto& tensorAttrMap = wrapper.getTensorMap();
+
+    for(const int64_t uid : _bundle->outputTensorUids)
+    {
+        auto& actualTensor = *engineOutputs.at(uid);
+        auto& expectedTensor = expectedFor(uid);
+
+        auto* attrs = tensorAttrMap.at(uid);
+        const auto dataType = attrs->data_type();
+
+        float atol = 0.0f;
+        float rtol = 0.0f;
+        resolveTolerances(wrapper, dataType, atol, rtol);
+
+        compareOutputTensor(uid, *attrs, dataType, expectedTensor, actualTensor, atol, rtol);
+    }
+}
+
+// ---- reporting helpers -----------------------------------------------------
+
+void IntegrationGraphGoldenReferenceVerificationHarness::skipUnverifiable(const std::string& reason)
+{
+    UnverifiableBundleReport::get().record(
+        _bundlePath.string(), reason, UnverifiableSeverity::UNVERIFIABLE);
+    GTEST_SKIP() << "Unverifiable: " << reason << " (" << _bundlePath << ")";
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::recordRefError(const std::string& reason)
+{
+    UnverifiableBundleReport::get().record(
+        _bundlePath.string(), reason, UnverifiableSeverity::REF_ERROR);
+}
+
+std::string IntegrationGraphGoldenReferenceVerificationHarness::refLabel(
+    ReferenceExecutorType type)
+{
+    return type == ReferenceExecutorType::GPU ? "GPU reference" : "CPU reference";
+}
+
+// ---- comparison + tolerance machinery --------------------------------------
+
+void IntegrationGraphGoldenReferenceVerificationHarness::compareOutputTensor(
+    int64_t uid,
+    const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+    hipdnn_data_sdk::utilities::ITensor& expected,
+    hipdnn_data_sdk::utilities::ITensor& actual,
+    float atol,
+    float rtol) const
+{
+    auto validator = hipdnn_test_sdk::utilities::createAllCloseValidator(dataType, atol, rtol);
+    const bool passed = validator->allClose(expected, actual);
+
+    if(!passed)
+    {
+        std::ostringstream report;
+        report << reportHeader(uid, attrs, dataType, expected, atol, rtol);
+        appendTensorDiff(report, uid, attrs, dataType, expected, actual, atol, rtol);
+        EXPECT_TRUE(false) << report.str();
+    }
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::appendTensorDiff(
+    std::ostream& os,
+    int64_t uid,
+    const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+    hipdnn_data_sdk::utilities::ITensor& expected,
+    hipdnn_data_sdk::utilities::ITensor& actual,
+    float atol,
+    float rtol)
+{
+    using DT = hipdnn_flatbuffers_sdk::data_objects::DataType;
+    using hipdnn_data_sdk::types::bfloat16;
+    using hipdnn_data_sdk::types::half;
+
+    switch(dataType)
+    {
+    case DT::FLOAT:
+        appendFpDiff<float>(os, uid, attrs, expected, actual, atol, rtol);
+        return;
+    case DT::HALF:
+        appendFpDiff<half>(os, uid, attrs, expected, actual, atol, rtol);
+        return;
+    case DT::BFLOAT16:
+        appendFpDiff<bfloat16>(os, uid, attrs, expected, actual, atol, rtol);
+        return;
+    case DT::DOUBLE:
+        appendFpDiff<double>(os, uid, attrs, expected, actual, atol, rtol);
+        return;
+    default:
+        os << "  (no element-wise diff available for this data type)\n";
+    }
+}
+
+template <typename T>
+void IntegrationGraphGoldenReferenceVerificationHarness::appendFpDiff(
+    std::ostream& os,
+    int64_t uid,
+    const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+    hipdnn_data_sdk::utilities::ITensor& expected,
+    hipdnn_data_sdk::utilities::ITensor& actual,
+    float atol,
+    float rtol)
+{
+    const auto summary
+        = hipdnn_test_sdk::utilities::computeTensorDiff<T>(expected, actual, atol, rtol);
+    hipdnn_test_sdk::utilities::printTensorDiffSummary(os, labelFor(uid, attrs), summary);
+}
+
+std::string IntegrationGraphGoldenReferenceVerificationHarness::labelFor(
+    int64_t uid, const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs)
+{
+    const auto* name = attrs.name();
+    return (name != nullptr && !name->empty()) ? name->str() : ("uid=" + std::to_string(uid));
+}
+
+std::string IntegrationGraphGoldenReferenceVerificationHarness::reportHeader(
+    int64_t uid,
+    const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+    hipdnn_data_sdk::utilities::ITensor& expected,
+    float atol,
+    float rtol) const
+{
+    std::ostringstream os;
+    os << "\nGolden comparison FAILED\n"
+       << "  Bundle: " << _bundlePath << "\n"
+       << "  Tensor: " << labelFor(uid, attrs) << " (UID " << uid << ", output)\n"
+       << "  Shape:  " << hipdnn_test_sdk::utilities::StreamVec(expected.dims()) << "  "
+       << dataTypeName(dataType) << "\n"
+       << "  Tolerance: atol=" << atol << " rtol=" << rtol << "\n";
+    return os.str();
+}
+
+std::string IntegrationGraphGoldenReferenceVerificationHarness::dataTypeName(
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
+{
+    return hipdnn_flatbuffers_sdk::data_objects::EnumNameDataType(dataType);
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::resolveTolerances(
+    const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+    float& atol,
+    float& rtol)
+{
+    const float defaultTolerance = deriveDefaultTolerance(wrapper, dataType);
+    atol = defaultTolerance;
+    rtol = defaultTolerance;
+}
+
+template <typename T>
+float IntegrationGraphGoldenReferenceVerificationHarness::toleranceForNodeAttributes(
+    hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType)
+{
+    using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
+    namespace tol = hipdnn_test_sdk::utilities;
+
+    switch(attrType)
+    {
+    case NA::ConvolutionFwdAttributes:
+        return tol::conv::getToleranceFwd<T>();
+    case NA::ConvolutionBwdAttributes:
+        return tol::conv::getToleranceBwd<T>();
+    case NA::ConvolutionWrwAttributes:
+        return tol::conv::getToleranceWrw<T>();
+    case NA::BatchnormInferenceAttributes:
+        return tol::batchnorm::getToleranceInference<T>();
+    case NA::BatchnormInferenceAttributesVarianceExt:
+        return tol::batchnorm::getToleranceInferenceWithVariance<T>();
+    case NA::BatchnormAttributes:
+        return tol::batchnorm::getToleranceTraining<T>();
+    case NA::BatchnormBackwardAttributes:
+        return tol::batchnorm::getToleranceBackward<T>();
+    case NA::MatmulAttributes:
+        return tol::matmul::getTolerance<T>();
+    case NA::ReductionAttributes:
+        return tol::reduction::getTolerance<T>();
+    case NA::RMSNormAttributes:
+        return tol::rmsnorm::getTolerance<T>();
+    case NA::PointwiseAttributes:
+        return tol::pointwise::getTolerance<T>();
+    case NA::LayernormAttributes:
+        return tol::layernorm::getTolerance<T>();
+    default:
+        return 1e-3f;
+    }
+}
+
+float IntegrationGraphGoldenReferenceVerificationHarness::deriveDefaultTolerance(
+    const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
+{
+    const auto nodeCount = wrapper.nodeCount();
+
+    bool found = false;
+    float maxTolerance = 0.0f;
+    for(uint32_t i = 0; i < nodeCount; ++i)
+    {
+        const auto attrType = wrapper.getNode(i).attributes_type();
+        const float nodeTolerance = toleranceForDataType(attrType, dataType);
+        maxTolerance = found ? std::max(maxTolerance, nodeTolerance) : nodeTolerance;
+        found = true;
+    }
+
+    return found ? maxTolerance : 1e-3f;
+}
+
+float IntegrationGraphGoldenReferenceVerificationHarness::toleranceForDataType(
+    hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
+{
+    using DT = hipdnn_flatbuffers_sdk::data_objects::DataType;
+    using hipdnn_data_sdk::types::bfloat16;
+    using hipdnn_data_sdk::types::half;
+
+    switch(dataType)
+    {
+    case DT::FLOAT:
+        return toleranceForNodeAttributes<float>(attrType);
+    case DT::HALF:
+        return toleranceForNodeAttributes<half>(attrType);
+    case DT::BFLOAT16:
+        return toleranceForNodeAttributes<bfloat16>(attrType);
+    default:
+        return 1e-3f;
+    }
+}
+
+void IntegrationGraphGoldenReferenceVerificationHarness::applyMetadataGuards() const
+{
+    if(auto reason = hipdnn_test_sdk::utilities::checkVramRequirement(
+           _bundle->metadata, TestConfig::get().getCurrentDeviceVramMb()))
+    {
+        GTEST_SKIP() << *reason;
+    }
+
+    if(auto reason = hipdnn_test_sdk::utilities::checkArchCompatibility(
+           _bundle->metadata, TestConfig::get().getCurrentArch()))
+    {
+        GTEST_SKIP() << *reason;
+    }
+}
+
+} // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
index 3b5cd243219e..539cd70f9d76 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
@@ -3,15 +3,11 @@
 
 #pragma once
 
-#include <algorithm>
 #include <cstdint>
 #include <filesystem>
+#include <iosfwd>
 #include <memory>
 #include <optional>
-#include <ostream>
-#include <random>
-#include <set>
-#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -19,27 +15,13 @@
 #include <gtest/gtest.h>
 
 #include <hipdnn_data_sdk/utilities/Tensor.hpp>
-#include <hipdnn_data_sdk/utilities/Workspace.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
 #include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
-#include <hipdnn_frontend/Graph.hpp>
-#include <hipdnn_test_sdk/utilities/BundleMetadata.hpp>
-#include <hipdnn_test_sdk/utilities/CpuFpReferenceValidation.hpp>
-#include <hipdnn_test_sdk/utilities/FlatbufferDatatypeMapping.hpp>
-#include <hipdnn_test_sdk/utilities/TensorDiff.hpp>
-#include <hipdnn_test_sdk/utilities/TestTolerances.hpp>
 #include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
-#include <hipdnn_test_sdk/utilities/detail/FlatbufferTensorAttributesUtils.hpp>
 
-#include "harness/CpuReferenceGraphExecutorAdapter.hpp"
 #include "harness/IReferenceGraphExecutor.hpp"
-#include "harness/ReferenceCapabilityError.hpp"
-#include "harness/SharedHandle.hpp"
 #include "harness/TestConfig.hpp"
-#include "harness/golden/BundleDiscovery.hpp"
 #include "harness/golden/IntegrationTestBundle.hpp"
-#include "harness/golden/UnverifiableBundleReport.hpp"
-#include "harness/golden/input_init/SynthesizeInputs.hpp"
-#include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
 
 namespace hipdnn_integration_tests::golden
 {
@@ -57,6 +39,10 @@ using OutputTensors
 //   actual   = the engine (the system under test), run once into fresh buffers.
 //   expected = golden data from disk, OR a reference executor's output.
 //
+// Auto mode fallback chain: golden → GPU ref → CPU ref → SKIP.
+// When golden outputs are present on disk, the comparison uses them directly
+// and no reference executor is run at all.
+//
 // Memory invariants for running engine + a reference off the same inputs:
 //   * INPUT tensors are read-only by both executors and are NEVER mark*Modified().
 //     The engine's rawDeviceData() uploads host->device (state becomes BOTH
@@ -75,9 +61,6 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
     {
     }
 
-    // The bundle is loaded once at registration time and shared into the test's
-    // factory; the harness does not load from disk. The path is kept only for
-    // diagnostic messages and the unverifiable report.
     void setBundle(std::shared_ptr<IntegrationTestBundle> bundle, std::filesystem::path path)
     {
         _bundle = std::move(bundle);
@@ -107,400 +90,25 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         runComparison();
     }
 
-    // Builds the graph from its serialized bytes, selects an engine (honouring an
-    // explicit --engine if given), builds plans, and executes into the variant
-    // pack. "Unsupported graph" is signalled by throwing (the harness translates
-    // that into a SKIP). Genuine build/execute errors use ASSERT_*.
-    virtual void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack)
-    {
-        auto handle = getSharedHandle();
-
-        const std::vector<uint8_t> graphBytes(
-            _bundle->graphBuffer.data(), _bundle->graphBuffer.data() + _bundle->graphBuffer.size());
-
-        hipdnn_frontend::graph::Graph graph;
-        auto err = graph.from_binary(handle, graphBytes);
-        ASSERT_TRUE(err.is_good()) << "from_binary failed: " << err.get_message();
-
-        std::vector<int64_t> engineIds;
-        auto status = graph.get_ranked_engine_ids(engineIds);
-
-        const auto graphSummary = [&] {
-            return std::to_string(_bundle->outputTensorUids.size()) + " output tensor(s), "
-                   + std::to_string(engineIds.size()) + " ranked engine(s)";
-        };
-
-        if(TestConfig::get().hasEngineName())
-        {
-            int64_t targetEngineId = TestConfig::get().getEngineId();
-            if(status.is_bad()
-               || std::find(engineIds.begin(), engineIds.end(), targetEngineId) == engineIds.end())
-            {
-                throw std::runtime_error("Engine " + std::string(TestConfig::get().getEngineName())
-                                         + " does not support this graph (" + graphSummary() + ")");
-            }
-            graph.set_preferred_engine_id_ext(targetEngineId);
-        }
-        else
-        {
-            if(status.is_bad() || engineIds.empty())
-            {
-                throw std::runtime_error("No engine supports this graph (" + graphSummary() + ")");
-            }
-        }
-
-        auto result = graph.create_execution_plans();
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-        result = graph.check_support();
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-        result = graph.build_plans();
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-
-        int64_t workspaceSize = 0;
-        result = graph.get_workspace_size(workspaceSize);
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-        ASSERT_GE(workspaceSize, 0);
-        const hipdnn_data_sdk::utilities::Workspace workspace(static_cast<size_t>(workspaceSize));
-
-        result = graph.execute(handle, variantPack, workspace.get());
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-    }
+    // Builds the graph, selects an engine, and executes. Throws on unsupported graph (→ SKIP).
+    virtual void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack);
 
-    // Runs a reference executor (the chosen expected-output source) into the given
-    // variant pack. Throws ReferenceCapabilityError when the executor has no plan
-    // for the op (capability miss, case A); throws any other exception for a
-    // genuine runtime failure (case C). Virtual so unit tests can stub it the same
-    // way they stub executeGraphThroughEngine.
+    // Runs the named reference executor. Throws ReferenceCapabilityError on capability miss.
     virtual void runReferenceExecutor(ReferenceExecutorType type,
-                                      std::unordered_map<int64_t, void*>& variantPack)
-    {
-        auto executor = makeReferenceExecutor(type);
-        executor->execute(_bundle->graphBuffer.data(), _bundle->graphBuffer.size(), variantPack);
-    }
+                                      std::unordered_map<int64_t, void*>& variantPack);
 
-    // Factory split out so a stub harness can short-circuit it. Default: the real
-    // CPU / GPU reference executors.
+    // Constructs the executor object (CpuReferenceGraphExecutorAdapter or
+    // GpuReferenceGraphExecutor) — does not allocate buffers or run anything.
+    // Skipped in auto mode when golden data is present.
     virtual std::unique_ptr<IReferenceGraphExecutor>
-        makeReferenceExecutor(ReferenceExecutorType type)
-    {
-        switch(type)
-        {
-        case ReferenceExecutorType::CPU:
-            return std::make_unique<CpuReferenceGraphExecutorAdapter>();
-        case ReferenceExecutorType::GPU:
-            return std::make_unique<gpu_graph_executor::GpuReferenceGraphExecutor>();
-        default:
-            throw std::runtime_error("Unknown reference executor type");
-        }
-    }
+        makeReferenceExecutor(ReferenceExecutorType type);
 
 private:
     bool _requiresDevice;
     std::filesystem::path _bundlePath;
     std::shared_ptr<IntegrationTestBundle> _bundle;
 
-    // ---- top-level dispatch -------------------------------------------------
-
-    void runComparison()
-    {
-        if(_bundle->outputTensorUids.empty())
-        {
-            skipUnverifiable("bundle has no output tensors to compare");
-            return;
-        }
-
-        if(!ensureInputsAvailable())
-        {
-            return; // skipUnverifiable already recorded + GTEST_SKIP issued
-        }
-
-        switch(TestConfig::get().getVerificationMode())
-        {
-        case VerificationMode::GOLDEN:
-            runGoldenMode();
-            return;
-        case VerificationMode::GPU:
-            runExplicitRefMode(ReferenceExecutorType::GPU);
-            return;
-        case VerificationMode::CPU:
-            runExplicitRefMode(ReferenceExecutorType::CPU);
-            return;
-        case VerificationMode::AUTO:
-            runAutoMode();
-            return;
-        default:
-            FAIL() << "Unknown verification mode";
-            return;
-        }
-    }
-
-    // golden mode: golden data only.
-    void runGoldenMode()
-    {
-        if(!_bundle->hasGoldenOutputs)
-        {
-            skipUnverifiable("no golden data (verification-mode=golden)");
-            return;
-        }
-        auto engineOutputs = runEngineCapturingOutputs();
-        if(!engineOutputs)
-        {
-            if(!::testing::Test::HasFatalFailure())
-            {
-                GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
-            }
-            return;
-        }
-        compareAgainstGolden(*engineOutputs);
-    }
-
-    // explicit gpu / cpu mode: ignore golden; compare against the named reference.
-    //   A (capability miss) -> SKIP+report
-    //   C (runtime error)   -> FAIL (the user named this reference)
-    //   B (mismatch)        -> FAIL
-    void runExplicitRefMode(ReferenceExecutorType type)
-    {
-        auto engineOutputs = runEngineCapturingOutputs();
-        if(!engineOutputs)
-        {
-            if(!::testing::Test::HasFatalFailure())
-            {
-                GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
-            }
-            return;
-        }
-
-        OutputTensors refOutputs;
-        const RefRunResult result = runReferenceCapturingOutputs(type, refOutputs);
-        switch(result.status)
-        {
-        case RefStatus::CAPABILITY_MISS:
-            skipUnverifiable(refLabel(type) + " cannot run this op: " + result.message);
-            return;
-        case RefStatus::RUNTIME_ERROR:
-            recordRefError(refLabel(type) + " errored: " + result.message);
-            FAIL() << refLabel(type) << " errored (verification-mode=" << refLabel(type)
-                   << "): " << result.message;
-            return;
-        case RefStatus::RAN:
-            compareOutputs(*engineOutputs, refOutputs);
-            return;
-        default:
-            FAIL() << "Unknown RefStatus";
-            return;
-        }
-    }
-
-    // auto mode: golden -> GPU ref -> CPU ref -> SKIP+report.
-    //   capability miss falls through; a runtime error in a non-final ref is loud
-    //   but still falls through (keep verifying the engine); a runtime error in the
-    //   final ref (CPU) is a FAIL; a mismatch anywhere is a FAIL (never a second
-    //   opinion).
-    void runAutoMode()
-    {
-        auto engineOutputs = runEngineCapturingOutputs();
-        if(!engineOutputs)
-        {
-            if(!::testing::Test::HasFatalFailure())
-            {
-                GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
-            }
-            return;
-        }
-
-        if(_bundle->hasGoldenOutputs)
-        {
-            compareAgainstGolden(*engineOutputs);
-            return;
-        }
-
-        // GPU ref (non-final): capability miss or runtime error -> fall through.
-        {
-            OutputTensors refOutputs;
-            const RefRunResult gpu
-                = runReferenceCapturingOutputs(ReferenceExecutorType::GPU, refOutputs);
-            if(gpu.status == RefStatus::RAN)
-            {
-                compareOutputs(*engineOutputs, refOutputs);
-                return;
-            }
-            if(gpu.status == RefStatus::RUNTIME_ERROR)
-            {
-                // A reference that CAN run the op but failed is a reference bug:
-                // loud, but we still fall through to keep verifying the engine.
-                recordRefError("GPU reference errored (auto mode, falling through to CPU): "
-                               + gpu.message);
-            }
-        }
-
-        // CPU ref (final): capability miss -> unverifiable; runtime error -> FAIL.
-        {
-            OutputTensors refOutputs;
-            const RefRunResult cpu
-                = runReferenceCapturingOutputs(ReferenceExecutorType::CPU, refOutputs);
-            switch(cpu.status)
-            {
-            case RefStatus::CAPABILITY_MISS:
-                skipUnverifiable("no reference available (golden absent; GPU and CPU ref "
-                                 "cannot run this op): "
-                                 + cpu.message);
-                return;
-            case RefStatus::RUNTIME_ERROR:
-                recordRefError("CPU reference errored (auto mode, last resort): " + cpu.message);
-                FAIL() << "CPU reference errored (auto mode, last resort): " << cpu.message;
-                return;
-            case RefStatus::RAN:
-                compareOutputs(*engineOutputs, refOutputs);
-                return;
-            default:
-                FAIL() << "Unknown RefStatus";
-                return;
-            }
-        }
-    }
-
-    // ---- inputs -------------------------------------------------------------
-
-    // Ensures _bundle->tensors holds usable input data. tier 1/2: already loaded
-    // from disk. tier 3 (tensors == nullopt): try to synthesize inputs from the
-    // graph. Returns false (after recording + SKIP) when neither is possible.
-    bool ensureInputsAvailable()
-    {
-        if(_bundle->tensors.has_value())
-        {
-            return true; // inputs (and maybe golden outputs) loaded from disk
-        }
-        return synthesizeInputs();
-    }
-
-    // tier-3 synthesis: builds zeroed input tensors from graph attributes, walks
-    // every node (single-node or fused), and fills each node's owned leaf inputs.
-    // If any node's synthesis fails the whole graph is skipped.
-    bool synthesizeInputs()
-    {
-        const auto wrapper = _bundle->graphWrapper();
-        const auto& tensorAttrMap = wrapper.getTensorMap();
-        const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
-                                           _bundle->outputTensorUids.end());
-
-        // Leaf inputs = non-virtual tensors that are not graph outputs.
-        InputTensorMap inputs;
-        std::vector<int64_t> allLeafInputUids;
-        for(const auto& [uid, attrs] : tensorAttrMap)
-        {
-            if(attrs->virtual_() || outputUids.count(uid) != 0)
-            {
-                continue;
-            }
-            inputs[uid] = hipdnn_test_sdk::detail::createTensorFromAttribute(*attrs);
-            inputs[uid]->fillTensorWithValue(0.f);
-            allLeafInputUids.push_back(uid);
-        }
-
-        std::mt19937 rng(static_cast<std::mt19937::result_type>(
-            _bundle->metadata.seed.value_or(K_DEFAULT_SEED)));
-
-        // Synthesize per node. In a fused graph (e.g. conv+bias+relu) each node
-        // owns a disjoint subset of the leaf inputs; virtual inter-node tensors
-        // are excluded above and handled by the engine at execution time.
-        for(uint32_t i = 0; i < wrapper.nodeCount(); ++i)
-        {
-            const auto& node = wrapper.getNode(i);
-            const SynthesisResult outcome
-                = synthesizeNodeInputs(node, allLeafInputUids, inputs, rng);
-            if(!outcome.filled)
-            {
-                skipUnverifiable(outcome.reason);
-                return false;
-            }
-        }
-
-        _bundle->tensors = std::move(inputs);
-        return true;
-    }
-
-    // ---- engine + reference runs -------------------------------------------
-
-    // Allocate fresh zeroed output buffers (one ITensor per output uid) from the
-    // graph's tensor attributes — no .bin needed.
-    OutputTensors allocateZeroedOutputs() const
-    {
-        const auto wrapper = _bundle->graphWrapper();
-        const auto& tensorAttrMap = wrapper.getTensorMap();
-
-        OutputTensors outputs;
-        for(const int64_t uid : _bundle->outputTensorUids)
-        {
-            outputs[uid]
-                = hipdnn_test_sdk::detail::createTensorFromAttribute(*tensorAttrMap.at(uid));
-            outputs[uid]->fillTensorWithValue(0.f);
-        }
-        return outputs;
-    }
-
-    // Build a variant pack: inputs from _bundle->tensors, outputs from `outputs`.
-    // useDevice selects device vs host pointers (engine/GPU-ref use device; CPU-ref
-    // uses host). Inputs are read but never mark*Modified() (see class invariants).
-    std::unordered_map<int64_t, void*> buildVariantPack(OutputTensors& outputs,
-                                                        bool useDevice) const
-    {
-        std::unordered_map<int64_t, void*> variantPack;
-        const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
-                                           _bundle->outputTensorUids.end());
-
-        for(auto& [uid, tensor] : *_bundle->tensors)
-        {
-            if(outputUids.count(uid) != 0)
-            {
-                continue; // golden output from disk; use the fresh buffer below instead
-            }
-            variantPack[uid] = useDevice ? tensor->rawDeviceData() : tensor->rawHostData();
-        }
-        for(auto& [uid, tensor] : outputs)
-        {
-            variantPack[uid] = useDevice ? tensor->rawDeviceData() : tensor->rawHostData();
-        }
-        return variantPack;
-    }
-
-    // Run the engine into fresh output buffers. Returns nullopt if the engine
-    // signalled "unsupported graph" (SKIP already issued) or a fatal assertion
-    // fired inside the executor.
-    std::optional<OutputTensors> runEngineCapturingOutputs()
-    {
-        OutputTensors engineOutputs = allocateZeroedOutputs();
-        auto variantPack = buildVariantPack(engineOutputs, /*useDevice=*/_requiresDevice);
-
-        // Call the executor directly (not via ASSERT_NO_FATAL_FAILURE, which would
-        // `return;` and cannot compile in this value-returning function). A fatal
-        // ASSERT_* inside the executor returns from it and sets the fatal-failure
-        // flag, which we detect below and surface as nullopt.
-        bool threw = false;
-        std::string error;
-        try
-        {
-            executeGraphThroughEngine(variantPack);
-        }
-        catch(const std::exception& e)
-        {
-            threw = true;
-            error = e.what();
-        }
-
-        if(::testing::Test::HasFatalFailure())
-        {
-            return std::nullopt;
-        }
-        if(threw)
-        {
-            // GTEST_SKIP contains `return;` which cannot compile in a non-void
-            // function. Callers detect nullopt and issue the skip themselves.
-            return std::nullopt;
-        }
-
-        markOutputsModified(engineOutputs);
-        return engineOutputs;
-    }
+    static constexpr int64_t K_DEFAULT_SEED = 42;
 
     enum class RefStatus
     {
@@ -514,136 +122,42 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         std::string message;
     };
 
-    // Run a reference executor into fresh output buffers `refOutputs`.
-    //   ReferenceCapabilityError -> CapabilityMiss (case A)
-    //   any other std::exception -> RuntimeError   (case C)
-    RefRunResult runReferenceCapturingOutputs(ReferenceExecutorType type, OutputTensors& refOutputs)
-    {
-        refOutputs = allocateZeroedOutputs();
-        const bool useDevice = (type == ReferenceExecutorType::GPU);
-        auto variantPack = buildVariantPack(refOutputs, useDevice);
+    // ── top-level dispatch ────────────────────────────────────────────────
+    void runComparison();
+    void runGoldenMode();
+    void runExplicitRefMode(ReferenceExecutorType type);
+    void runAutoMode();
 
-        try
-        {
-            runReferenceExecutor(type, variantPack);
-        }
-        catch(const ReferenceCapabilityError& e)
-        {
-            return {RefStatus::CAPABILITY_MISS, e.what()};
-        }
-        catch(const std::exception& e)
-        {
-            return {RefStatus::RUNTIME_ERROR, e.what()};
-        }
-
-        markOutputsModifiedFor(refOutputs, useDevice);
-        return {RefStatus::RAN, {}};
-    }
-
-    void markOutputsModified(OutputTensors& outputs) const
-    {
-        markOutputsModifiedFor(outputs, _requiresDevice);
-    }
-
-    static void markOutputsModifiedFor(OutputTensors& outputs, bool device)
-    {
-        for(auto& [uid, tensor] : outputs)
-        {
-            if(device)
-            {
-                tensor->markDeviceModified();
-            }
-            else
-            {
-                tensor->markHostModified();
-            }
-        }
-    }
+    // ── inputs ──────────────────────────────────────────────────────────
+    bool ensureInputsAvailable();
+    bool synthesizeInputs();
 
-    // ---- comparison ---------------------------------------------------------
-
-    // Compare engine output against the golden outputs stored in _bundle->tensors.
-    void compareAgainstGolden(OutputTensors& engineOutputs)
-    {
-        compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
-            return *_bundle->tensors->at(uid);
-        });
-    }
+    // ── buffer allocation + execution ───────────────────────────────────
+    // allocateZeroedOutputs / buildVariantPack prepare the buffers;
+    // runEngine* / runReference* call the executors and capture results.
+    OutputTensors allocateZeroedOutputs() const;
+    std::unordered_map<int64_t, void*> buildVariantPack(OutputTensors& outputs,
+                                                        bool useDevice) const;
+    std::optional<OutputTensors> runEngineCapturingOutputs();
+    RefRunResult runReferenceCapturingOutputs(ReferenceExecutorType type,
+                                              OutputTensors& refOutputs);
+    void markOutputsModified(OutputTensors& outputs) const;
+    static void markOutputsModifiedFor(OutputTensors& outputs, bool device);
 
-    void compareOutputs(OutputTensors& engineOutputs, OutputTensors& expected)
-    {
-        compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
-            return *expected.at(uid);
-        });
-    }
+    // ── comparison ──────────────────────────────────────────────────────
+    void compareAgainstGolden(OutputTensors& engineOutputs);
+    void compareOutputs(OutputTensors& engineOutputs, OutputTensors& expected);
 
     template <typename ExpectedLookup>
-    void compareEach(OutputTensors& engineOutputs, ExpectedLookup expectedFor)
-    {
-        auto wrapper = _bundle->graphWrapper();
-        const auto& tensorAttrMap = wrapper.getTensorMap();
-
-        for(const int64_t uid : _bundle->outputTensorUids)
-        {
-            auto& actualTensor = *engineOutputs.at(uid);
-            auto& expectedTensor = expectedFor(uid);
-
-            auto* attrs = tensorAttrMap.at(uid);
-            const auto dataType = attrs->data_type();
-
-            float atol = 0.0f;
-            float rtol = 0.0f;
-            resolveTolerances(wrapper, dataType, atol, rtol);
-
-            compareOutputTensor(uid, *attrs, dataType, expectedTensor, actualTensor, atol, rtol);
-        }
-    }
-
-    // ---- reporting helpers --------------------------------------------------
-
-    void skipUnverifiable(const std::string& reason)
-    {
-        UnverifiableBundleReport::get().record(
-            _bundlePath.string(), reason, UnverifiableSeverity::UNVERIFIABLE);
-        GTEST_SKIP() << "Unverifiable: " << reason << " (" << _bundlePath << ")";
-    }
-
-    void recordRefError(const std::string& reason)
-    {
-        UnverifiableBundleReport::get().record(
-            _bundlePath.string(), reason, UnverifiableSeverity::REF_ERROR);
-    }
-
-    static std::string refLabel(ReferenceExecutorType type)
-    {
-        return type == ReferenceExecutorType::GPU ? "GPU reference" : "CPU reference";
-    }
+    void compareEach(OutputTensors& engineOutputs, ExpectedLookup expectedFor);
 
-    static constexpr int64_t K_DEFAULT_SEED = 42;
-
-    // ---- comparison + tolerance machinery (unchanged behaviour) -------------
-
-    // Compare one output tensor against its expected reference via the allClose
-    // validator. Only on failure do we compute and report the element-wise diff.
     void compareOutputTensor(int64_t uid,
                              const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
                              hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
                              hipdnn_data_sdk::utilities::ITensor& expected,
                              hipdnn_data_sdk::utilities::ITensor& actual,
                              float atol,
-                             float rtol) const
-    {
-        auto validator = hipdnn_test_sdk::utilities::createAllCloseValidator(dataType, atol, rtol);
-        const bool passed = validator->allClose(expected, actual);
-
-        if(!passed)
-        {
-            std::ostringstream report;
-            report << reportHeader(uid, attrs, dataType, expected, atol, rtol);
-            appendTensorDiff(report, uid, attrs, dataType, expected, actual, atol, rtol);
-            EXPECT_TRUE(false) << report.str();
-        }
-    }
+                             float rtol) const;
 
     static void
         appendTensorDiff(std::ostream& os,
@@ -653,30 +167,7 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
                          hipdnn_data_sdk::utilities::ITensor& expected,
                          hipdnn_data_sdk::utilities::ITensor& actual,
                          float atol,
-                         float rtol)
-    {
-        using DT = hipdnn_flatbuffers_sdk::data_objects::DataType;
-        using hipdnn_data_sdk::types::bfloat16;
-        using hipdnn_data_sdk::types::half;
-
-        switch(dataType)
-        {
-        case DT::FLOAT:
-            appendFpDiff<float>(os, uid, attrs, expected, actual, atol, rtol);
-            return;
-        case DT::HALF:
-            appendFpDiff<half>(os, uid, attrs, expected, actual, atol, rtol);
-            return;
-        case DT::BFLOAT16:
-            appendFpDiff<bfloat16>(os, uid, attrs, expected, actual, atol, rtol);
-            return;
-        case DT::DOUBLE:
-            appendFpDiff<double>(os, uid, attrs, expected, actual, atol, rtol);
-            return;
-        default:
-            os << "  (no element-wise diff available for this data type)\n";
-        }
-    }
+                         float rtol);
 
     template <typename T>
     static void appendFpDiff(std::ostream& os,
@@ -685,147 +176,47 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
                              hipdnn_data_sdk::utilities::ITensor& expected,
                              hipdnn_data_sdk::utilities::ITensor& actual,
                              float atol,
-                             float rtol)
-    {
-        const auto summary
-            = hipdnn_test_sdk::utilities::computeTensorDiff<T>(expected, actual, atol, rtol);
-        hipdnn_test_sdk::utilities::printTensorDiffSummary(os, labelFor(uid, attrs), summary);
-    }
+                             float rtol);
 
-    static std::string labelFor(int64_t uid,
-                                const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs)
-    {
-        const auto* name = attrs.name();
-        return (name != nullptr && !name->empty()) ? name->str() : ("uid=" + std::to_string(uid));
-    }
+    // ── reporting ───────────────────────────────────────────────────────
+    void skipUnverifiable(const std::string& reason);
+    void recordRefError(const std::string& reason);
+    static std::string refLabel(ReferenceExecutorType type);
+
+    static std::string
+        labelFor(int64_t uid,
+                 const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs);
 
     std::string reportHeader(int64_t uid,
                              const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
                              hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
                              hipdnn_data_sdk::utilities::ITensor& expected,
                              float atol,
-                             float rtol) const
-    {
-        std::ostringstream os;
-        os << "\nGolden comparison FAILED\n"
-           << "  Bundle: " << _bundlePath << "\n"
-           << "  Tensor: " << labelFor(uid, attrs) << " (UID " << uid << ", output)\n"
-           << "  Shape:  " << hipdnn_test_sdk::utilities::StreamVec(expected.dims()) << "  "
-           << dataTypeName(dataType) << "\n"
-           << "  Tolerance: atol=" << atol << " rtol=" << rtol << "\n";
-        return os.str();
-    }
+                             float rtol) const;
 
-    static std::string dataTypeName(hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
-    {
-        return hipdnn_flatbuffers_sdk::data_objects::EnumNameDataType(dataType);
-    }
+    static std::string dataTypeName(hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
 
+    // ── tolerances ──────────────────────────────────────────────────────
     static void
         resolveTolerances(const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
                           hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
                           float& atol,
-                          float& rtol)
-    {
-        const float defaultTolerance = deriveDefaultTolerance(wrapper, dataType);
-        atol = defaultTolerance;
-        rtol = defaultTolerance;
-    }
+                          float& rtol);
 
     template <typename T>
     static float
-        toleranceForNodeAttributes(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType)
-    {
-        using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
-        namespace tol = hipdnn_test_sdk::utilities;
-
-        switch(attrType)
-        {
-        case NA::ConvolutionFwdAttributes:
-            return tol::conv::getToleranceFwd<T>();
-        case NA::ConvolutionBwdAttributes:
-            return tol::conv::getToleranceBwd<T>();
-        case NA::ConvolutionWrwAttributes:
-            return tol::conv::getToleranceWrw<T>();
-        case NA::BatchnormInferenceAttributes:
-            return tol::batchnorm::getToleranceInference<T>();
-        case NA::BatchnormInferenceAttributesVarianceExt:
-            return tol::batchnorm::getToleranceInferenceWithVariance<T>();
-        case NA::BatchnormAttributes:
-            return tol::batchnorm::getToleranceTraining<T>();
-        case NA::BatchnormBackwardAttributes:
-            return tol::batchnorm::getToleranceBackward<T>();
-        case NA::MatmulAttributes:
-            return tol::matmul::getTolerance<T>();
-        case NA::ReductionAttributes:
-            return tol::reduction::getTolerance<T>();
-        case NA::RMSNormAttributes:
-            return tol::rmsnorm::getTolerance<T>();
-        case NA::PointwiseAttributes:
-            return tol::pointwise::getTolerance<T>();
-        case NA::LayernormAttributes:
-            return tol::layernorm::getTolerance<T>();
-        default:
-            return 1e-3f;
-        }
-    }
+        toleranceForNodeAttributes(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType);
 
-    // A bundle graph may fuse several ops; each op type has its own tolerance, so
-    // the only tolerance that holds for the fused output is the loosest one across
-    // all nodes. We therefore take the max over every node.
     static float deriveDefaultTolerance(
         const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-        hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
-    {
-        const auto nodeCount = wrapper.nodeCount();
-
-        bool found = false;
-        float maxTolerance = 0.0f;
-        for(uint32_t i = 0; i < nodeCount; ++i)
-        {
-            const auto attrType = wrapper.getNode(i).attributes_type();
-            const float nodeTolerance = toleranceForDataType(attrType, dataType);
-            maxTolerance = found ? std::max(maxTolerance, nodeTolerance) : nodeTolerance;
-            found = true;
-        }
-
-        return found ? maxTolerance : 1e-3f;
-    }
-
-    static float toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
-                                      hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
-    {
-        using DT = hipdnn_flatbuffers_sdk::data_objects::DataType;
-        using hipdnn_data_sdk::types::bfloat16;
-        using hipdnn_data_sdk::types::half;
-
-        switch(dataType)
-        {
-        case DT::FLOAT:
-            return toleranceForNodeAttributes<float>(attrType);
-        case DT::HALF:
-            return toleranceForNodeAttributes<half>(attrType);
-        case DT::BFLOAT16:
-            return toleranceForNodeAttributes<bfloat16>(attrType);
-        default:
-            return 1e-3f;
-        }
-    }
+        hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
 
-    void applyMetadataGuards() const
-    {
-        if(auto reason = hipdnn_test_sdk::utilities::checkVramRequirement(
-               _bundle->metadata, TestConfig::get().getCurrentDeviceVramMb()))
-        {
-            GTEST_SKIP() << *reason;
-        }
+    static float
+        toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
+                             hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
 
-        if(auto reason = hipdnn_test_sdk::utilities::checkArchCompatibility(
-               _bundle->metadata, TestConfig::get().getCurrentArch()))
-        {
-            GTEST_SKIP() << *reason;
-        }
-    }
+    // ── guards ──────────────────────────────────────────────────────────
+    void applyMetadataGuards() const;
 };
 
 } // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/tests/CMakeLists.txt b/dnn-providers/integration-tests/tests/CMakeLists.txt
index eeb4e99d84bc..1bb8ab7e2cc9 100644
--- a/dnn-providers/integration-tests/tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/tests/CMakeLists.txt
@@ -3,6 +3,7 @@
 
 add_executable(hipdnn_integration_tests_unit_tests
     main.cpp
+    ../src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
     TestArchMatch.cpp
     TestBundleMetadata.cpp
     TestGraphDescription.cpp
@@ -18,6 +19,7 @@ add_executable(hipdnn_integration_tests_unit_tests
     TestBundleDiscovery.cpp
     TestVerificationPaths.cpp
     TestGoldenVerificationHarness.cpp
+    TestSynthesisTracker.cpp
 )
 
 target_include_directories(hipdnn_integration_tests_unit_tests
diff --git a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
new file mode 100644
index 000000000000..8799211b0d6a
--- /dev/null
+++ b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
@@ -0,0 +1,175 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+
+#include "harness/golden/input_init/SynthesisTracker.hpp"
+
+// NOLINTBEGIN(readability-identifier-naming)
+
+using namespace hipdnn_integration_tests::golden;
+
+namespace
+{
+
+InputTensorMap makeTensors(const std::vector<int64_t>& uids)
+{
+    InputTensorMap map;
+    for(const int64_t uid : uids)
+    {
+        map[uid] = std::make_unique<hipdnn_data_sdk::utilities::Tensor<float>>(
+            std::vector<int64_t>{2, 3}, std::vector<int64_t>{3, 1});
+        map[uid]->fillTensorWithValue(0.f);
+    }
+    return map;
+}
+
+} // namespace
+
+// All owned inputs declared FREE -> ok().
+TEST(TestSynthesisTracker, AllFreeSucceeds)
+{
+    auto inputs = makeTensors({1, 2, 3});
+    std::vector<int64_t> owned = {1, 2, 3};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.fillFree(2, -1.f, 1.f, rng);
+    tracker.fillFree(3, -1.f, 1.f, rng);
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+}
+
+// An owned input left undeclared -> unsupported().
+TEST(TestSynthesisTracker, UndeclaredInputFails)
+{
+    auto inputs = makeTensors({1, 2, 3});
+    std::vector<int64_t> owned = {1, 2, 3};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    // uid 2 and 3 never declared
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("uid=2"), std::string::npos);
+    EXPECT_NE(result.reason.find("uid=3"), std::string::npos);
+}
+
+// A STRUCTURED input -> unsupported() with diagnostic.
+TEST(TestSynthesisTracker, StructuredInputFails)
+{
+    auto inputs = makeTensors({1, 2});
+    std::vector<int64_t> owned = {1, 2};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.markStructured(2, "page_table");
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("page_table"), std::string::npos);
+    EXPECT_NE(result.reason.find("structured"), std::string::npos);
+}
+
+// A DERIVED input -> unsupported() with diagnostic.
+TEST(TestSynthesisTracker, DerivedInputFails)
+{
+    auto inputs = makeTensors({1, 2});
+    std::vector<int64_t> owned = {1, 2};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.markDerived(2, "forward_output");
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("forward_output"), std::string::npos);
+    EXPECT_NE(result.reason.find("derived"), std::string::npos);
+}
+
+// uid 0 (absent optional tensor) is silently ignored, not treated as owned.
+TEST(TestSynthesisTracker, ZeroUidIgnored)
+{
+    auto inputs = makeTensors({1});
+    std::vector<int64_t> owned = {1};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.markStructured(0, "absent_optional");
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+}
+
+// A uid not in the owned set is silently ignored.
+TEST(TestSynthesisTracker, NonOwnedUidIgnored)
+{
+    auto inputs = makeTensors({1, 99});
+    std::vector<int64_t> owned = {1};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.fillFree(99, -1.f, 1.f, rng); // not owned, ignored
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+}
+
+// Empty owned set -> ok() trivially (no inputs to account for).
+TEST(TestSynthesisTracker, EmptyOwnedSucceeds)
+{
+    InputTensorMap inputs;
+    std::vector<int64_t> owned;
+
+    SynthesisTracker tracker(owned, inputs);
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+}
+
+// Mixed: some FREE, one STRUCTURED, one undeclared -> both problems reported.
+TEST(TestSynthesisTracker, MixedFailuresReportAll)
+{
+    auto inputs = makeTensors({1, 2, 3});
+    std::vector<int64_t> owned = {1, 2, 3};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.markStructured(2, "seed");
+    // uid 3 undeclared
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("seed"), std::string::npos);
+    EXPECT_NE(result.reason.find("uid=3"), std::string::npos);
+}
+
+// SynthesisResult::ok() and ::unsupported() factory methods.
+TEST(TestSynthesisResult, FactoryMethods)
+{
+    const auto ok = SynthesisResult::ok();
+    EXPECT_TRUE(ok.filled);
+    EXPECT_TRUE(ok.reason.empty());
+
+    const auto bad = SynthesisResult::unsupported("cannot synthesize X");
+    EXPECT_FALSE(bad.filled);
+    EXPECT_EQ(bad.reason, "cannot synthesize X");
+}
+
+// NOLINTEND(readability-identifier-naming)

From ae7b665f630e2d6a058438e8c8345df752f67a63 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 17:53:51 -0400
Subject: [PATCH 06/18] Add verification mode dispatch tests with virtual
 getVerificationMode()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the harness's mode dispatch testable by reading the verification
mode through a protected virtual rather than directly from the TestConfig
singleton. Add 10 unit tests covering every branch: auto mode with and
without golden data, GPU→CPU fallback chain, golden/gpu/cpu explicit
modes, and capability-miss skip paths.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ...raphGoldenReferenceVerificationHarness.cpp |   7 +-
 ...raphGoldenReferenceVerificationHarness.hpp |   4 +
 .../integration-tests/tests/CMakeLists.txt    |   1 +
 .../tests/TestVerificationModePaths.cpp       | 403 ++++++++++++++++++
 4 files changed, 414 insertions(+), 1 deletion(-)
 create mode 100644 dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp

diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
index a9f5cdbd4d2d..b8f1d034bb61 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -112,6 +112,11 @@ std::unique_ptr<IReferenceGraphExecutor>
 
 // ---- top-level dispatch ----------------------------------------------------
 
+VerificationMode IntegrationGraphGoldenReferenceVerificationHarness::getVerificationMode() const
+{
+    return TestConfig::get().getVerificationMode();
+}
+
 void IntegrationGraphGoldenReferenceVerificationHarness::runComparison()
 {
     if(_bundle->outputTensorUids.empty())
@@ -125,7 +130,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runComparison()
         return;
     }
 
-    switch(TestConfig::get().getVerificationMode())
+    switch(getVerificationMode())
     {
     case VerificationMode::GOLDEN:
         runGoldenMode();
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
index 539cd70f9d76..e474c87f5926 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
@@ -103,6 +103,10 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
     virtual std::unique_ptr<IReferenceGraphExecutor>
         makeReferenceExecutor(ReferenceExecutorType type);
 
+    // Returns the active verification mode. Override in tests to inject a mode
+    // without touching the TestConfig singleton.
+    virtual VerificationMode getVerificationMode() const;
+
 private:
     bool _requiresDevice;
     std::filesystem::path _bundlePath;
diff --git a/dnn-providers/integration-tests/tests/CMakeLists.txt b/dnn-providers/integration-tests/tests/CMakeLists.txt
index 1bb8ab7e2cc9..c2dba6f219d0 100644
--- a/dnn-providers/integration-tests/tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/tests/CMakeLists.txt
@@ -20,6 +20,7 @@ add_executable(hipdnn_integration_tests_unit_tests
     TestVerificationPaths.cpp
     TestGoldenVerificationHarness.cpp
     TestSynthesisTracker.cpp
+    TestVerificationModePaths.cpp
 )
 
 target_include_directories(hipdnn_integration_tests_unit_tests
diff --git a/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp b/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
new file mode 100644
index 000000000000..2b34be263657
--- /dev/null
+++ b/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
@@ -0,0 +1,403 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+// Tests the verification mode dispatch logic in the harness:
+//
+//   AUTO mode:    golden → GPU ref → CPU ref → SKIP
+//   GOLDEN mode:  golden or SKIP
+//   GPU/CPU mode: explicit ref or SKIP/FAIL
+//
+// Each test overrides getVerificationMode() and the executor stubs to exercise
+// one branch without touching the TestConfig singleton.
+
+#include <gtest/gtest-spi.h>
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <vector>
+
+#include <hipdnn_test_sdk/utilities/FileUtilities.hpp>
+
+#include "harness/ReferenceCapabilityError.hpp"
+#include "harness/TestConfig.hpp"
+#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
+#include "harness/golden/IntegrationTestBundle.hpp"
+
+// NOLINTBEGIN(readability-identifier-naming)
+
+using namespace hipdnn_integration_tests;
+using namespace hipdnn_integration_tests::golden;
+
+namespace
+{
+
+using EngineStub = std::function<void(std::unordered_map<int64_t, void*>&)>;
+using RefStub = std::function<void(ReferenceExecutorType, std::unordered_map<int64_t, void*>&)>;
+
+class ModeTestableHarness : public IntegrationGraphGoldenReferenceVerificationHarness
+{
+public:
+    ModeTestableHarness(VerificationMode mode, EngineStub engineStub, RefStub refStub)
+        : IntegrationGraphGoldenReferenceVerificationHarness(/*requiresDevice=*/false)
+        , _mode(mode)
+        , _engineStub(std::move(engineStub))
+        , _refStub(std::move(refStub))
+    {
+    }
+
+    using IntegrationGraphGoldenReferenceVerificationHarness::SetUp;
+    using IntegrationGraphGoldenReferenceVerificationHarness::TestBody;
+
+protected:
+    VerificationMode getVerificationMode() const override
+    {
+        return _mode;
+    }
+
+    void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack) override
+    {
+        _engineStub(variantPack);
+    }
+
+    void runReferenceExecutor(ReferenceExecutorType type,
+                              std::unordered_map<int64_t, void*>& variantPack) override
+    {
+        _refStub(type, variantPack);
+    }
+
+    std::unique_ptr<IReferenceGraphExecutor>
+        makeReferenceExecutor(ReferenceExecutorType /*type*/) override
+    {
+        return nullptr;
+    }
+
+private:
+    VerificationMode _mode;
+    EngineStub _engineStub;
+    RefStub _refStub;
+};
+
+class TestVerificationModePathsFixture : public ::testing::Test
+{
+protected:
+    std::optional<hipdnn_test_sdk::utilities::ScopedDirectory> _scopedDir;
+    std::filesystem::path _tempDir;
+
+    void SetUp() override
+    {
+        auto path
+            = std::filesystem::temp_directory_path()
+              / ("vmode_test_"
+                 + std::to_string(::testing::UnitTest::GetInstance()->current_test_info()->line()));
+        std::filesystem::remove_all(path);
+        _scopedDir.emplace(path);
+        _tempDir = _scopedDir->path();
+    }
+
+    static constexpr float K_OUTPUT_VALUE = 3.5f;
+    static constexpr int64_t K_OUTPUT_UID = 5;
+    static constexpr size_t K_OUTPUT_ELEMS = 120;
+
+    static void writeBundleFiles(const std::filesystem::path& dir,
+                                 const std::string& name,
+                                 bool includeGoldenOutput)
+    {
+        std::filesystem::create_directories(dir);
+        std::ofstream(dir / (name + ".json"))
+            << R"({"nodes": [{"inputs": {"x_tensor_uid": 0, "mean_tensor_uid": 1, )"
+               R"("inv_variance_tensor_uid": 2, "scale_tensor_uid": 3, "bias_tensor_uid": 4}, )"
+               R"("outputs": {"y_tensor_uid": 5}, "type": "BatchnormInferenceAttributes", )"
+               R"("compute_data_type": "float", "name": ""}], "tensors": [)"
+               R"({"name": "", "uid": 0, "strides": [60, 20, 5, 1], "dims": [2, 3, 4, 5], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 1, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 2, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 3, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 4, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 5, "strides": [60, 20, 5, 1], "dims": [2, 3, 4, 5], )"
+               R"("data_type": "float", "virtual": false}], "io_data_type": "float", )"
+               R"("compute_data_type": "float", "intermediate_data_type": "float", "name": ""})";
+
+        std::ofstream(dir / (name + ".meta.json"))
+            << R"({"format_version": 1, "operation": "BatchnormInference"})";
+
+        const auto basePath = (dir / name).string();
+        const auto writeFloatBin = [&](int64_t uid, size_t elems, float value) {
+            const std::vector<float> data(elems, value);
+            std::ofstream out(basePath + ".tensor" + std::to_string(uid) + ".bin",
+                              std::ios::binary);
+            out.write(reinterpret_cast<const char*>(data.data()),
+                      static_cast<std::streamsize>(data.size() * sizeof(float)));
+        };
+
+        writeFloatBin(0, 120, 0.0f);
+        writeFloatBin(1, 3, 0.0f);
+        writeFloatBin(2, 3, 0.0f);
+        writeFloatBin(3, 3, 0.0f);
+        writeFloatBin(4, 3, 0.0f);
+
+        if(includeGoldenOutput)
+        {
+            writeFloatBin(K_OUTPUT_UID, K_OUTPUT_ELEMS, K_OUTPUT_VALUE);
+        }
+    }
+
+    std::shared_ptr<IntegrationTestBundle> loadBundle(const std::string& name,
+                                                      bool includeGoldenOutput) const
+    {
+        const auto dir = _tempDir / name;
+        writeBundleFiles(dir, name, includeGoldenOutput);
+        auto result = loadIntegrationTestBundle(dir / (name + ".json"));
+        EXPECT_TRUE(std::holds_alternative<IntegrationTestBundle>(result));
+        return std::make_shared<IntegrationTestBundle>(
+            std::move(std::get<IntegrationTestBundle>(result)));
+    }
+
+    static void writeOutput(std::unordered_map<int64_t, void*>& variantPack, float value)
+    {
+        auto* ptr = static_cast<float*>(variantPack.at(K_OUTPUT_UID));
+        std::fill(ptr, ptr + K_OUTPUT_ELEMS, value);
+    }
+
+    static void runCapturing(std::shared_ptr<IntegrationTestBundle> bundle,
+                             VerificationMode mode,
+                             EngineStub engineStub,
+                             RefStub refStub,
+                             ::testing::TestPartResultArray* results)
+    {
+        ModeTestableHarness harness(mode, std::move(engineStub), std::move(refStub));
+        harness.setBundle(std::move(bundle), "vmode-test-bundle");
+
+        const ::testing::ScopedFakeTestPartResultReporter reporter(
+            ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, results);
+        harness.SetUp();
+        harness.TestBody();
+    }
+
+    static bool anySkipped(const ::testing::TestPartResultArray& results)
+    {
+        for(int i = 0; i < results.size(); ++i)
+        {
+            if(results.GetTestPartResult(i).skipped())
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static bool anyFailed(const ::testing::TestPartResultArray& results)
+    {
+        for(int i = 0; i < results.size(); ++i)
+        {
+            if(results.GetTestPartResult(i).failed())
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static EngineStub matchingEngine()
+    {
+        return [](std::unordered_map<int64_t, void*>& vp) { writeOutput(vp, K_OUTPUT_VALUE); };
+    }
+
+    static EngineStub mismatchingEngine()
+    {
+        return [](std::unordered_map<int64_t, void*>& vp) {
+            writeOutput(vp, K_OUTPUT_VALUE + 100.0f);
+        };
+    }
+
+    static RefStub matchingRef()
+    {
+        return [](ReferenceExecutorType, std::unordered_map<int64_t, void*>& vp) {
+            writeOutput(vp, K_OUTPUT_VALUE);
+        };
+    }
+
+    static RefStub capabilityMissRef()
+    {
+        return [](ReferenceExecutorType, std::unordered_map<int64_t, void*>&) {
+            throw ReferenceCapabilityError("stub: unsupported op");
+        };
+    }
+
+    static RefStub gpuMissCpuMatchRef()
+    {
+        return [](ReferenceExecutorType type, std::unordered_map<int64_t, void*>& vp) {
+            if(type == ReferenceExecutorType::GPU)
+            {
+                throw ReferenceCapabilityError("stub: no GPU ref plan");
+            }
+            writeOutput(vp, K_OUTPUT_VALUE);
+        };
+    }
+};
+
+} // namespace
+
+// ── AUTO mode ───────────────────────────────────────────────────────────────
+
+TEST_F(TestVerificationModePathsFixture, AutoWithGoldenUsesGoldenAndPasses)
+{
+    ::testing::TestPartResultArray results;
+    bool refCalled = false;
+    runCapturing(
+        loadBundle("auto_golden", /*includeGoldenOutput=*/true),
+        VerificationMode::AUTO,
+        matchingEngine(),
+        [&](ReferenceExecutorType, std::unordered_map<int64_t, void*>&) { refCalled = true; },
+        &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+    EXPECT_FALSE(refCalled) << "Reference executor should NOT run when golden data is present";
+}
+
+TEST_F(TestVerificationModePathsFixture, AutoWithGoldenMismatchFails)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("auto_golden_mm", /*includeGoldenOutput=*/true),
+                 VerificationMode::AUTO,
+                 mismatchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_TRUE(anyFailed(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, AutoNoGoldenGpuRefSucceedsPasses)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("auto_gpu", /*includeGoldenOutput=*/false),
+                 VerificationMode::AUTO,
+                 matchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, AutoNoGoldenGpuMissFallsThroughToCpu)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("auto_fallthrough", /*includeGoldenOutput=*/false),
+                 VerificationMode::AUTO,
+                 matchingEngine(),
+                 gpuMissCpuMatchRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, AutoNoGoldenBothRefsMissSkips)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("auto_both_miss", /*includeGoldenOutput=*/false),
+                 VerificationMode::AUTO,
+                 matchingEngine(),
+                 capabilityMissRef(),
+                 &results);
+
+    EXPECT_TRUE(anySkipped(results));
+    EXPECT_FALSE(anyFailed(results));
+}
+
+// ── GOLDEN mode ─────────────────────────────────────────────────────────────
+
+TEST_F(TestVerificationModePathsFixture, GoldenModeWithDataPasses)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("golden_ok", /*includeGoldenOutput=*/true),
+                 VerificationMode::GOLDEN,
+                 matchingEngine(),
+                 capabilityMissRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, GoldenModeWithoutDataSkips)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("golden_absent", /*includeGoldenOutput=*/false),
+                 VerificationMode::GOLDEN,
+                 matchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_TRUE(anySkipped(results));
+    EXPECT_FALSE(anyFailed(results));
+}
+
+// ── Explicit GPU mode ───────────────────────────────────────────────────────
+
+TEST_F(TestVerificationModePathsFixture, GpuModeRefSucceedsPasses)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("gpu_ok", /*includeGoldenOutput=*/true),
+                 VerificationMode::GPU,
+                 matchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, GpuModeCapabilityMissSkips)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("gpu_miss", /*includeGoldenOutput=*/true),
+                 VerificationMode::GPU,
+                 matchingEngine(),
+                 capabilityMissRef(),
+                 &results);
+
+    EXPECT_TRUE(anySkipped(results));
+    EXPECT_FALSE(anyFailed(results));
+}
+
+// ── Explicit CPU mode ───────────────────────────────────────────────────────
+
+TEST_F(TestVerificationModePathsFixture, CpuModeRefSucceedsPasses)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("cpu_ok", /*includeGoldenOutput=*/true),
+                 VerificationMode::CPU,
+                 matchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, CpuModeCapabilityMissSkips)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("cpu_miss", /*includeGoldenOutput=*/true),
+                 VerificationMode::CPU,
+                 matchingEngine(),
+                 capabilityMissRef(),
+                 &results);
+
+    EXPECT_TRUE(anySkipped(results));
+    EXPECT_FALSE(anyFailed(results));
+}
+
+// NOLINTEND(readability-identifier-naming)

From dec13b172f4681b5eb32558a56dde5c78f83fd1d Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 18:30:58 -0400
Subject: [PATCH 07/18] Fix clang-tidy misc-const-correctness in
 TestSynthesisTracker

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../tests/TestSynthesisTracker.cpp             | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
index 8799211b0d6a..9f016866b2c8 100644
--- a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
+++ b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
@@ -37,7 +37,7 @@ InputTensorMap makeTensors(const std::vector<int64_t>& uids)
 TEST(TestSynthesisTracker, AllFreeSucceeds)
 {
     auto inputs = makeTensors({1, 2, 3});
-    std::vector<int64_t> owned = {1, 2, 3};
+    const std::vector<int64_t> owned = {1, 2, 3};
     std::mt19937 rng(42);
 
     SynthesisTracker tracker(owned, inputs);
@@ -53,7 +53,7 @@ TEST(TestSynthesisTracker, AllFreeSucceeds)
 TEST(TestSynthesisTracker, UndeclaredInputFails)
 {
     auto inputs = makeTensors({1, 2, 3});
-    std::vector<int64_t> owned = {1, 2, 3};
+    const std::vector<int64_t> owned = {1, 2, 3};
     std::mt19937 rng(42);
 
     SynthesisTracker tracker(owned, inputs);
@@ -70,7 +70,7 @@ TEST(TestSynthesisTracker, UndeclaredInputFails)
 TEST(TestSynthesisTracker, StructuredInputFails)
 {
     auto inputs = makeTensors({1, 2});
-    std::vector<int64_t> owned = {1, 2};
+    const std::vector<int64_t> owned = {1, 2};
     std::mt19937 rng(42);
 
     SynthesisTracker tracker(owned, inputs);
@@ -87,7 +87,7 @@ TEST(TestSynthesisTracker, StructuredInputFails)
 TEST(TestSynthesisTracker, DerivedInputFails)
 {
     auto inputs = makeTensors({1, 2});
-    std::vector<int64_t> owned = {1, 2};
+    const std::vector<int64_t> owned = {1, 2};
     std::mt19937 rng(42);
 
     SynthesisTracker tracker(owned, inputs);
@@ -104,7 +104,7 @@ TEST(TestSynthesisTracker, DerivedInputFails)
 TEST(TestSynthesisTracker, ZeroUidIgnored)
 {
     auto inputs = makeTensors({1});
-    std::vector<int64_t> owned = {1};
+    const std::vector<int64_t> owned = {1};
     std::mt19937 rng(42);
 
     SynthesisTracker tracker(owned, inputs);
@@ -119,7 +119,7 @@ TEST(TestSynthesisTracker, ZeroUidIgnored)
 TEST(TestSynthesisTracker, NonOwnedUidIgnored)
 {
     auto inputs = makeTensors({1, 99});
-    std::vector<int64_t> owned = {1};
+    const std::vector<int64_t> owned = {1};
     std::mt19937 rng(42);
 
     SynthesisTracker tracker(owned, inputs);
@@ -134,9 +134,9 @@ TEST(TestSynthesisTracker, NonOwnedUidIgnored)
 TEST(TestSynthesisTracker, EmptyOwnedSucceeds)
 {
     InputTensorMap inputs;
-    std::vector<int64_t> owned;
+    const std::vector<int64_t> owned;
 
-    SynthesisTracker tracker(owned, inputs);
+    const SynthesisTracker tracker(owned, inputs);
 
     const auto result = tracker.finish("TestOp");
     EXPECT_TRUE(result.filled);
@@ -146,7 +146,7 @@ TEST(TestSynthesisTracker, EmptyOwnedSucceeds)
 TEST(TestSynthesisTracker, MixedFailuresReportAll)
 {
     auto inputs = makeTensors({1, 2, 3});
-    std::vector<int64_t> owned = {1, 2, 3};
+    const std::vector<int64_t> owned = {1, 2, 3};
     std::mt19937 rng(42);
 
     SynthesisTracker tracker(owned, inputs);

From 4438fe0e16763a023a7df609aa6fadfb8cc60952 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 19:18:22 -0400
Subject: [PATCH 08/18] Address review: fix misleading skip msg, add env
 fallback tests, fix clang-tidy

- Track gpuRefErrored in runAutoMode() so the skip message distinguishes
  GPU-ref-errored from GPU-ref-capability-miss when CPU ref also misses.
- Extract resolveVerificationMode() and resolveGoldenDataDir() as free
  functions (CLI > env > nullopt precedence) and add 6 unit tests.
- Fix readability-implicit-bool-conversion in SynthesizeInputs.hpp:
  change all if(!ptr) to if(ptr == nullptr).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/harness/TestConfig.hpp                | 58 ++++++++++++-------
 ...raphGoldenReferenceVerificationHarness.cpp | 13 ++++-
 .../golden/input_init/SynthesizeInputs.hpp    | 38 ++++++------
 .../tests/TestTestConfig.cpp                  | 53 +++++++++++++++++
 4 files changed, 118 insertions(+), 44 deletions(-)

diff --git a/dnn-providers/integration-tests/src/harness/TestConfig.hpp b/dnn-providers/integration-tests/src/harness/TestConfig.hpp
index 11ebdfc7acaa..8abcd8b1adae 100644
--- a/dnn-providers/integration-tests/src/harness/TestConfig.hpp
+++ b/dnn-providers/integration-tests/src/harness/TestConfig.hpp
@@ -79,6 +79,40 @@ inline VerificationMode parseVerificationMode(std::string value)
                              + "'; expected 'auto', 'golden', 'gpu', or 'cpu'");
 }
 
+// Resolve verification mode: CLI value wins, then env var, then nullopt (caller
+// defaults to AUTO). Factored out of TestConfig::initialize() so the resolution
+// logic is independently testable.
+inline std::optional<VerificationMode>
+    resolveVerificationMode(std::optional<VerificationMode> cliValue)
+{
+    if(cliValue.has_value())
+    {
+        return cliValue;
+    }
+    auto envVal = hipdnn_data_sdk::utilities::getEnv("HIPDNN_TEST_VERIFICATION_MODE");
+    if(!envVal.empty())
+    {
+        return parseVerificationMode(envVal);
+    }
+    return std::nullopt;
+}
+
+// Resolve golden data dir: CLI value wins, then env var, then nullopt.
+inline std::optional<std::filesystem::path>
+    resolveGoldenDataDir(std::optional<std::filesystem::path> cliValue)
+{
+    if(cliValue.has_value())
+    {
+        return cliValue;
+    }
+    auto envVal = hipdnn_data_sdk::utilities::getEnv("HIPDNN_TEST_GOLDEN_DATA_DIR");
+    if(!envVal.empty())
+    {
+        return std::filesystem::path(envVal);
+    }
+    return std::nullopt;
+}
+
 // Singleton class for storing CLI-based test configuration.
 // All arguments are independently optional:
 //   - articlePath: omit to use hipDNN's default plugin discovery
@@ -163,28 +197,8 @@ class TestConfig
             }
         }
 
-        instance._goldenDataDir = std::move(goldenDataDir);
-        if(!instance._goldenDataDir.has_value())
-        {
-            auto envVal = hipdnn_data_sdk::utilities::getEnv("HIPDNN_TEST_GOLDEN_DATA_DIR");
-            if(!envVal.empty())
-            {
-                instance._goldenDataDir = std::filesystem::path(envVal);
-            }
-        }
-
-        // Verification mode: CLI flag wins; else HIPDNN_TEST_VERIFICATION_MODE env
-        // var; else default AUTO (resolved at the accessor). An invalid value
-        // (CLI or env) throws — parseVerificationMode reports the offending value.
-        instance._verificationMode = verificationMode;
-        if(!instance._verificationMode.has_value())
-        {
-            auto envVal = hipdnn_data_sdk::utilities::getEnv("HIPDNN_TEST_VERIFICATION_MODE");
-            if(!envVal.empty())
-            {
-                instance._verificationMode = parseVerificationMode(envVal);
-            }
-        }
+        instance._goldenDataDir = resolveGoldenDataDir(std::move(goldenDataDir));
+        instance._verificationMode = resolveVerificationMode(verificationMode);
 
         // Detect device 0's gfx arch and VRAM once at startup. Used by
         // [[test_skips]] and golden-ref metadata guards (arch/VRAM checks).
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
index b8f1d034bb61..cd756cfb00ee 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -222,6 +222,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runAutoMode()
     }
 
     // GPU ref (non-final): capability miss or runtime error -> fall through.
+    bool gpuRefErrored = false;
     {
         OutputTensors refOutputs;
         const RefRunResult gpu
@@ -233,6 +234,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runAutoMode()
         }
         if(gpu.status == RefStatus::RUNTIME_ERROR)
         {
+            gpuRefErrored = true;
             recordRefError("GPU reference errored (auto mode, falling through to CPU): "
                            + gpu.message);
         }
@@ -246,9 +248,14 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runAutoMode()
         switch(cpu.status)
         {
         case RefStatus::CAPABILITY_MISS:
-            skipUnverifiable("no reference available (golden absent; GPU and CPU ref "
-                             "cannot run this op): "
-                             + cpu.message);
+            skipUnverifiable(
+                gpuRefErrored
+                    ? "no usable reference (golden absent; GPU ref errored, CPU ref "
+                      "cannot run this op; see reference-error report): "
+                          + cpu.message
+                    : "no reference available (golden absent; GPU and CPU ref "
+                      "cannot run this op): "
+                          + cpu.message);
             return;
         case RefStatus::RUNTIME_ERROR:
             recordRefError("CPU reference errored (auto mode, last resort): " + cpu.message);
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
index 6cfcc0ec0c35..d519575c20a8 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
@@ -37,7 +37,7 @@ inline SynthesisResult fillConvFwdInputs(const hipdnn_flatbuffers_sdk::data_obje
                                      std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionFwdAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not ConvolutionFwdAttributes");
     }
@@ -53,7 +53,7 @@ inline SynthesisResult fillConvBwdDataInputs(const hipdnn_flatbuffers_sdk::data_
                                          std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionBwdAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not ConvolutionBwdAttributes");
     }
@@ -69,7 +69,7 @@ inline SynthesisResult fillConvBwdWeightsInputs(const hipdnn_flatbuffers_sdk::da
                                             std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionWrwAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not ConvolutionWrwAttributes");
     }
@@ -88,7 +88,7 @@ inline SynthesisResult fillBatchnormInferenceInputs(
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormInferenceAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not BatchnormInferenceAttributes");
     }
@@ -108,7 +108,7 @@ inline SynthesisResult fillBatchnormInferenceVarianceInputs(
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormInferenceAttributesVarianceExt();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not BatchnormInferenceAttributesVarianceExt");
     }
@@ -131,7 +131,7 @@ inline SynthesisResult fillBatchnormTrainingInputs(
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not BatchnormAttributes");
     }
@@ -163,7 +163,7 @@ inline SynthesisResult fillBatchnormBackwardInputs(
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormBackwardAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not BatchnormBackwardAttributes");
     }
@@ -193,7 +193,7 @@ inline SynthesisResult fillMatmulInputs(const hipdnn_flatbuffers_sdk::data_objec
                                     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_MatmulAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not MatmulAttributes");
     }
@@ -211,7 +211,7 @@ inline SynthesisResult fillPointwiseInputs(const hipdnn_flatbuffers_sdk::data_ob
                                        std::mt19937& rng)
 {
     const auto* a = node.attributes_as_PointwiseAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not PointwiseAttributes");
     }
@@ -231,7 +231,7 @@ inline SynthesisResult fillReductionInputs(const hipdnn_flatbuffers_sdk::data_ob
                                        std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ReductionAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not ReductionAttributes");
     }
@@ -248,7 +248,7 @@ inline SynthesisResult fillLayernormInputs(const hipdnn_flatbuffers_sdk::data_ob
                                        std::mt19937& rng)
 {
     const auto* a = node.attributes_as_LayernormAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not LayernormAttributes");
     }
@@ -269,7 +269,7 @@ inline SynthesisResult fillLayernormBackwardInputs(
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_LayernormBackwardAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not LayernormBackwardAttributes");
     }
@@ -291,7 +291,7 @@ inline SynthesisResult fillRmsnormInputs(const hipdnn_flatbuffers_sdk::data_obje
                                      std::mt19937& rng)
 {
     const auto* a = node.attributes_as_RMSNormAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not RMSNormAttributes");
     }
@@ -311,7 +311,7 @@ inline SynthesisResult fillRmsnormBackwardInputs(
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_RMSNormBackwardAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not RMSNormBackwardAttributes");
     }
@@ -331,7 +331,7 @@ inline SynthesisResult fillResampleFwdInputs(const hipdnn_flatbuffers_sdk::data_
                                          std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ResampleFwdAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not ResampleFwdAttributes");
     }
@@ -351,7 +351,7 @@ inline SynthesisResult fillBlockScaleDequantizeInputs(
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BlockScaleDequantizeAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not BlockScaleDequantizeAttributes");
     }
@@ -368,7 +368,7 @@ inline SynthesisResult fillBlockScaleQuantizeInputs(
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BlockScaleQuantizeAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not BlockScaleQuantizeAttributes");
     }
@@ -390,7 +390,7 @@ inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_
                                          std::mt19937& rng)
 {
     const auto* a = node.attributes_as_SdpaAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not SdpaAttributes");
     }
@@ -425,7 +425,7 @@ inline SynthesisResult fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data
                                           std::mt19937& rng)
 {
     const auto* a = node.attributes_as_SdpaBackwardAttributes();
-    if(!a)
+    if(a == nullptr)
     {
         return SynthesisResult::unsupported("not SdpaBackwardAttributes");
     }
diff --git a/dnn-providers/integration-tests/tests/TestTestConfig.cpp b/dnn-providers/integration-tests/tests/TestTestConfig.cpp
index a8674b3125b0..63370bfc8120 100644
--- a/dnn-providers/integration-tests/tests/TestTestConfig.cpp
+++ b/dnn-providers/integration-tests/tests/TestTestConfig.cpp
@@ -90,6 +90,49 @@ TEST(ParseVerificationMode, ThrowsOnInvalidValue)
     EXPECT_THROW(hipdnn_integration_tests::parseVerificationMode(""), std::runtime_error);
 }
 
+// resolveVerificationMode / resolveGoldenDataDir are free functions that
+// implement the "CLI wins, then env, then nullopt" precedence chain.
+// They don't touch the singleton so they can be tested freely.
+
+TEST(ResolveVerificationMode, CliValueWinsOverEnv)
+{
+    using hipdnn_integration_tests::resolveVerificationMode;
+    using hipdnn_integration_tests::VerificationMode;
+
+    // Even if the env var were set, the CLI value takes precedence.
+    const auto result = resolveVerificationMode(VerificationMode::GPU);
+    ASSERT_TRUE(result.has_value());
+    EXPECT_EQ(*result, VerificationMode::GPU);
+}
+
+TEST(ResolveVerificationMode, NulloptCliWithoutEnvReturnsNullopt)
+{
+    using hipdnn_integration_tests::resolveVerificationMode;
+
+    // Assuming HIPDNN_TEST_VERIFICATION_MODE is not set in the test env.
+    const auto result = resolveVerificationMode(std::nullopt);
+    EXPECT_FALSE(result.has_value());
+}
+
+TEST(ResolveGoldenDataDir, CliValueWinsOverEnv)
+{
+    using hipdnn_integration_tests::resolveGoldenDataDir;
+
+    const std::filesystem::path cliPath = "/explicit/golden/dir";
+    const auto result = resolveGoldenDataDir(cliPath);
+    ASSERT_TRUE(result.has_value());
+    EXPECT_EQ(*result, cliPath);
+}
+
+TEST(ResolveGoldenDataDir, NulloptCliWithoutEnvReturnsNullopt)
+{
+    using hipdnn_integration_tests::resolveGoldenDataDir;
+
+    // Assuming HIPDNN_TEST_GOLDEN_DATA_DIR is not set in the test env.
+    const auto result = resolveGoldenDataDir(std::nullopt);
+    EXPECT_FALSE(result.has_value());
+}
+
 // ---------------------------------------------------------------------------
 // Suite 2 – initialized singleton (all args provided)
 // ---------------------------------------------------------------------------
@@ -159,6 +202,16 @@ TEST_F(TestConfigInitialized, GetVerificationModeDefaultsToAuto)
               hipdnn_integration_tests::VerificationMode::AUTO);
 }
 
+TEST_F(TestConfigInitialized, HasGoldenDataDirReturnsFalseWhenNotProvided)
+{
+    EXPECT_FALSE(TestConfig::get().hasGoldenDataDir());
+}
+
+TEST_F(TestConfigInitialized, GetGoldenDataDirThrowsWhenNotProvided)
+{
+    EXPECT_THROW(TestConfig::get().getGoldenDataDir(), std::runtime_error);
+}
+
 TEST_F(TestConfigInitialized, DoubleInitializeThrows)
 {
     EXPECT_THROW(TestConfig::initialize(std::nullopt, std::nullopt), std::runtime_error);

From 133700fca476ba3f5c7cc5b473348d2cdcc4c5cc Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 22 Jun 2026 19:45:14 -0400
Subject: [PATCH 09/18] Lift SynthesisTracker to graph level for
 fused/multi-node synthesis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously each fill function created its own SynthesisTracker with the
whole-graph leaf input UIDs, so finish() rejected any UID not declared
by that single node. For multi-node/fused graphs (e.g. conv+relu), the
second node's finish() saw the first node's UIDs as "no role declared"
and returned unsupported → SKIP.

Fix: create one SynthesisTracker in synthesizeInputs(), pass it through
all fill functions, call finish() once after all nodes have declared
their UIDs. Each node now only accounts for its own inputs, and the
final finish() verifies that the union covers all leaf inputs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ...raphGoldenReferenceVerificationHarness.cpp |  11 +-
 .../golden/input_init/SynthesizeInputs.hpp    | 333 ++++++++----------
 2 files changed, 156 insertions(+), 188 deletions(-)

diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
index cd756cfb00ee..e181891ac031 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -305,11 +305,11 @@ bool IntegrationGraphGoldenReferenceVerificationHarness::synthesizeInputs()
     std::mt19937 rng(static_cast<std::mt19937::result_type>(
         _bundle->metadata.seed.value_or(K_DEFAULT_SEED)));
 
+    SynthesisTracker tracker(allLeafInputUids, inputs);
     for(uint32_t i = 0; i < wrapper.nodeCount(); ++i)
     {
         const auto& node = wrapper.getNode(i);
-        const SynthesisResult outcome
-            = synthesizeNodeInputs(node, allLeafInputUids, inputs, rng);
+        const SynthesisResult outcome = synthesizeNodeInputs(node, tracker, rng);
         if(!outcome.filled)
         {
             skipUnverifiable(outcome.reason);
@@ -317,6 +317,13 @@ bool IntegrationGraphGoldenReferenceVerificationHarness::synthesizeInputs()
         }
     }
 
+    const SynthesisResult finalResult = tracker.finish("synthesis");
+    if(!finalResult.filled)
+    {
+        skipUnverifiable(finalResult.reason);
+        return false;
+    }
+
     _bundle->tensors = std::move(inputs);
     return true;
 }
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
index d519575c20a8..a95b514edb09 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
@@ -9,18 +9,20 @@ namespace hipdnn_integration_tests::golden
 {
 
 // ── Per-op fill functions ─────────────────────────────────────────────────────
-// Each function synthesizes inputs for one node in the graph. A node "owns" the
-// leaf input tensors declared in its flatbuffer attributes — virtual tensors
-// (inter-node edges in a fused graph) and output tensors are excluded.
+// Each function declares inputs for one node in the graph. A single
+// SynthesisTracker is shared across all nodes in the graph — the caller
+// (synthesizeInputs in the harness .cpp) creates it with the whole-graph leaf
+// input UIDs, passes it through each fill function, then calls finish() once
+// after all nodes have been processed. This graph-level tracking is essential
+// for fused/multi-node graphs: each node only accounts for its own UIDs, and
+// the final finish() verifies that every leaf input was covered by some node.
 //
 // Every function follows the same pattern:
 //   1. Cast the node to its concrete attribute type.
-//   2. Create a SynthesisTracker with the node's owned uids.
-//   3. Declare each input as FREE (fill with random values), STRUCTURED (can't
+//   2. Declare each input as FREE (fill with random values), STRUCTURED (can't
 //      synthesize — needs specific format), or DERIVED (must come from another
 //      op's output). See SynthesisTracker.hpp for role definitions.
-//   4. Call finish() — returns ok() if all owned inputs were filled, or
-//      unsupported() with a diagnostic listing what couldn't be synthesized.
+//   3. Return ok() if the attribute cast succeeded, or unsupported() if not.
 //
 // Fills must be deterministic given `rng` so re-running the same graph produces
 // identical inputs for reproducible comparisons.
@@ -32,8 +34,7 @@ namespace hipdnn_integration_tests::golden
 // ── Convolution ───────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillConvFwdInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                     const std::vector<int64_t>& ownedLeafInputUids,
-                                     InputTensorMap& inputs,
+                                     SynthesisTracker& tracker,
                                      std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionFwdAttributes();
@@ -41,15 +42,13 @@ inline SynthesisResult fillConvFwdInputs(const hipdnn_flatbuffers_sdk::data_obje
     {
         return SynthesisResult::unsupported("not ConvolutionFwdAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->w_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("ConvolutionFwd");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->w_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 inline SynthesisResult fillConvBwdDataInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                         const std::vector<int64_t>& ownedLeafInputUids,
-                                         InputTensorMap& inputs,
+                                         SynthesisTracker& tracker,
                                          std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionBwdAttributes();
@@ -57,15 +56,13 @@ inline SynthesisResult fillConvBwdDataInputs(const hipdnn_flatbuffers_sdk::data_
     {
         return SynthesisResult::unsupported("not ConvolutionBwdAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->w_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("ConvolutionBwdData");
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->w_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 inline SynthesisResult fillConvBwdWeightsInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                            const std::vector<int64_t>& ownedLeafInputUids,
-                                            InputTensorMap& inputs,
+                                            SynthesisTracker& tracker,
                                             std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionWrwAttributes();
@@ -73,18 +70,16 @@ inline SynthesisResult fillConvBwdWeightsInputs(const hipdnn_flatbuffers_sdk::da
     {
         return SynthesisResult::unsupported("not ConvolutionWrwAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("ConvolutionBwdWeights");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // ── Batchnorm ─────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillBatchnormInferenceInputs(
     const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    const std::vector<int64_t>& ownedLeafInputUids,
-    InputTensorMap& inputs,
+    SynthesisTracker& tracker,
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormInferenceAttributes();
@@ -92,19 +87,17 @@ inline SynthesisResult fillBatchnormInferenceInputs(
     {
         return SynthesisResult::unsupported("not BatchnormInferenceAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->mean_tensor_uid(), -0.1f, 0.1f, rng);
-    acct.fillFree(a->inv_variance_tensor_uid(), 0.5f, 1.5f, rng);
-    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("BatchnormInference");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->mean_tensor_uid(), -0.1f, 0.1f, rng);
+    tracker.fillFree(a->inv_variance_tensor_uid(), 0.5f, 1.5f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 inline SynthesisResult fillBatchnormInferenceVarianceInputs(
     const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    const std::vector<int64_t>& ownedLeafInputUids,
-    InputTensorMap& inputs,
+    SynthesisTracker& tracker,
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormInferenceAttributesVarianceExt();
@@ -112,22 +105,20 @@ inline SynthesisResult fillBatchnormInferenceVarianceInputs(
     {
         return SynthesisResult::unsupported("not BatchnormInferenceAttributesVarianceExt");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->mean_tensor_uid(), -0.1f, 0.1f, rng);
-    acct.fillFree(a->variance_tensor_uid(), 0.5f, 1.5f, rng);
-    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
-    return acct.finish("BatchnormInferenceVarianceExt");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->mean_tensor_uid(), -0.1f, 0.1f, rng);
+    tracker.fillFree(a->variance_tensor_uid(), 0.5f, 1.5f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // peer_stats holds references to other GPUs' memory for multi-GPU batchnorm —
 // randomly generated values would point to invalid cross-device memory.
 inline SynthesisResult fillBatchnormTrainingInputs(
     const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    const std::vector<int64_t>& ownedLeafInputUids,
-    InputTensorMap& inputs,
+    SynthesisTracker& tracker,
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormAttributes();
@@ -135,31 +126,29 @@ inline SynthesisResult fillBatchnormTrainingInputs(
     {
         return SynthesisResult::unsupported("not BatchnormAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
-    acct.fillFree(a->prev_running_mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
-    acct.fillFree(a->prev_running_variance_tensor_uid().value_or(0), 0.5f, 1.5f, rng);
-    acct.fillFree(a->momentum_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    tracker.fillFree(a->prev_running_mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
+    tracker.fillFree(a->prev_running_variance_tensor_uid().value_or(0), 0.5f, 1.5f, rng);
+    tracker.fillFree(a->momentum_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
 
     if(a->peer_stats_tensor_uid() != nullptr)
     {
         for(const int64_t uid : *a->peer_stats_tensor_uid())
         {
-            acct.markStructured(uid, "peer_stats");
+            tracker.markStructured(uid, "peer_stats");
         }
     }
 
-    return acct.finish("BatchnormTraining");
+    return SynthesisResult::ok();
 }
 
 // mean/inv_variance are optional (may come from forward). peer_stats: see above.
 inline SynthesisResult fillBatchnormBackwardInputs(
     const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    const std::vector<int64_t>& ownedLeafInputUids,
-    InputTensorMap& inputs,
+    SynthesisTracker& tracker,
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormBackwardAttributes();
@@ -167,29 +156,27 @@ inline SynthesisResult fillBatchnormBackwardInputs(
     {
         return SynthesisResult::unsupported("not BatchnormBackwardAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
-    acct.fillFree(a->inv_variance_tensor_uid().value_or(0), 0.5f, 1.5f, rng);
-    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
+    tracker.fillFree(a->inv_variance_tensor_uid().value_or(0), 0.5f, 1.5f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
 
     if(a->peer_stats_tensor_uid() != nullptr)
     {
         for(const int64_t uid : *a->peer_stats_tensor_uid())
         {
-            acct.markStructured(uid, "peer_stats");
+            tracker.markStructured(uid, "peer_stats");
         }
     }
 
-    return acct.finish("BatchnormBackward");
+    return SynthesisResult::ok();
 }
 
 // ── Matmul ────────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillMatmulInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                    const std::vector<int64_t>& ownedLeafInputUids,
-                                    InputTensorMap& inputs,
+                                    SynthesisTracker& tracker,
                                     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_MatmulAttributes();
@@ -197,17 +184,15 @@ inline SynthesisResult fillMatmulInputs(const hipdnn_flatbuffers_sdk::data_objec
     {
         return SynthesisResult::unsupported("not MatmulAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->a_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->b_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("Matmul");
+    tracker.fillFree(a->a_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->b_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // ── Pointwise ─────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillPointwiseInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                       const std::vector<int64_t>& ownedLeafInputUids,
-                                       InputTensorMap& inputs,
+                                       SynthesisTracker& tracker,
                                        std::mt19937& rng)
 {
     const auto* a = node.attributes_as_PointwiseAttributes();
@@ -215,19 +200,17 @@ inline SynthesisResult fillPointwiseInputs(const hipdnn_flatbuffers_sdk::data_ob
     {
         return SynthesisResult::unsupported("not PointwiseAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->in_0_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->in_1_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
-    acct.fillFree(a->in_2_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
-    acct.fillFree(a->axis_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
-    return acct.finish("Pointwise");
+    tracker.fillFree(a->in_0_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->in_1_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->in_2_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->axis_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // ── Reduction ─────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillReductionInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                       const std::vector<int64_t>& ownedLeafInputUids,
-                                       InputTensorMap& inputs,
+                                       SynthesisTracker& tracker,
                                        std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ReductionAttributes();
@@ -235,16 +218,14 @@ inline SynthesisResult fillReductionInputs(const hipdnn_flatbuffers_sdk::data_ob
     {
         return SynthesisResult::unsupported("not ReductionAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->in_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("Reduction");
+    tracker.fillFree(a->in_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // ── LayerNorm ─────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillLayernormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                       const std::vector<int64_t>& ownedLeafInputUids,
-                                       InputTensorMap& inputs,
+                                       SynthesisTracker& tracker,
                                        std::mt19937& rng)
 {
     const auto* a = node.attributes_as_LayernormAttributes();
@@ -252,20 +233,18 @@ inline SynthesisResult fillLayernormInputs(const hipdnn_flatbuffers_sdk::data_ob
     {
         return SynthesisResult::unsupported("not LayernormAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
-    return acct.finish("Layernorm");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // mean and inv_variance are computed by the forward pass — a standalone backward
 // can't produce correct gradients without them.
 inline SynthesisResult fillLayernormBackwardInputs(
     const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    const std::vector<int64_t>& ownedLeafInputUids,
-    InputTensorMap& inputs,
+    SynthesisTracker& tracker,
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_LayernormBackwardAttributes();
@@ -273,21 +252,19 @@ inline SynthesisResult fillLayernormBackwardInputs(
     {
         return SynthesisResult::unsupported("not LayernormBackwardAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.markDerived(a->mean_tensor_uid().value_or(0), "mean (forward output)");
-    acct.markDerived(a->inv_variance_tensor_uid().value_or(0), "inv_variance (forward output)");
-    acct.fillFree(a->epsilon_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
-    return acct.finish("LayernormBackward");
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.markDerived(a->mean_tensor_uid().value_or(0), "mean (forward output)");
+    tracker.markDerived(a->inv_variance_tensor_uid().value_or(0), "inv_variance (forward output)");
+    tracker.fillFree(a->epsilon_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // ── RMSNorm ───────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillRmsnormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                     const std::vector<int64_t>& ownedLeafInputUids,
-                                     InputTensorMap& inputs,
+                                     SynthesisTracker& tracker,
                                      std::mt19937& rng)
 {
     const auto* a = node.attributes_as_RMSNormAttributes();
@@ -295,19 +272,17 @@ inline SynthesisResult fillRmsnormInputs(const hipdnn_flatbuffers_sdk::data_obje
     {
         return SynthesisResult::unsupported("not RMSNormAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
-    acct.fillFree(a->bias_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
-    return acct.finish("RMSNorm");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // inv_rms is computed by the forward pass.
 inline SynthesisResult fillRmsnormBackwardInputs(
     const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    const std::vector<int64_t>& ownedLeafInputUids,
-    InputTensorMap& inputs,
+    SynthesisTracker& tracker,
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_RMSNormBackwardAttributes();
@@ -315,19 +290,17 @@ inline SynthesisResult fillRmsnormBackwardInputs(
     {
         return SynthesisResult::unsupported("not RMSNormBackwardAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.markDerived(a->inv_rms_tensor_uid(), "inv_rms (forward output)");
-    return acct.finish("RMSNormBackward");
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.markDerived(a->inv_rms_tensor_uid(), "inv_rms (forward output)");
+    return SynthesisResult::ok();
 }
 
 // ── Resample ──────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillResampleFwdInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                         const std::vector<int64_t>& ownedLeafInputUids,
-                                         InputTensorMap& inputs,
+                                         SynthesisTracker& tracker,
                                          std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ResampleFwdAttributes();
@@ -335,9 +308,8 @@ inline SynthesisResult fillResampleFwdInputs(const hipdnn_flatbuffers_sdk::data_
     {
         return SynthesisResult::unsupported("not ResampleFwdAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("ResampleFwd");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // ── Block-scale quantization ──────────────────────────────────────────────────
@@ -346,8 +318,7 @@ inline SynthesisResult fillResampleFwdInputs(const hipdnn_flatbuffers_sdk::data_
 // quantized data — random scales would produce garbage dequantized values.
 inline SynthesisResult fillBlockScaleDequantizeInputs(
     const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    const std::vector<int64_t>& ownedLeafInputUids,
-    InputTensorMap& inputs,
+    SynthesisTracker& tracker,
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BlockScaleDequantizeAttributes();
@@ -355,16 +326,14 @@ inline SynthesisResult fillBlockScaleDequantizeInputs(
     {
         return SynthesisResult::unsupported("not BlockScaleDequantizeAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.markStructured(a->scale_tensor_uid(), "scale (block quantization scales)");
-    return acct.finish("BlockScaleDequantize");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.markStructured(a->scale_tensor_uid(), "scale (block quantization scales)");
+    return SynthesisResult::ok();
 }
 
 inline SynthesisResult fillBlockScaleQuantizeInputs(
     const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    const std::vector<int64_t>& ownedLeafInputUids,
-    InputTensorMap& inputs,
+    SynthesisTracker& tracker,
     std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BlockScaleQuantizeAttributes();
@@ -372,9 +341,8 @@ inline SynthesisResult fillBlockScaleQuantizeInputs(
     {
         return SynthesisResult::unsupported("not BlockScaleQuantizeAttributes");
     }
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-    acct.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    return acct.finish("BlockScaleQuantize");
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
 }
 
 // ── SDPA ──────────────────────────────────────────────────────────────────────
@@ -385,8 +353,7 @@ inline SynthesisResult fillBlockScaleQuantizeInputs(
 // seed/offset must match between forward and backward passes.
 // Most of these are optional — absent ones (uid 0) are silently ignored.
 inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                         const std::vector<int64_t>& ownedLeafInputUids,
-                                         InputTensorMap& inputs,
+                                         SynthesisTracker& tracker,
                                          std::mt19937& rng)
 {
     const auto* a = node.attributes_as_SdpaAttributes();
@@ -395,23 +362,21 @@ inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_
         return SynthesisResult::unsupported("not SdpaAttributes");
     }
 
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-
-    acct.fillFree(a->q_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
-    acct.fillFree(a->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
-
-    acct.markStructured(a->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
-    acct.markStructured(a->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
-    acct.markStructured(a->page_table_k_tensor_uid().value_or(0), "page_table_k");
-    acct.markStructured(a->page_table_v_tensor_uid().value_or(0), "page_table_v");
-    acct.markStructured(a->block_mask_tensor_uid().value_or(0), "block_mask");
-    acct.markStructured(a->seed_tensor_uid().value_or(0), "dropout_seed");
-    acct.markStructured(a->offset_tensor_uid().value_or(0), "dropout_offset");
-
-    return acct.finish("Sdpa");
+    tracker.fillFree(a->q_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+
+    tracker.markStructured(a->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
+    tracker.markStructured(a->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
+    tracker.markStructured(a->page_table_k_tensor_uid().value_or(0), "page_table_k");
+    tracker.markStructured(a->page_table_v_tensor_uid().value_or(0), "page_table_v");
+    tracker.markStructured(a->block_mask_tensor_uid().value_or(0), "block_mask");
+    tracker.markStructured(a->seed_tensor_uid().value_or(0), "dropout_seed");
+    tracker.markStructured(a->offset_tensor_uid().value_or(0), "dropout_offset");
+
+    return SynthesisResult::ok();
 }
 
 // Q/K/V/dO accept random values. O (the forward output) and stats (softmax
@@ -420,8 +385,7 @@ inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_
 // inter-node tensors (not owned, so silently skipped). A standalone backward
 // without a forward is refused.
 inline SynthesisResult fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                          const std::vector<int64_t>& ownedLeafInputUids,
-                                          InputTensorMap& inputs,
+                                          SynthesisTracker& tracker,
                                           std::mt19937& rng)
 {
     const auto* a = node.attributes_as_SdpaBackwardAttributes();
@@ -430,17 +394,15 @@ inline SynthesisResult fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data
         return SynthesisResult::unsupported("not SdpaBackwardAttributes");
     }
 
-    SynthesisTracker acct(ownedLeafInputUids, inputs);
-
-    acct.fillFree(a->q_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
-    acct.fillFree(a->do_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->q_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->do_tensor_uid(), -1.0f, 1.0f, rng);
 
-    acct.markDerived(a->o_tensor_uid(), "o (forward output)");
-    acct.markDerived(a->stats_tensor_uid(), "stats (forward softmax stats)");
+    tracker.markDerived(a->o_tensor_uid(), "o (forward output)");
+    tracker.markDerived(a->stats_tensor_uid(), "stats (forward softmax stats)");
 
-    return acct.finish("SdpaBackward");
+    return SynthesisResult::ok();
 }
 
 // ── Dispatch ──────────────────────────────────────────────────────────────────
@@ -451,8 +413,7 @@ inline SynthesisResult fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data
 // a diagnostic when the op is unrecognized or an input can't be synthesized.
 
 inline SynthesisResult synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                        const std::vector<int64_t>& ownedLeafInputUids,
-                                        InputTensorMap& inputs,
+                                        SynthesisTracker& tracker,
                                         std::mt19937& rng)
 {
     using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
@@ -460,43 +421,43 @@ inline SynthesisResult synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_o
     switch(node.attributes_type())
     {
     case NA::ConvolutionFwdAttributes:
-        return fillConvFwdInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillConvFwdInputs(node, tracker, rng);
     case NA::ConvolutionBwdAttributes:
-        return fillConvBwdDataInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillConvBwdDataInputs(node, tracker, rng);
     case NA::ConvolutionWrwAttributes:
-        return fillConvBwdWeightsInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillConvBwdWeightsInputs(node, tracker, rng);
     case NA::BatchnormInferenceAttributes:
-        return fillBatchnormInferenceInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillBatchnormInferenceInputs(node, tracker, rng);
     case NA::BatchnormInferenceAttributesVarianceExt:
-        return fillBatchnormInferenceVarianceInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillBatchnormInferenceVarianceInputs(node, tracker, rng);
     case NA::BatchnormAttributes:
-        return fillBatchnormTrainingInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillBatchnormTrainingInputs(node, tracker, rng);
     case NA::BatchnormBackwardAttributes:
-        return fillBatchnormBackwardInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillBatchnormBackwardInputs(node, tracker, rng);
     case NA::MatmulAttributes:
-        return fillMatmulInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillMatmulInputs(node, tracker, rng);
     case NA::PointwiseAttributes:
-        return fillPointwiseInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillPointwiseInputs(node, tracker, rng);
     case NA::ReductionAttributes:
-        return fillReductionInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillReductionInputs(node, tracker, rng);
     case NA::LayernormAttributes:
-        return fillLayernormInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillLayernormInputs(node, tracker, rng);
     case NA::LayernormBackwardAttributes:
-        return fillLayernormBackwardInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillLayernormBackwardInputs(node, tracker, rng);
     case NA::RMSNormAttributes:
-        return fillRmsnormInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillRmsnormInputs(node, tracker, rng);
     case NA::RMSNormBackwardAttributes:
-        return fillRmsnormBackwardInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillRmsnormBackwardInputs(node, tracker, rng);
     case NA::ResampleFwdAttributes:
-        return fillResampleFwdInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillResampleFwdInputs(node, tracker, rng);
     case NA::BlockScaleDequantizeAttributes:
-        return fillBlockScaleDequantizeInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillBlockScaleDequantizeInputs(node, tracker, rng);
     case NA::BlockScaleQuantizeAttributes:
-        return fillBlockScaleQuantizeInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillBlockScaleQuantizeInputs(node, tracker, rng);
     case NA::SdpaAttributes:
-        return fillSdpaForwardInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillSdpaForwardInputs(node, tracker, rng);
     case NA::SdpaBackwardAttributes:
-        return fillSdpaBackwardInputs(node, ownedLeafInputUids, inputs, rng);
+        return fillSdpaBackwardInputs(node, tracker, rng);
     default:
         return SynthesisResult::unsupported("no input synthesis registered for this op");
     }

From 6d3a7a958bb347742ae68228bc501bcaa2f418d3 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 12:04:48 -0400
Subject: [PATCH 10/18] Harden synthesis pipeline: sentinel outputs, FP8
 STRUCTURED, metadata-optional, runEngineOrSkip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fill output tensors with NaN sentinel instead of zero so allClose's
  NaN guard catches unwritten elements (previously masked as computed zero)
- Mark FP8 descale/scale factors as STRUCTURED (refuse to fabricate random
  values that cause identical saturation → vacuous pass), matching
  BlockScaleDequantize precedent
- Make metadata mandatory only when golden outputs are present; graph-only
  bundles default-construct empty metadata instead of failing with
  MISSING_METADATA
- Extract runEngineOrSkip() helper to deduplicate engine-run preamble
  across all three verification modes
- Thread engine error message into skip reason (was silently discarded)
- Make applyMetadataGuards() virtual so unit tests can override the
  hardware-guard path without initializing TestConfig singleton
- Fix test name lint failures (suite/case naming convention)
- Add SynthesisTracker precondition documentation
- Add synthesizeInputs() phase documentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ...raphGoldenReferenceVerificationHarness.cpp |  66 +++-
 ...raphGoldenReferenceVerificationHarness.hpp |  53 ++-
 .../harness/golden/IntegrationTestBundle.hpp  |  53 ++-
 .../golden/input_init/SynthesisTracker.hpp    |  89 ++++-
 .../golden/input_init/SynthesizeInputs.hpp    |  36 +-
 .../integration-tests/tests/CMakeLists.txt    |   1 +
 .../tests/TestBundleDiscovery.cpp             |  36 +-
 .../tests/TestSynthesizeInputs.cpp            | 370 ++++++++++++++++++
 .../tests/TestTestConfig.cpp                  |  12 +-
 .../tests/TestVerificationModePaths.cpp       |  16 +-
 10 files changed, 648 insertions(+), 84 deletions(-)
 create mode 100644 dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp

diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
index e181891ac031..7a0e49f46beb 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -150,6 +150,35 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runComparison()
     }
 }
 
+namespace
+{
+// GTEST_SKIP() expands to `return;`, so it can only be used from a void-returning
+// function. This wrapper records the skip (and its message) and returns from
+// itself; the skip state persists for the caller, which then returns nullopt.
+void skipEngineCouldNotRun(const std::filesystem::path& bundlePath, const std::string& error)
+{
+    std::ostringstream msg;
+    msg << "Engine could not execute bundle " << bundlePath;
+    if(!error.empty())
+    {
+        msg << ": " << error;
+    }
+    GTEST_SKIP() << msg.str();
+}
+} // namespace
+
+std::optional<OutputTensors>
+    IntegrationGraphGoldenReferenceVerificationHarness::runEngineOrSkip()
+{
+    std::string error;
+    auto engineOutputs = runEngineCapturingOutputs(error);
+    if(!engineOutputs && !::testing::Test::HasFatalFailure())
+    {
+        skipEngineCouldNotRun(_bundlePath, error);
+    }
+    return engineOutputs;
+}
+
 void IntegrationGraphGoldenReferenceVerificationHarness::runGoldenMode()
 {
     if(!_bundle->hasGoldenOutputs)
@@ -157,13 +186,9 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runGoldenMode()
         skipUnverifiable("no golden data (verification-mode=golden)");
         return;
     }
-    auto engineOutputs = runEngineCapturingOutputs();
+    auto engineOutputs = runEngineOrSkip();
     if(!engineOutputs)
     {
-        if(!::testing::Test::HasFatalFailure())
-        {
-            GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
-        }
         return;
     }
     compareAgainstGolden(*engineOutputs);
@@ -172,13 +197,9 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runGoldenMode()
 void IntegrationGraphGoldenReferenceVerificationHarness::runExplicitRefMode(
     ReferenceExecutorType type)
 {
-    auto engineOutputs = runEngineCapturingOutputs();
+    auto engineOutputs = runEngineOrSkip();
     if(!engineOutputs)
     {
-        if(!::testing::Test::HasFatalFailure())
-        {
-            GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
-        }
         return;
     }
 
@@ -205,13 +226,9 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runExplicitRefMode(
 
 void IntegrationGraphGoldenReferenceVerificationHarness::runAutoMode()
 {
-    auto engineOutputs = runEngineCapturingOutputs();
+    auto engineOutputs = runEngineOrSkip();
     if(!engineOutputs)
     {
-        if(!::testing::Test::HasFatalFailure())
-        {
-            GTEST_SKIP() << "Engine could not execute bundle " << _bundlePath;
-        }
         return;
     }
 
@@ -330,7 +347,15 @@ bool IntegrationGraphGoldenReferenceVerificationHarness::synthesizeInputs()
 
 // ---- engine + reference runs -----------------------------------------------
 
-OutputTensors IntegrationGraphGoldenReferenceVerificationHarness::allocateZeroedOutputs() const
+// Output buffers are filled with a sentinel (NaN for float types, type max for
+// integer types) rather than zero. This is the standard hipdnn practice — see
+// CpuReferenceGraphExecutor and GraphTensorBundle::sentinelFillOutputTensors —
+// and it arms allClose's NaN/sentinel guard: any output element the executor
+// fails to write stays NaN and is caught as a hard failure. Zero-filling would
+// make an unwritten output indistinguishable from a legitimately-computed zero,
+// so engine and reference could silently agree on garbage (both untouched zeros)
+// and the comparison would vacuously pass.
+OutputTensors IntegrationGraphGoldenReferenceVerificationHarness::allocateSentinelOutputs() const
 {
     const auto wrapper = _bundle->graphWrapper();
     const auto& tensorAttrMap = wrapper.getTensorMap();
@@ -340,7 +365,7 @@ OutputTensors IntegrationGraphGoldenReferenceVerificationHarness::allocateZeroed
     {
         outputs[uid]
             = hipdnn_test_sdk::detail::createTensorFromAttribute(*tensorAttrMap.at(uid));
-        outputs[uid]->fillTensorWithValue(0.f);
+        outputs[uid]->fillWithSentinelValue();
     }
     return outputs;
 }
@@ -369,13 +394,12 @@ std::unordered_map<int64_t, void*>
 }
 
 std::optional<OutputTensors>
-    IntegrationGraphGoldenReferenceVerificationHarness::runEngineCapturingOutputs()
+    IntegrationGraphGoldenReferenceVerificationHarness::runEngineCapturingOutputs(std::string& error)
 {
-    OutputTensors engineOutputs = allocateZeroedOutputs();
+    OutputTensors engineOutputs = allocateSentinelOutputs();
     auto variantPack = buildVariantPack(engineOutputs, /*useDevice=*/_requiresDevice);
 
     bool threw = false;
-    std::string error;
     try
     {
         executeGraphThroughEngine(variantPack);
@@ -403,7 +427,7 @@ IntegrationGraphGoldenReferenceVerificationHarness::RefRunResult
     IntegrationGraphGoldenReferenceVerificationHarness::runReferenceCapturingOutputs(
         ReferenceExecutorType type, OutputTensors& refOutputs)
 {
-    refOutputs = allocateZeroedOutputs();
+    refOutputs = allocateSentinelOutputs();
     const bool useDevice = (type == ReferenceExecutorType::GPU);
     auto variantPack = buildVariantPack(refOutputs, useDevice);
 
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
index e474c87f5926..b0fe8a6f8937 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
@@ -107,6 +107,12 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
     // without touching the TestConfig singleton.
     virtual VerificationMode getVerificationMode() const;
 
+    // Skips the test when the bundle's metadata is incompatible with the
+    // current device (VRAM/arch). Virtual so isolated unit tests that don't
+    // exercise hardware guards can override it — production reads from the
+    // TestConfig singleton, which is only initialized by the real test main.
+    virtual void applyMetadataGuards() const;
+
 private:
     bool _requiresDevice;
     std::filesystem::path _bundlePath;
@@ -134,15 +140,48 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
 
     // ── inputs ──────────────────────────────────────────────────────────
     bool ensureInputsAvailable();
+
+    // Synthesizes leaf input tensors for the graph when no golden data exists.
+    //
+    // Phase 1 — allocate: walks the graph's tensor list, skips virtual
+    //   (inter-node) and output tensors, allocates a CPU-side buffer for
+    //   each remaining leaf input tensor (shape/dtype from TensorAttributes).
+    //
+    // Phase 2 — fill: iterates each node (internal op) and calls its
+    //   registered fill function via synthesizeNodeInputs(). Each fill
+    //   function reads its tensor UIDs from the node's attributes and
+    //   declares each one as FREE (random values), STRUCTURED (needs
+    //   specific format), or DERIVED (needs another op's output) through
+    //   a shared SynthesisTracker.
+    //
+    // Phase 3 — verify: calls tracker.finish() which checks that every
+    //   leaf input was accounted for by some fill function and none were
+    //   refused (STRUCTURED/DERIVED). Returns false and SKIPs the test
+    //   if any leaf was missed or refused.
+    //
+    // On success, moves the filled tensors into the bundle so downstream
+    // executors (engine, GPU ref, CPU ref) can upload them to the GPU.
     bool synthesizeInputs();
 
     // ── buffer allocation + execution ───────────────────────────────────
-    // allocateZeroedOutputs / buildVariantPack prepare the buffers;
+    // allocateSentinelOutputs / buildVariantPack prepare the buffers;
     // runEngine* / runReference* call the executors and capture results.
-    OutputTensors allocateZeroedOutputs() const;
+    // Outputs are sentinel-filled (NaN) so an unwritten output element is
+    // caught by allClose rather than masquerading as a computed zero.
+    OutputTensors allocateSentinelOutputs() const;
     std::unordered_map<int64_t, void*> buildVariantPack(OutputTensors& outputs,
                                                         bool useDevice) const;
-    std::optional<OutputTensors> runEngineCapturingOutputs();
+    // Runs the engine into fresh output buffers. Returns nullopt if the
+    // engine threw (its message is written to `error`) or raised a fatal
+    // GTest failure (in which case `error` is left empty).
+    std::optional<OutputTensors> runEngineCapturingOutputs(std::string& error);
+
+    // Runs the engine and returns its outputs, or nullopt if it could not
+    // run. On nullopt the caller must simply return: this has already
+    // issued the appropriate verdict (a fatal failure propagates as-is,
+    // otherwise the test is SKIPped). Shared preamble for all three modes.
+    std::optional<OutputTensors> runEngineOrSkip();
+
     RefRunResult runReferenceCapturingOutputs(ReferenceExecutorType type,
                                               OutputTensors& refOutputs);
     void markOutputsModified(OutputTensors& outputs) const;
@@ -183,6 +222,11 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
                              float rtol);
 
     // ── reporting ───────────────────────────────────────────────────────
+    // Records the bundle path + reason in the process-wide
+    // UnverifiableBundleReport (printed as a summary after all tests),
+    // then GTEST_SKIP()s this test. The reason is a flat human-readable
+    // string — per-tensor details are concatenated into it by the caller
+    // (e.g., tracker.finish()), not stored as structured data.
     void skipUnverifiable(const std::string& reason);
     void recordRefError(const std::string& reason);
     static std::string refLabel(ReferenceExecutorType type);
@@ -218,9 +262,6 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
     static float
         toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
                              hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
-
-    // ── guards ──────────────────────────────────────────────────────────
-    void applyMetadataGuards() const;
 };
 
 } // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
index 1cd93dc80c4a..55a0e308dab8 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
@@ -37,9 +37,13 @@ using TensorMap = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::u
 //                      and the harness walks it (GraphWrapper) for dtypes and
 //                      tolerances. A bundle that cannot even produce a graph is a
 //                      LoadError, not a bundle.
-//   metadata         — .meta.json contents (VRAM / arch guards). MANDATORY: a
-//                      bundle without a valid .meta.json is a LoadError, so a
-//                      loaded bundle always carries real metadata.
+//   metadata         — .meta.json contents (VRAM / arch guards). Mandatory ONLY
+//                      for golden bundles (those shipping output .bin blobs);
+//                      metadata validates golden data, so a bundle without it is
+//                      a LoadError. For a no-golden bundle (graph-only, or
+//                      inputs-only verified against a reference) absent metadata
+//                      is valid and this is default-constructed (all fields
+//                      empty); the optional-aware consumers fall back to defaults.
 //   outputTensorUids — UIDs of the graph's output tensors, derived from the
 //                      graph. Always available (even for a graph-only bundle),
 //                      so the harness knows which tensors to compare / allocate.
@@ -78,7 +82,7 @@ enum class LoadError
 {
     MALFORMED_JSON, // the graph .json is not syntactically valid JSON
     INVALID_GRAPH_SCHEMA, // valid JSON, but not a valid graph (cannot build flatbuffer)
-    MISSING_METADATA, // required .meta.json companion is absent or invalid
+    MISSING_METADATA, // golden bundle's .meta.json companion is absent or invalid
     TENSOR_LOAD_FAILED // a tensor .bin is present but failed to load (wrong size,
     // unreadable, unsupported dtype, ...)
 };
@@ -163,7 +167,8 @@ inline std::vector<int64_t> allTensorUids(const nlohmann::json& graphJson)
 //
 //   * graph .json not parseable           -> LoadError::MALFORMED_JSON      (FAIL)
 //   * parseable but not a valid graph     -> LoadError::INVALID_GRAPH_SCHEMA(FAIL)
-//   * valid graph, no .meta.json companion-> LoadError::MISSING_METADATA    (FAIL)
+//   * golden bundle, no/invalid .meta.json-> LoadError::MISSING_METADATA    (FAIL)
+//   * no-golden bundle, no .meta.json      -> bundle, metadata default-constructed
 //   * valid graph, input .bin data absent -> bundle, tensors == nullopt     (tier-3:
 //                                            harness may synthesize, else SKIP)
 //   * valid graph, .bin present but broken-> LoadError::TENSOR_LOAD_FAILED  (FAIL)
@@ -209,22 +214,35 @@ inline LoadResult loadIntegrationTestBundle(const std::filesystem::path& jsonPat
         return LoadError::INVALID_GRAPH_SCHEMA;
     }
 
-    // 3. Metadata is MANDATORY: every valid-graph bundle must ship a valid
-    //    .meta.json companion. loadBundleMetadata returns nullopt both when the
-    //    file is absent and when it is present but invalid (bad JSON / bad
-    //    format_version) — either way it is an authoring error -> FAIL.
+    // 3. Capture the graph and derive the output UIDs (always available, even
+    //    for a graph-only bundle).
+    IntegrationTestBundle bundle;
+    bundle.graphBuffer = builder.Release();
+    bundle.outputTensorUids = hipdnn_test_sdk::utilities::getOutputTensorUidsFromGraph(graphJson);
+
+    // 4. Metadata is mandatory ONLY for golden bundles — those shipping output
+    //    .bin blobs. Metadata (arch lock, provenance, seed) exists to validate
+    //    golden data; a bundle with no golden outputs (pure graph-only, or
+    //    inputs-only verified against a reference) has nothing for it to
+    //    validate, so absent metadata is fine and we default-construct it.
+    //
+    //    loadBundleMetadata returns nullopt both when the .meta.json is absent
+    //    and when it is present but invalid (bad JSON / bad format_version). For
+    //    a golden bundle either case is an authoring error -> FAIL.
+    const bool goldenOutputsPresent
+        = !bundle.outputTensorUids.empty()
+          && detail::blobsPresentFor(bundle.outputTensorUids, jsonPath);
+
     auto metadata = hipdnn_test_sdk::utilities::loadBundleMetadata(jsonPath);
     if(!metadata.has_value())
     {
-        return LoadError::MISSING_METADATA;
+        if(goldenOutputsPresent)
+        {
+            return LoadError::MISSING_METADATA;
+        }
+        metadata.emplace(); // graph-only / no-golden: empty metadata is valid.
     }
-
-    // 4. Graph + metadata verified: capture them and the output UIDs (always
-    //    available, even for a graph-only bundle).
-    IntegrationTestBundle bundle;
-    bundle.graphBuffer = builder.Release();
     bundle.metadata = std::move(*metadata);
-    bundle.outputTensorUids = hipdnn_test_sdk::utilities::getOutputTensorUidsFromGraph(graphJson);
 
     // 5. Load tensor .bin data, inputs and outputs INDEPENDENTLY.
     //
@@ -258,8 +276,7 @@ inline LoadResult loadIntegrationTestBundle(const std::filesystem::path& jsonPat
         // A graph with no declared inputs cannot be fed; treat as graph-only.
         const bool inputsPresent
             = !inputUids.empty() && detail::blobsPresentFor(inputUids, jsonPath);
-        const bool outputsPresent = !bundle.outputTensorUids.empty()
-                                    && detail::blobsPresentFor(bundle.outputTensorUids, jsonPath);
+        const bool outputsPresent = goldenOutputsPresent; // computed in step 4
 
         if(inputsPresent)
         {
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
index e1ad75a80bb6..6838b0218397 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
@@ -22,9 +22,9 @@ namespace hipdnn_integration_tests::golden
 using InputTensorMap
     = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
 
-// Result of synthesizeNodeInputs() for one node. filled==true means every
-// input the node owns got valid data. filled==false means at least one could
-// not be synthesized — reason says which and why.
+// Result of a synthesis step — returned by per-node fill functions and by
+// tracker.finish(). filled==true means synthesis can proceed; filled==false
+// means at least one input could not be synthesized — reason says which and why.
 struct SynthesisResult
 {
     bool filled = false;
@@ -40,12 +40,50 @@ struct SynthesisResult
     }
 };
 
-// Tracks which inputs a node's fill function has accounted for. Each input must be
-// declared as one of three roles:
+// Tracks which leaf inputs of a bundle's graph have been accounted for by the
+// per-node fill functions. A bundle contains a graph of one or more nodes — a
+// single conv, or a fused chain like conv → bias_add → relu. One tracker is
+// created for the entire graph's leaf inputs (non-virtual, non-output tensors),
+// shared across all fill functions, and finish() is called once at the end.
+//
+// Graph structure (conv + bias + relu fused graph):
+//
+//   Data flows top-down. Roots are the leaf input tensors that the tracker
+//   owns; the sink is the graph output tensor.
+//
+//        x (root/leaf)  w (root/leaf)  bias (root/leaf)
+//         uid=1          uid=2           uid=4
+//           \             /                |
+//            \           /                 |
+//         ┌──────────────┐                 |
+//         │   ConvFwd    │  (internal)     |
+//         └──────┬───────┘                 |
+//                |                         |
+//          conv_y (virtual, uid=10)        |
+//                |                         |
+//                \                        /
+//              ┌──────────────────────┐
+//              │   Pointwise ADD      │  (internal)
+//              └──────────┬───────────┘
+//                         |
+//                   bias_out (virtual, uid=11)
+//                         |
+//              ┌──────────┴───────────┐
+//              │   Pointwise RELU     │  (internal)
+//              └──────────┬───────────┘
+//                         |
+//                    out (sink/leaf, uid=6)
+//
+//   Roots  = leaf input tensors, owned by tracker: {1, 2, 4}
+//   Virtual = inter-node edges, not owned → fillFree/markDerived skip them
+//   Sink   = graph output tensor, not owned
+//
+// Each leaf input must be declared as one of three mutually exclusive roles:
 //
 //   FREE       — random values in a range work. The range can be tight (e.g.
 //                variance in [0.5, 1.5] to stay positive) or wide (e.g. x in
 //                [-1, 1]). What matters is that any value in the range is valid.
+//
 //   STRUCTURED — random values in any range won't work. The data needs to be
 //                consistent with other state or follow a specific format.
 //
@@ -71,14 +109,33 @@ struct SynthesisResult
 //                partial results. The peer_stats tensor holds references to
 //                other GPUs' memory regions. Randomly generated values would
 //                point to invalid cross-device memory.
-// 
+//
 //   DERIVED    — the value must come from another op's output, not from random
-//                generation (e.g. a backward pass needs the forward pass's output
-//                tensor and intermediate statistics to compute correct gradients).
+//                generation. In a fused fwd+bwd graph the forward output flows
+//                to the backward input as a virtual tensor (not owned, silently
+//                skipped). In a standalone backward, the same tensor is a leaf
+//                input — markDerived records it, and finish() refuses because
+//                no forward pass produced it.
 //
-// finish() succeeds only when every owned input was declared as some role AND
-// none were STRUCTURED or DERIVED. Undeclared inputs and refused inputs both
+// finish() succeeds only when every owned leaf input was declared as some role
+// AND none were STRUCTURED or DERIVED. Undeclared inputs and refused inputs both
 // produce a diagnostic message so the caller knows what went wrong.
+//
+// PRECONDITION — a validated, well-formed graph. The tracker trusts the leaf
+// set it is handed and the virtual_ flag on every tensor:
+//
+//   * A required input referenced by a node is assumed to be a real leaf tensor
+//     (not mislabeled virtual or aliased to an output). If it were, fillFree
+//     would silently no-op on a non-owned uid and finish() would never see it.
+//   * A virtual tensor is assumed to genuinely have a producer node. A standalone
+//     backward whose `o`/`stats` were erroneously flagged virtual would skip the
+//     markDerived refusal and "succeed" with garbage.
+//
+// Both of those malformed-graph states are rejected upstream — at bundle load
+// (the flatbuffer build in loadIntegrationTestBundle) and again by the engine's
+// own graph validation (from_binary / check_support / build_plans), which
+// requires every virtual tensor to have a producer. By the time synthesis runs,
+// the graph is well-formed, so the tracker does not re-validate topology.
 class SynthesisTracker
 {
 public:
@@ -122,13 +179,13 @@ class SynthesisTracker
         _refusals.push_back(std::string(role) + " (derived from another computation)");
     }
 
-    // Returns ok() when all owned inputs were filled with random data.
+    // Returns ok() when all owned leaf inputs were filled with random data.
     // Returns unsupported() when synthesis cannot produce valid data for
-    // this node — either because an owned input is STRUCTURED/DERIVED
-    // (we know about it but can't fill it), or because an owned input was
-    // never declared (the fill function forgot about it).
-    // Note: absent optional tensors (uid 0) and virtual tensors are not
-    // owned, so STRUCTURED/DERIVED calls on them are silently ignored.
+    // this graph — either because a leaf input is STRUCTURED/DERIVED
+    // (we know about it but can't fill it), or because a leaf input was
+    // never declared by any node's fill function.
+    // Note: absent optional tensors (uid 0) and virtual inter-node tensors
+    // are not owned, so STRUCTURED/DERIVED calls on them are silently ignored.
     SynthesisResult finish(const char* opName) const
     {
         std::vector<std::string> reasons = _refusals;
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
index a95b514edb09..ed777cdfa06f 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
@@ -347,11 +347,19 @@ inline SynthesisResult fillBlockScaleQuantizeInputs(
 
 // ── SDPA ──────────────────────────────────────────────────────────────────────
 
-// Q/K/V/mask/scale accept random values. The remaining inputs are STRUCTURED:
-// seq lengths encode actual sequence boundaries, page tables map to allocated
-// GPU memory chunks, block masks define sparse attention patterns, and dropout
-// seed/offset must match between forward and backward passes.
-// Most of these are optional — absent ones (uid 0) are silently ignored.
+// Q/K/V/mask accept random values, as does scale (the softmax multiplier, e.g.
+// 1/sqrt(head_dim) — any positive value is mathematically valid). The FP8/MX
+// descale/scale factors are STRUCTURED, NOT free: each must equal the actual
+// quantization factor used to produce its tensor's data. A random descale does
+// not break the engine-vs-reference comparison (both read the same shared value)
+// but it lets values drift out of FP8 range and saturate identically on both
+// sides — a vacuous pass that verifies nothing. We therefore refuse to fabricate
+// them, mirroring fillBlockScaleDequantizeInputs. Real FP8 coverage comes from
+// authored bundles that ship the matching scales as data. The remaining inputs
+// are STRUCTURED for their own reasons: seq lengths encode actual sequence
+// boundaries, page tables map to allocated GPU memory chunks, block masks define
+// sparse attention patterns, and dropout seed/offset must match between fwd and
+// bwd. Most of these are optional — absent ones (uid 0) are silently ignored.
 inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
                                          SynthesisTracker& tracker,
                                          std::mt19937& rng)
@@ -368,6 +376,15 @@ inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_
     tracker.fillFree(a->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
     tracker.fillFree(a->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
 
+    // FP8/MX quantization scale factors must match the data's true scale — see
+    // the header comment. Refuse rather than fabricate a meaningless value.
+    tracker.markStructured(a->descale_q_tensor_uid().value_or(0), "descale_q");
+    tracker.markStructured(a->descale_k_tensor_uid().value_or(0), "descale_k");
+    tracker.markStructured(a->descale_v_tensor_uid().value_or(0), "descale_v");
+    tracker.markStructured(a->descale_s_tensor_uid().value_or(0), "descale_s");
+    tracker.markStructured(a->scale_s_tensor_uid().value_or(0), "scale_s");
+    tracker.markStructured(a->scale_o_tensor_uid().value_or(0), "scale_o");
+
     tracker.markStructured(a->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
     tracker.markStructured(a->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
     tracker.markStructured(a->page_table_k_tensor_uid().value_or(0), "page_table_k");
@@ -398,10 +415,19 @@ inline SynthesisResult fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data
     tracker.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
     tracker.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
     tracker.fillFree(a->do_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+    tracker.fillFree(a->dropout_scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+    tracker.fillFree(a->dropout_scale_inv_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+    tracker.fillFree(a->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
 
     tracker.markDerived(a->o_tensor_uid(), "o (forward output)");
     tracker.markDerived(a->stats_tensor_uid(), "stats (forward softmax stats)");
 
+    tracker.markStructured(a->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
+    tracker.markStructured(a->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
+    tracker.markStructured(a->seed_tensor_uid().value_or(0), "dropout_seed");
+    tracker.markStructured(a->offset_tensor_uid().value_or(0), "dropout_offset");
+
     return SynthesisResult::ok();
 }
 
diff --git a/dnn-providers/integration-tests/tests/CMakeLists.txt b/dnn-providers/integration-tests/tests/CMakeLists.txt
index c2dba6f219d0..705b0bee96e7 100644
--- a/dnn-providers/integration-tests/tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/tests/CMakeLists.txt
@@ -20,6 +20,7 @@ add_executable(hipdnn_integration_tests_unit_tests
     TestVerificationPaths.cpp
     TestGoldenVerificationHarness.cpp
     TestSynthesisTracker.cpp
+    TestSynthesizeInputs.cpp
     TestVerificationModePaths.cpp
 )
 
diff --git a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
index f3b8b8179c8d..020990dc3375 100644
--- a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
+++ b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
@@ -63,9 +63,10 @@ class TestBundleDiscoveryFixture : public ::testing::Test
                R"("compute_data_type": "float", "intermediate_data_type": "float", "name": ""})";
     }
 
-    // Writes a valid {name}.meta.json companion. Metadata is mandatory for any
-    // bundle expected to load successfully (loadIntegrationTestBundle returns
-    // LoadError::MISSING_METADATA without it).
+    // Writes a valid {name}.meta.json companion. Metadata is mandatory for a
+    // golden bundle (one shipping output .bin blobs) — loadIntegrationTestBundle
+    // returns LoadError::MISSING_METADATA for those without it — and optional for
+    // a no-golden / graph-only bundle.
     static void writeMetadata(const std::filesystem::path& dir, const std::string& name)
     {
         std::ofstream(dir / (name + ".meta.json"))
@@ -339,14 +340,33 @@ TEST_F(TestBundleDiscoveryFixture, LoadBundlePopulatesMetadataWhenPresent)
     EXPECT_EQ(*bundle.metadata.seed, 42);
 }
 
-// A valid-graph bundle WITHOUT a .meta.json companion is a load error: metadata
-// is mandatory.
-TEST_F(TestBundleDiscoveryFixture, LoadBundleMissingMetadataIsError)
+// A graph-only bundle (no .bin blobs, hence no golden data) without a .meta.json
+// companion loads successfully: metadata validates golden data, and there is
+// none here, so absent metadata is valid and default-constructed.
+TEST_F(TestBundleDiscoveryFixture, LoadGraphOnlyBundleMissingMetadataLoads)
 {
     auto dir = _tempDir / "op" / "nometa";
-    createMinimalBundle(dir, "nometa"); // graph only, no .meta.json
+    createMinimalBundle(dir, "nometa"); // graph only, no .meta.json, no .bin
     const auto jsonPath = dir / "nometa.json";
 
+    auto result = loadIntegrationTestBundle(jsonPath);
+    ASSERT_TRUE(std::holds_alternative<IntegrationTestBundle>(result));
+    const auto& bundle = std::get<IntegrationTestBundle>(result);
+
+    EXPECT_FALSE(bundle.tensors.has_value());      // graph-only: no tensor data
+    EXPECT_FALSE(bundle.hasGoldenOutputs);
+    EXPECT_FALSE(bundle.metadata.operation.has_value()); // default-constructed
+}
+
+// A GOLDEN bundle (output .bin blobs present) WITHOUT a .meta.json companion is
+// a load error: metadata is mandatory whenever there is golden data to validate.
+TEST_F(TestBundleDiscoveryFixture, LoadGoldenBundleMissingMetadataIsError)
+{
+    auto dir = _tempDir / "op" / "goldennometa";
+    createLoadableBundle(dir, "goldennometa"); // writes .bin (inputs+outputs) + meta
+    std::filesystem::remove(dir / "goldennometa.meta.json"); // drop the metadata
+    const auto jsonPath = dir / "goldennometa.json";
+
     auto result = loadIntegrationTestBundle(jsonPath);
     ASSERT_TRUE(std::holds_alternative<LoadError>(result));
     EXPECT_EQ(std::get<LoadError>(result), LoadError::MISSING_METADATA);
@@ -359,7 +379,7 @@ TEST_F(TestBundleDiscoveryFixture, LoadBundleMissingBinIsGraphOnly)
 {
     auto dir = _tempDir / "op" / "nobin";
     createMinimalBundle(dir, "nobin");
-    writeMetadata(dir, "nobin"); // metadata is mandatory even for graph-only
+    writeMetadata(dir, "nobin"); // metadata present (optional here, but exercised)
     const auto jsonPath = dir / "nobin.json";
 
     auto result = loadIntegrationTestBundle(jsonPath);
diff --git a/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp b/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
new file mode 100644
index 000000000000..f22834177792
--- /dev/null
+++ b/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
@@ -0,0 +1,370 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <set>
+#include <vector>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
+
+#include "harness/golden/input_init/SynthesizeInputs.hpp"
+
+// NOLINTBEGIN(readability-identifier-naming)
+
+using namespace hipdnn_flatbuffers_sdk::data_objects;
+using namespace hipdnn_integration_tests::golden;
+
+namespace
+{
+
+const std::vector<int64_t> kDims    = {2, 3};
+const std::vector<int64_t> kStrides = {3, 1};
+
+InputTensorMap makeTensors(const std::vector<int64_t>& uids)
+{
+    InputTensorMap map;
+    for(const int64_t uid : uids)
+    {
+        map[uid] = std::make_unique<hipdnn_data_sdk::utilities::Tensor<float>>(kDims, kStrides);
+        map[uid]->fillTensorWithValue(0.f);
+    }
+    return map;
+}
+
+struct GraphResult
+{
+    flatbuffers::FlatBufferBuilder builder;
+    const Graph* graph = nullptr;
+
+    const Node& node(uint32_t i) const
+    {
+        return *graph->nodes()->Get(i);
+    }
+
+    std::vector<int64_t> leafInputUids(const std::set<int64_t>& outputUids) const
+    {
+        std::vector<int64_t> uids;
+        for(const auto* t : *graph->tensors())
+        {
+            if(!t->virtual_() && outputUids.count(t->uid()) == 0)
+            {
+                uids.push_back(t->uid());
+            }
+        }
+        return uids;
+    }
+};
+
+// ── Conv fwd (single node) ──────────────────────────────────────────────────
+
+GraphResult buildConvFwdGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "y", DataType::FLOAT, &kStrides, &kDims));
+
+    auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 3);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(
+        CreateNodeDirect(b, "conv", DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+
+    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── Conv + bias (2-node fused) ──────────────────────────────────────────────
+// conv.y (uid 10) is virtual; bias (uid 4) is leaf
+
+GraphResult buildConvBiasGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x",        DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w",        DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 10, "conv_y",  DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "bias",     DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 5, "out",      DataType::FLOAT, &kStrides, &kDims));
+
+    auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 10);
+    auto add  = CreatePointwiseAttributes(b, PointwiseMode::ADD,
+                                          flatbuffers::nullopt, flatbuffers::nullopt,
+                                          flatbuffers::nullopt, flatbuffers::nullopt,
+                                          10, 4, flatbuffers::nullopt, 5);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(b, "conv",     DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+    nodes.push_back(CreateNodeDirect(b, "bias_add", DataType::FLOAT, NodeAttributes::PointwiseAttributes, add.Union()));
+
+    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── Conv + bias + relu (3-node fused) ───────────────────────────────────────
+// conv.y (uid 10) virtual, bias_add.out (uid 11) virtual, relu.in_0=uid 11, relu.out_0=uid 6
+
+GraphResult buildConvBiasReluGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x",          DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w",          DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 10, "conv_y",    DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "bias",       DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 11, "bias_out",  DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(CreateTensorAttributesDirect(b, 6, "out",        DataType::FLOAT, &kStrides, &kDims));
+
+    auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 10);
+    auto add  = CreatePointwiseAttributes(b, PointwiseMode::ADD,
+                                          flatbuffers::nullopt, flatbuffers::nullopt,
+                                          flatbuffers::nullopt, flatbuffers::nullopt,
+                                          10, 4, flatbuffers::nullopt, 11);
+    auto relu = CreatePointwiseAttributes(b, PointwiseMode::RELU_FWD,
+                                          flatbuffers::nullopt, flatbuffers::nullopt,
+                                          flatbuffers::nullopt, flatbuffers::nullopt,
+                                          11, flatbuffers::nullopt, flatbuffers::nullopt, 6);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(b, "conv",     DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+    nodes.push_back(CreateNodeDirect(b, "bias_add", DataType::FLOAT, NodeAttributes::PointwiseAttributes, add.Union()));
+    nodes.push_back(CreateNodeDirect(b, "relu",     DataType::FLOAT, NodeAttributes::PointwiseAttributes, relu.Union()));
+
+    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── SDPA forward (no structured optionals) ──────────────────────────────────
+
+GraphResult buildSdpaFwdGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o", DataType::FLOAT, &kStrides, &kDims));
+
+    auto sdpa = CreateSdpaAttributes(b, 1, 2, 3, 4);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, sdpa.Union()));
+
+    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── SDPA forward with structured seq_len_q ──────────────────────────────────
+
+GraphResult buildSdpaFwdWithStructuredGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q",         DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k",         DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v",         DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o",         DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 5, "seq_len_q", DataType::FLOAT, &kStrides, &kDims));
+
+    auto sdpa = CreateSdpaAttributes(b, 1, 2, 3, 4,
+                                     flatbuffers::nullopt, // attn_mask
+                                     flatbuffers::nullopt, // scale
+                                     5);                   // seq_len_q
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, sdpa.Union()));
+
+    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── SDPA backward standalone ────────────────────────────────────────────────
+// O and stats are leaf inputs (not virtual) → DERIVED → refuses
+
+GraphResult buildSdpaBwdStandaloneGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q",     DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k",     DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v",     DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o",     DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 5, "do",    DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 6, "stats", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 7, "dq",    DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 8, "dk",    DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 9, "dv",    DataType::FLOAT, &kStrides, &kDims));
+
+    auto bwd = CreateSdpaBackwardAttributes(b, 1, 2, 3, 4, 5, 6, 7, 8, 9);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(b, "sdpa_bwd", DataType::FLOAT, NodeAttributes::SdpaBackwardAttributes, bwd.Union()));
+
+    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── SDPA fwd+bwd fused ─────────────────────────────────────────────────────
+// O (uid 10) and stats (uid 11) are virtual inter-node tensors.
+// Leaf inputs: Q(1), K(2), V(3) from fwd + dO(5) from bwd.
+// Outputs: dQ(7), dK(8), dV(9).
+
+GraphResult buildSdpaFwdBwdFusedGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1,  "q",     DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2,  "k",     DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3,  "v",     DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 10, "o",     DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(CreateTensorAttributesDirect(b, 11, "stats", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(CreateTensorAttributesDirect(b, 5,  "do",    DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 7,  "dq",    DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 8,  "dk",    DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 9,  "dv",    DataType::FLOAT, &kStrides, &kDims));
+
+    auto fwd = CreateSdpaAttributes(b, 1, 2, 3, 10,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    flatbuffers::nullopt, flatbuffers::nullopt,
+                                    11); // stats_tensor_uid
+
+    auto bwd = CreateSdpaBackwardAttributes(b, 1, 2, 3, 10, 5, 11, 7, 8, 9);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, fwd.Union()));
+    nodes.push_back(CreateNodeDirect(b, "sdpa_bwd", DataType::FLOAT, NodeAttributes::SdpaBackwardAttributes, bwd.Union()));
+
+    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+SynthesisResult runSynthesis(const GraphResult& gr, const std::set<int64_t>& outputUids)
+{
+    const auto leafUids = gr.leafInputUids(outputUids);
+    auto inputs         = makeTensors(leafUids);
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(leafUids, inputs);
+    for(uint32_t i = 0; i < gr.graph->nodes()->size(); ++i)
+    {
+        const SynthesisResult nodeResult = synthesizeNodeInputs(*gr.graph->nodes()->Get(i), tracker, rng);
+        if(!nodeResult.filled)
+        {
+            return nodeResult;
+        }
+    }
+    return tracker.finish("test");
+}
+
+} // namespace
+
+// ── Test cases ──────────────────────────────────────────────────────────────
+
+TEST(TestSynthesizeInputs, SingleConvFwd)
+{
+    const auto gr     = buildConvFwdGraph();
+    const auto result = runSynthesis(gr, {3});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+TEST(TestSynthesizeInputs, ConvPlusBiasFused)
+{
+    const auto gr     = buildConvBiasGraph();
+    const auto result = runSynthesis(gr, {5});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+TEST(TestSynthesizeInputs, ConvPlusBiasPlusReluFused)
+{
+    const auto gr     = buildConvBiasReluGraph();
+    const auto result = runSynthesis(gr, {6});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+TEST(TestSynthesizeInputs, SdpaFwdNoStructuredOptionals)
+{
+    const auto gr     = buildSdpaFwdGraph();
+    const auto result = runSynthesis(gr, {4});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+TEST(TestSynthesizeInputs, SdpaFwdWithStructuredInputRefuses)
+{
+    const auto gr     = buildSdpaFwdWithStructuredGraph();
+    const auto result = runSynthesis(gr, {4});
+
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("seq_len_q"), std::string::npos);
+    EXPECT_NE(result.reason.find("structured"), std::string::npos);
+}
+
+TEST(TestSynthesizeInputs, SdpaBwdStandaloneRefusesDerived)
+{
+    const auto gr     = buildSdpaBwdStandaloneGraph();
+    const auto result = runSynthesis(gr, {7, 8, 9});
+
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("derived"), std::string::npos);
+}
+
+TEST(TestSynthesizeInputs, SdpaFwdBwdFusedSucceeds)
+{
+    const auto gr     = buildSdpaFwdBwdFusedGraph();
+    const auto result = runSynthesis(gr, {7, 8, 9});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+// NOLINTEND(readability-identifier-naming)
diff --git a/dnn-providers/integration-tests/tests/TestTestConfig.cpp b/dnn-providers/integration-tests/tests/TestTestConfig.cpp
index 63370bfc8120..f6da2da519f8 100644
--- a/dnn-providers/integration-tests/tests/TestTestConfig.cpp
+++ b/dnn-providers/integration-tests/tests/TestTestConfig.cpp
@@ -69,7 +69,7 @@ TEST(TestConfigUninitialized, GetReferenceExecutorTypeThrowsWhenUninitialized)
 
 // parseVerificationMode is a free function (no singleton state), so it can be
 // exercised regardless of initialization.
-TEST(ParseVerificationMode, AcceptsAllValidValuesCaseInsensitive)
+TEST(TestParseVerificationMode, AcceptsAllValidValuesCaseInsensitive)
 {
     using hipdnn_integration_tests::parseVerificationMode;
     using hipdnn_integration_tests::VerificationMode;
@@ -84,7 +84,7 @@ TEST(ParseVerificationMode, AcceptsAllValidValuesCaseInsensitive)
     EXPECT_EQ(parseVerificationMode("GPU"), VerificationMode::GPU);
 }
 
-TEST(ParseVerificationMode, ThrowsOnInvalidValue)
+TEST(TestParseVerificationMode, ThrowsOnInvalidValue)
 {
     EXPECT_THROW(hipdnn_integration_tests::parseVerificationMode("bogus"), std::runtime_error);
     EXPECT_THROW(hipdnn_integration_tests::parseVerificationMode(""), std::runtime_error);
@@ -94,7 +94,7 @@ TEST(ParseVerificationMode, ThrowsOnInvalidValue)
 // implement the "CLI wins, then env, then nullopt" precedence chain.
 // They don't touch the singleton so they can be tested freely.
 
-TEST(ResolveVerificationMode, CliValueWinsOverEnv)
+TEST(TestResolveVerificationMode, CliValueWinsOverEnv)
 {
     using hipdnn_integration_tests::resolveVerificationMode;
     using hipdnn_integration_tests::VerificationMode;
@@ -105,7 +105,7 @@ TEST(ResolveVerificationMode, CliValueWinsOverEnv)
     EXPECT_EQ(*result, VerificationMode::GPU);
 }
 
-TEST(ResolveVerificationMode, NulloptCliWithoutEnvReturnsNullopt)
+TEST(TestResolveVerificationMode, NulloptCliWithoutEnvReturnsNullopt)
 {
     using hipdnn_integration_tests::resolveVerificationMode;
 
@@ -114,7 +114,7 @@ TEST(ResolveVerificationMode, NulloptCliWithoutEnvReturnsNullopt)
     EXPECT_FALSE(result.has_value());
 }
 
-TEST(ResolveGoldenDataDir, CliValueWinsOverEnv)
+TEST(TestResolveGoldenDataDir, CliValueWinsOverEnv)
 {
     using hipdnn_integration_tests::resolveGoldenDataDir;
 
@@ -124,7 +124,7 @@ TEST(ResolveGoldenDataDir, CliValueWinsOverEnv)
     EXPECT_EQ(*result, cliPath);
 }
 
-TEST(ResolveGoldenDataDir, NulloptCliWithoutEnvReturnsNullopt)
+TEST(TestResolveGoldenDataDir, NulloptCliWithoutEnvReturnsNullopt)
 {
     using hipdnn_integration_tests::resolveGoldenDataDir;
 
diff --git a/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp b/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
index 2b34be263657..b2e6becab76a 100644
--- a/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
+++ b/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
@@ -76,6 +76,11 @@ class ModeTestableHarness : public IntegrationGraphGoldenReferenceVerificationHa
         return nullptr;
     }
 
+    // These tests exercise verification-mode dispatch, not the VRAM/arch
+    // hardware guards. Override to a no-op so they don't reach into the
+    // (uninitialized-in-this-binary) TestConfig singleton.
+    void applyMetadataGuards() const override {}
+
 private:
     VerificationMode _mode;
     EngineStub _engineStub;
@@ -277,7 +282,7 @@ TEST_F(TestVerificationModePathsFixture, AutoWithGoldenMismatchFails)
     EXPECT_TRUE(anyFailed(results));
 }
 
-TEST_F(TestVerificationModePathsFixture, AutoNoGoldenGpuRefSucceedsPasses)
+TEST_F(TestVerificationModePathsFixture, AutoNoGoldenRefSucceedsPasses)
 {
     ::testing::TestPartResultArray results;
     runCapturing(loadBundle("auto_gpu", /*includeGoldenOutput=*/false),
@@ -290,7 +295,7 @@ TEST_F(TestVerificationModePathsFixture, AutoNoGoldenGpuRefSucceedsPasses)
     EXPECT_FALSE(anySkipped(results));
 }
 
-TEST_F(TestVerificationModePathsFixture, AutoNoGoldenGpuMissFallsThroughToCpu)
+TEST_F(TestVerificationModePathsFixture, AutoNoGoldenRefMissFallsThroughToCpu)
 {
     ::testing::TestPartResultArray results;
     runCapturing(loadBundle("auto_fallthrough", /*includeGoldenOutput=*/false),
@@ -345,8 +350,11 @@ TEST_F(TestVerificationModePathsFixture, GoldenModeWithoutDataSkips)
 }
 
 // ── Explicit GPU mode ───────────────────────────────────────────────────────
+// "Device" in these case names denotes VerificationMode::GPU (the device-side
+// reference executor). The literal "Gpu" keyword is reserved by the test-name
+// linter for the suite name and so cannot appear in the case name.
 
-TEST_F(TestVerificationModePathsFixture, GpuModeRefSucceedsPasses)
+TEST_F(TestVerificationModePathsFixture, DeviceModeRefSucceedsPasses)
 {
     ::testing::TestPartResultArray results;
     runCapturing(loadBundle("gpu_ok", /*includeGoldenOutput=*/true),
@@ -359,7 +367,7 @@ TEST_F(TestVerificationModePathsFixture, GpuModeRefSucceedsPasses)
     EXPECT_FALSE(anySkipped(results));
 }
 
-TEST_F(TestVerificationModePathsFixture, GpuModeCapabilityMissSkips)
+TEST_F(TestVerificationModePathsFixture, DeviceModeCapabilityMissSkips)
 {
     ::testing::TestPartResultArray results;
     runCapturing(loadBundle("gpu_miss", /*includeGoldenOutput=*/true),

From d05a68aa44e877f6b9690d8608b073dbeac05ced Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 12:51:14 -0400
Subject: [PATCH 11/18] Fix uid=0 rejection in SynthesisTracker and no-GPU
 std::bad_alloc in ref path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove uid!=0 guard from SynthesisTracker::isOwned() — uid=0 is a
  valid tensor uid when the graph declares it as a real leaf input.
  The "absent optional" convention (uid 0 = not connected) is the
  fill function's responsibility, not the tracker's.
- Guard ref-path device allocation on _requiresDevice so CI runners
  without a GPU don't trigger std::bad_alloc from rawDeviceData().

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ...ntegrationGraphGoldenReferenceVerificationHarness.cpp | 2 +-
 .../src/harness/golden/input_init/SynthesisTracker.hpp   | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
index 7a0e49f46beb..a0b438db7937 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -428,7 +428,7 @@ IntegrationGraphGoldenReferenceVerificationHarness::RefRunResult
         ReferenceExecutorType type, OutputTensors& refOutputs)
 {
     refOutputs = allocateSentinelOutputs();
-    const bool useDevice = (type == ReferenceExecutorType::GPU);
+    const bool useDevice = _requiresDevice && (type == ReferenceExecutorType::GPU);
     auto variantPack = buildVariantPack(refOutputs, useDevice);
 
     try
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
index 6838b0218397..87dbe9fe8e4b 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
@@ -184,8 +184,11 @@ class SynthesisTracker
     // this graph — either because a leaf input is STRUCTURED/DERIVED
     // (we know about it but can't fill it), or because a leaf input was
     // never declared by any node's fill function.
-    // Note: absent optional tensors (uid 0) and virtual inter-node tensors
-    // are not owned, so STRUCTURED/DERIVED calls on them are silently ignored.
+    // Note: virtual inter-node tensors are not owned, so STRUCTURED/DERIVED
+    // calls on them are silently ignored. Absent optional tensors (uid 0 by
+    // hipdnn convention) are the caller's responsibility — fill functions
+    // should guard against calling fillFree/markStructured on uid 0 when the
+    // attribute means "not present."
     SynthesisResult finish(const char* opName) const
     {
         std::vector<std::string> reasons = _refusals;
@@ -215,7 +218,7 @@ class SynthesisTracker
 private:
     bool isOwned(int64_t uid) const
     {
-        return uid != 0 && _owned.count(uid) != 0;
+        return _owned.count(uid) != 0;
     }
 
     InputTensorMap& _inputs; // leaf inputs only (non-virtual, non-output tensors)

From b0673106733c5140c7783b0d5ea784ebb1823c3e Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 13:06:20 -0400
Subject: [PATCH 12/18] Apply clang-format to CI-flagged files

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ...raphGoldenReferenceVerificationHarness.cpp |  35 ++-
 ...raphGoldenReferenceVerificationHarness.hpp |   8 +-
 .../harness/golden/IntegrationTestBundle.hpp  |   5 +-
 .../golden/input_init/SynthesizeInputs.hpp    | 118 ++++-----
 .../tests/TestBundleDiscovery.cpp             |   2 +-
 .../tests/TestSynthesizeInputs.cpp            | 244 +++++++++++-------
 6 files changed, 236 insertions(+), 176 deletions(-)

diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
index a0b438db7937..d38ed8340580 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -167,8 +167,7 @@ void skipEngineCouldNotRun(const std::filesystem::path& bundlePath, const std::s
 }
 } // namespace
 
-std::optional<OutputTensors>
-    IntegrationGraphGoldenReferenceVerificationHarness::runEngineOrSkip()
+std::optional<OutputTensors> IntegrationGraphGoldenReferenceVerificationHarness::runEngineOrSkip()
 {
     std::string error;
     auto engineOutputs = runEngineCapturingOutputs(error);
@@ -265,14 +264,13 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runAutoMode()
         switch(cpu.status)
         {
         case RefStatus::CAPABILITY_MISS:
-            skipUnverifiable(
-                gpuRefErrored
-                    ? "no usable reference (golden absent; GPU ref errored, CPU ref "
-                      "cannot run this op; see reference-error report): "
-                          + cpu.message
-                    : "no reference available (golden absent; GPU and CPU ref "
-                      "cannot run this op): "
-                          + cpu.message);
+            skipUnverifiable(gpuRefErrored
+                                 ? "no usable reference (golden absent; GPU ref errored, CPU ref "
+                                   "cannot run this op; see reference-error report): "
+                                       + cpu.message
+                                 : "no reference available (golden absent; GPU and CPU ref "
+                                   "cannot run this op): "
+                                       + cpu.message);
             return;
         case RefStatus::RUNTIME_ERROR:
             recordRefError("CPU reference errored (auto mode, last resort): " + cpu.message);
@@ -319,8 +317,8 @@ bool IntegrationGraphGoldenReferenceVerificationHarness::synthesizeInputs()
         allLeafInputUids.push_back(uid);
     }
 
-    std::mt19937 rng(static_cast<std::mt19937::result_type>(
-        _bundle->metadata.seed.value_or(K_DEFAULT_SEED)));
+    std::mt19937 rng(
+        static_cast<std::mt19937::result_type>(_bundle->metadata.seed.value_or(K_DEFAULT_SEED)));
 
     SynthesisTracker tracker(allLeafInputUids, inputs);
     for(uint32_t i = 0; i < wrapper.nodeCount(); ++i)
@@ -363,8 +361,7 @@ OutputTensors IntegrationGraphGoldenReferenceVerificationHarness::allocateSentin
     OutputTensors outputs;
     for(const int64_t uid : _bundle->outputTensorUids)
     {
-        outputs[uid]
-            = hipdnn_test_sdk::detail::createTensorFromAttribute(*tensorAttrMap.at(uid));
+        outputs[uid] = hipdnn_test_sdk::detail::createTensorFromAttribute(*tensorAttrMap.at(uid));
         outputs[uid]->fillWithSentinelValue();
     }
     return outputs;
@@ -394,7 +391,8 @@ std::unordered_map<int64_t, void*>
 }
 
 std::optional<OutputTensors>
-    IntegrationGraphGoldenReferenceVerificationHarness::runEngineCapturingOutputs(std::string& error)
+    IntegrationGraphGoldenReferenceVerificationHarness::runEngineCapturingOutputs(
+        std::string& error)
 {
     OutputTensors engineOutputs = allocateSentinelOutputs();
     auto variantPack = buildVariantPack(engineOutputs, /*useDevice=*/_requiresDevice);
@@ -489,8 +487,8 @@ void IntegrationGraphGoldenReferenceVerificationHarness::compareOutputs(
 }
 
 template <typename ExpectedLookup>
-void IntegrationGraphGoldenReferenceVerificationHarness::compareEach(
-    OutputTensors& engineOutputs, ExpectedLookup expectedFor)
+void IntegrationGraphGoldenReferenceVerificationHarness::compareEach(OutputTensors& engineOutputs,
+                                                                     ExpectedLookup expectedFor)
 {
     auto wrapper = _bundle->graphWrapper();
     const auto& tensorAttrMap = wrapper.getTensorMap();
@@ -526,8 +524,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::recordRefError(const st
         _bundlePath.string(), reason, UnverifiableSeverity::REF_ERROR);
 }
 
-std::string IntegrationGraphGoldenReferenceVerificationHarness::refLabel(
-    ReferenceExecutorType type)
+std::string IntegrationGraphGoldenReferenceVerificationHarness::refLabel(ReferenceExecutorType type)
 {
     return type == ReferenceExecutorType::GPU ? "GPU reference" : "CPU reference";
 }
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
index b0fe8a6f8937..bfa45b2fed09 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
@@ -232,8 +232,7 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
     static std::string refLabel(ReferenceExecutorType type);
 
     static std::string
-        labelFor(int64_t uid,
-                 const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs);
+        labelFor(int64_t uid, const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs);
 
     std::string reportHeader(int64_t uid,
                              const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
@@ -259,9 +258,8 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
         const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
         hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
 
-    static float
-        toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
-                             hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
+    static float toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
+                                      hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
 };
 
 } // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
index 55a0e308dab8..84529b4060d6 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
@@ -229,9 +229,8 @@ inline LoadResult loadIntegrationTestBundle(const std::filesystem::path& jsonPat
     //    loadBundleMetadata returns nullopt both when the .meta.json is absent
     //    and when it is present but invalid (bad JSON / bad format_version). For
     //    a golden bundle either case is an authoring error -> FAIL.
-    const bool goldenOutputsPresent
-        = !bundle.outputTensorUids.empty()
-          && detail::blobsPresentFor(bundle.outputTensorUids, jsonPath);
+    const bool goldenOutputsPresent = !bundle.outputTensorUids.empty()
+                                      && detail::blobsPresentFor(bundle.outputTensorUids, jsonPath);
 
     auto metadata = hipdnn_test_sdk::utilities::loadBundleMetadata(jsonPath);
     if(!metadata.has_value())
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
index ed777cdfa06f..df5bc348f24d 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
@@ -34,8 +34,8 @@ namespace hipdnn_integration_tests::golden
 // ── Convolution ───────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillConvFwdInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                     SynthesisTracker& tracker,
-                                     std::mt19937& rng)
+                                         SynthesisTracker& tracker,
+                                         std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionFwdAttributes();
     if(a == nullptr)
@@ -48,8 +48,8 @@ inline SynthesisResult fillConvFwdInputs(const hipdnn_flatbuffers_sdk::data_obje
 }
 
 inline SynthesisResult fillConvBwdDataInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                         SynthesisTracker& tracker,
-                                         std::mt19937& rng)
+                                             SynthesisTracker& tracker,
+                                             std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionBwdAttributes();
     if(a == nullptr)
@@ -61,9 +61,10 @@ inline SynthesisResult fillConvBwdDataInputs(const hipdnn_flatbuffers_sdk::data_
     return SynthesisResult::ok();
 }
 
-inline SynthesisResult fillConvBwdWeightsInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                            SynthesisTracker& tracker,
-                                            std::mt19937& rng)
+inline SynthesisResult
+    fillConvBwdWeightsInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                             SynthesisTracker& tracker,
+                             std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ConvolutionWrwAttributes();
     if(a == nullptr)
@@ -77,10 +78,10 @@ inline SynthesisResult fillConvBwdWeightsInputs(const hipdnn_flatbuffers_sdk::da
 
 // ── Batchnorm ─────────────────────────────────────────────────────────────────
 
-inline SynthesisResult fillBatchnormInferenceInputs(
-    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    SynthesisTracker& tracker,
-    std::mt19937& rng)
+inline SynthesisResult
+    fillBatchnormInferenceInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                 SynthesisTracker& tracker,
+                                 std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormInferenceAttributes();
     if(a == nullptr)
@@ -95,10 +96,10 @@ inline SynthesisResult fillBatchnormInferenceInputs(
     return SynthesisResult::ok();
 }
 
-inline SynthesisResult fillBatchnormInferenceVarianceInputs(
-    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    SynthesisTracker& tracker,
-    std::mt19937& rng)
+inline SynthesisResult
+    fillBatchnormInferenceVarianceInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                         SynthesisTracker& tracker,
+                                         std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormInferenceAttributesVarianceExt();
     if(a == nullptr)
@@ -116,10 +117,10 @@ inline SynthesisResult fillBatchnormInferenceVarianceInputs(
 
 // peer_stats holds references to other GPUs' memory for multi-GPU batchnorm —
 // randomly generated values would point to invalid cross-device memory.
-inline SynthesisResult fillBatchnormTrainingInputs(
-    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    SynthesisTracker& tracker,
-    std::mt19937& rng)
+inline SynthesisResult
+    fillBatchnormTrainingInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                SynthesisTracker& tracker,
+                                std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormAttributes();
     if(a == nullptr)
@@ -146,10 +147,10 @@ inline SynthesisResult fillBatchnormTrainingInputs(
 }
 
 // mean/inv_variance are optional (may come from forward). peer_stats: see above.
-inline SynthesisResult fillBatchnormBackwardInputs(
-    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    SynthesisTracker& tracker,
-    std::mt19937& rng)
+inline SynthesisResult
+    fillBatchnormBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                SynthesisTracker& tracker,
+                                std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BatchnormBackwardAttributes();
     if(a == nullptr)
@@ -176,8 +177,8 @@ inline SynthesisResult fillBatchnormBackwardInputs(
 // ── Matmul ────────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillMatmulInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                    SynthesisTracker& tracker,
-                                    std::mt19937& rng)
+                                        SynthesisTracker& tracker,
+                                        std::mt19937& rng)
 {
     const auto* a = node.attributes_as_MatmulAttributes();
     if(a == nullptr)
@@ -192,8 +193,8 @@ inline SynthesisResult fillMatmulInputs(const hipdnn_flatbuffers_sdk::data_objec
 // ── Pointwise ─────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillPointwiseInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                       SynthesisTracker& tracker,
-                                       std::mt19937& rng)
+                                           SynthesisTracker& tracker,
+                                           std::mt19937& rng)
 {
     const auto* a = node.attributes_as_PointwiseAttributes();
     if(a == nullptr)
@@ -210,8 +211,8 @@ inline SynthesisResult fillPointwiseInputs(const hipdnn_flatbuffers_sdk::data_ob
 // ── Reduction ─────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillReductionInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                       SynthesisTracker& tracker,
-                                       std::mt19937& rng)
+                                           SynthesisTracker& tracker,
+                                           std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ReductionAttributes();
     if(a == nullptr)
@@ -225,8 +226,8 @@ inline SynthesisResult fillReductionInputs(const hipdnn_flatbuffers_sdk::data_ob
 // ── LayerNorm ─────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillLayernormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                       SynthesisTracker& tracker,
-                                       std::mt19937& rng)
+                                           SynthesisTracker& tracker,
+                                           std::mt19937& rng)
 {
     const auto* a = node.attributes_as_LayernormAttributes();
     if(a == nullptr)
@@ -242,10 +243,10 @@ inline SynthesisResult fillLayernormInputs(const hipdnn_flatbuffers_sdk::data_ob
 
 // mean and inv_variance are computed by the forward pass — a standalone backward
 // can't produce correct gradients without them.
-inline SynthesisResult fillLayernormBackwardInputs(
-    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    SynthesisTracker& tracker,
-    std::mt19937& rng)
+inline SynthesisResult
+    fillLayernormBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                SynthesisTracker& tracker,
+                                std::mt19937& rng)
 {
     const auto* a = node.attributes_as_LayernormBackwardAttributes();
     if(a == nullptr)
@@ -264,8 +265,8 @@ inline SynthesisResult fillLayernormBackwardInputs(
 // ── RMSNorm ───────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillRmsnormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                     SynthesisTracker& tracker,
-                                     std::mt19937& rng)
+                                         SynthesisTracker& tracker,
+                                         std::mt19937& rng)
 {
     const auto* a = node.attributes_as_RMSNormAttributes();
     if(a == nullptr)
@@ -280,10 +281,10 @@ inline SynthesisResult fillRmsnormInputs(const hipdnn_flatbuffers_sdk::data_obje
 }
 
 // inv_rms is computed by the forward pass.
-inline SynthesisResult fillRmsnormBackwardInputs(
-    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    SynthesisTracker& tracker,
-    std::mt19937& rng)
+inline SynthesisResult
+    fillRmsnormBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                              SynthesisTracker& tracker,
+                              std::mt19937& rng)
 {
     const auto* a = node.attributes_as_RMSNormBackwardAttributes();
     if(a == nullptr)
@@ -300,8 +301,8 @@ inline SynthesisResult fillRmsnormBackwardInputs(
 // ── Resample ──────────────────────────────────────────────────────────────────
 
 inline SynthesisResult fillResampleFwdInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                         SynthesisTracker& tracker,
-                                         std::mt19937& rng)
+                                             SynthesisTracker& tracker,
+                                             std::mt19937& rng)
 {
     const auto* a = node.attributes_as_ResampleFwdAttributes();
     if(a == nullptr)
@@ -316,10 +317,10 @@ inline SynthesisResult fillResampleFwdInputs(const hipdnn_flatbuffers_sdk::data_
 
 // Scale tensor holds per-block quantization factors that must match the
 // quantized data — random scales would produce garbage dequantized values.
-inline SynthesisResult fillBlockScaleDequantizeInputs(
-    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    SynthesisTracker& tracker,
-    std::mt19937& rng)
+inline SynthesisResult
+    fillBlockScaleDequantizeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                   SynthesisTracker& tracker,
+                                   std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BlockScaleDequantizeAttributes();
     if(a == nullptr)
@@ -331,10 +332,10 @@ inline SynthesisResult fillBlockScaleDequantizeInputs(
     return SynthesisResult::ok();
 }
 
-inline SynthesisResult fillBlockScaleQuantizeInputs(
-    const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-    SynthesisTracker& tracker,
-    std::mt19937& rng)
+inline SynthesisResult
+    fillBlockScaleQuantizeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                 SynthesisTracker& tracker,
+                                 std::mt19937& rng)
 {
     const auto* a = node.attributes_as_BlockScaleQuantizeAttributes();
     if(a == nullptr)
@@ -361,8 +362,8 @@ inline SynthesisResult fillBlockScaleQuantizeInputs(
 // sparse attention patterns, and dropout seed/offset must match between fwd and
 // bwd. Most of these are optional — absent ones (uid 0) are silently ignored.
 inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                         SynthesisTracker& tracker,
-                                         std::mt19937& rng)
+                                             SynthesisTracker& tracker,
+                                             std::mt19937& rng)
 {
     const auto* a = node.attributes_as_SdpaAttributes();
     if(a == nullptr)
@@ -401,9 +402,10 @@ inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_
 // correct gradients. In a fused forward+backward graph these are virtual
 // inter-node tensors (not owned, so silently skipped). A standalone backward
 // without a forward is refused.
-inline SynthesisResult fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                          SynthesisTracker& tracker,
-                                          std::mt19937& rng)
+inline SynthesisResult
+    fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                           SynthesisTracker& tracker,
+                           std::mt19937& rng)
 {
     const auto* a = node.attributes_as_SdpaBackwardAttributes();
     if(a == nullptr)
@@ -439,8 +441,8 @@ inline SynthesisResult fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data
 // a diagnostic when the op is unrecognized or an input can't be synthesized.
 
 inline SynthesisResult synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
-                                        SynthesisTracker& tracker,
-                                        std::mt19937& rng)
+                                            SynthesisTracker& tracker,
+                                            std::mt19937& rng)
 {
     using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
 
diff --git a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
index 020990dc3375..36b4a1772ad8 100644
--- a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
+++ b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
@@ -353,7 +353,7 @@ TEST_F(TestBundleDiscoveryFixture, LoadGraphOnlyBundleMissingMetadataLoads)
     ASSERT_TRUE(std::holds_alternative<IntegrationTestBundle>(result));
     const auto& bundle = std::get<IntegrationTestBundle>(result);
 
-    EXPECT_FALSE(bundle.tensors.has_value());      // graph-only: no tensor data
+    EXPECT_FALSE(bundle.tensors.has_value()); // graph-only: no tensor data
     EXPECT_FALSE(bundle.hasGoldenOutputs);
     EXPECT_FALSE(bundle.metadata.operation.has_value()); // default-constructed
 }
diff --git a/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp b/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
index f22834177792..feba60a0769c 100644
--- a/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
+++ b/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
@@ -22,7 +22,7 @@ using namespace hipdnn_integration_tests::golden;
 namespace
 {
 
-const std::vector<int64_t> kDims    = {2, 3};
+const std::vector<int64_t> kDims = {2, 3};
 const std::vector<int64_t> kStrides = {3, 1};
 
 InputTensorMap makeTensors(const std::vector<int64_t>& uids)
@@ -75,10 +75,11 @@ GraphResult buildConvFwdGraph()
     auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 3);
 
     std::vector<flatbuffers::Offset<Node>> nodes;
-    nodes.push_back(
-        CreateNodeDirect(b, "conv", DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "conv", DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
 
-    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
     b.Finish(graph);
 
     r.graph = GetGraph(b.GetBufferPointer());
@@ -94,23 +95,35 @@ GraphResult buildConvBiasGraph()
     auto& b = r.builder;
 
     std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
-    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x",        DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w",        DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 10, "conv_y",  DataType::FLOAT, &kStrides, &kDims, true));
-    tensors.push_back(CreateTensorAttributesDirect(b, 4, "bias",     DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 5, "out",      DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 10, "conv_y", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 4, "bias", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 5, "out", DataType::FLOAT, &kStrides, &kDims));
 
     auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 10);
-    auto add  = CreatePointwiseAttributes(b, PointwiseMode::ADD,
-                                          flatbuffers::nullopt, flatbuffers::nullopt,
-                                          flatbuffers::nullopt, flatbuffers::nullopt,
-                                          10, 4, flatbuffers::nullopt, 5);
+    auto add = CreatePointwiseAttributes(b,
+                                         PointwiseMode::ADD,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         10,
+                                         4,
+                                         flatbuffers::nullopt,
+                                         5);
 
     std::vector<flatbuffers::Offset<Node>> nodes;
-    nodes.push_back(CreateNodeDirect(b, "conv",     DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
-    nodes.push_back(CreateNodeDirect(b, "bias_add", DataType::FLOAT, NodeAttributes::PointwiseAttributes, add.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "conv", DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "bias_add", DataType::FLOAT, NodeAttributes::PointwiseAttributes, add.Union()));
 
-    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
     b.Finish(graph);
 
     r.graph = GetGraph(b.GetBufferPointer());
@@ -126,29 +139,49 @@ GraphResult buildConvBiasReluGraph()
     auto& b = r.builder;
 
     std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
-    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x",          DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w",          DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 10, "conv_y",    DataType::FLOAT, &kStrides, &kDims, true));
-    tensors.push_back(CreateTensorAttributesDirect(b, 4, "bias",       DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 11, "bias_out",  DataType::FLOAT, &kStrides, &kDims, true));
-    tensors.push_back(CreateTensorAttributesDirect(b, 6, "out",        DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 10, "conv_y", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 4, "bias", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 11, "bias_out", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 6, "out", DataType::FLOAT, &kStrides, &kDims));
 
     auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 10);
-    auto add  = CreatePointwiseAttributes(b, PointwiseMode::ADD,
-                                          flatbuffers::nullopt, flatbuffers::nullopt,
-                                          flatbuffers::nullopt, flatbuffers::nullopt,
-                                          10, 4, flatbuffers::nullopt, 11);
-    auto relu = CreatePointwiseAttributes(b, PointwiseMode::RELU_FWD,
-                                          flatbuffers::nullopt, flatbuffers::nullopt,
-                                          flatbuffers::nullopt, flatbuffers::nullopt,
-                                          11, flatbuffers::nullopt, flatbuffers::nullopt, 6);
+    auto add = CreatePointwiseAttributes(b,
+                                         PointwiseMode::ADD,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         10,
+                                         4,
+                                         flatbuffers::nullopt,
+                                         11);
+    auto relu = CreatePointwiseAttributes(b,
+                                          PointwiseMode::RELU_FWD,
+                                          flatbuffers::nullopt,
+                                          flatbuffers::nullopt,
+                                          flatbuffers::nullopt,
+                                          flatbuffers::nullopt,
+                                          11,
+                                          flatbuffers::nullopt,
+                                          flatbuffers::nullopt,
+                                          6);
 
     std::vector<flatbuffers::Offset<Node>> nodes;
-    nodes.push_back(CreateNodeDirect(b, "conv",     DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
-    nodes.push_back(CreateNodeDirect(b, "bias_add", DataType::FLOAT, NodeAttributes::PointwiseAttributes, add.Union()));
-    nodes.push_back(CreateNodeDirect(b, "relu",     DataType::FLOAT, NodeAttributes::PointwiseAttributes, relu.Union()));
-
-    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    nodes.push_back(CreateNodeDirect(
+        b, "conv", DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "bias_add", DataType::FLOAT, NodeAttributes::PointwiseAttributes, add.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "relu", DataType::FLOAT, NodeAttributes::PointwiseAttributes, relu.Union()));
+
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
     b.Finish(graph);
 
     r.graph = GetGraph(b.GetBufferPointer());
@@ -171,9 +204,11 @@ GraphResult buildSdpaFwdGraph()
     auto sdpa = CreateSdpaAttributes(b, 1, 2, 3, 4);
 
     std::vector<flatbuffers::Offset<Node>> nodes;
-    nodes.push_back(CreateNodeDirect(b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, sdpa.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, sdpa.Union()));
 
-    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
     b.Finish(graph);
 
     r.graph = GetGraph(b.GetBufferPointer());
@@ -188,21 +223,28 @@ GraphResult buildSdpaFwdWithStructuredGraph()
     auto& b = r.builder;
 
     std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
-    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q",         DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k",         DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v",         DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o",         DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 5, "seq_len_q", DataType::FLOAT, &kStrides, &kDims));
-
-    auto sdpa = CreateSdpaAttributes(b, 1, 2, 3, 4,
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 5, "seq_len_q", DataType::FLOAT, &kStrides, &kDims));
+
+    auto sdpa = CreateSdpaAttributes(b,
+                                     1,
+                                     2,
+                                     3,
+                                     4,
                                      flatbuffers::nullopt, // attn_mask
                                      flatbuffers::nullopt, // scale
-                                     5);                   // seq_len_q
+                                     5); // seq_len_q
 
     std::vector<flatbuffers::Offset<Node>> nodes;
-    nodes.push_back(CreateNodeDirect(b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, sdpa.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, sdpa.Union()));
 
-    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
     b.Finish(graph);
 
     r.graph = GetGraph(b.GetBufferPointer());
@@ -218,22 +260,25 @@ GraphResult buildSdpaBwdStandaloneGraph()
     auto& b = r.builder;
 
     std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
-    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q",     DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k",     DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v",     DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o",     DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 5, "do",    DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 6, "stats", DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 7, "dq",    DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 8, "dk",    DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 9, "dv",    DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 5, "do", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 6, "stats", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 7, "dq", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 8, "dk", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 9, "dv", DataType::FLOAT, &kStrides, &kDims));
 
     auto bwd = CreateSdpaBackwardAttributes(b, 1, 2, 3, 4, 5, 6, 7, 8, 9);
 
     std::vector<flatbuffers::Offset<Node>> nodes;
-    nodes.push_back(CreateNodeDirect(b, "sdpa_bwd", DataType::FLOAT, NodeAttributes::SdpaBackwardAttributes, bwd.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_bwd", DataType::FLOAT, NodeAttributes::SdpaBackwardAttributes, bwd.Union()));
 
-    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
     b.Finish(graph);
 
     r.graph = GetGraph(b.GetBufferPointer());
@@ -251,35 +296,53 @@ GraphResult buildSdpaFwdBwdFusedGraph()
     auto& b = r.builder;
 
     std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
-    tensors.push_back(CreateTensorAttributesDirect(b, 1,  "q",     DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 2,  "k",     DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 3,  "v",     DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 10, "o",     DataType::FLOAT, &kStrides, &kDims, true));
-    tensors.push_back(CreateTensorAttributesDirect(b, 11, "stats", DataType::FLOAT, &kStrides, &kDims, true));
-    tensors.push_back(CreateTensorAttributesDirect(b, 5,  "do",    DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 7,  "dq",    DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 8,  "dk",    DataType::FLOAT, &kStrides, &kDims));
-    tensors.push_back(CreateTensorAttributesDirect(b, 9,  "dv",    DataType::FLOAT, &kStrides, &kDims));
-
-    auto fwd = CreateSdpaAttributes(b, 1, 2, 3, 10,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
-                                    flatbuffers::nullopt, flatbuffers::nullopt,
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 10, "o", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 11, "stats", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(CreateTensorAttributesDirect(b, 5, "do", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 7, "dq", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 8, "dk", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 9, "dv", DataType::FLOAT, &kStrides, &kDims));
+
+    auto fwd = CreateSdpaAttributes(b,
+                                    1,
+                                    2,
+                                    3,
+                                    10,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
                                     11); // stats_tensor_uid
 
     auto bwd = CreateSdpaBackwardAttributes(b, 1, 2, 3, 10, 5, 11, 7, 8, 9);
 
     std::vector<flatbuffers::Offset<Node>> nodes;
-    nodes.push_back(CreateNodeDirect(b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, fwd.Union()));
-    nodes.push_back(CreateNodeDirect(b, "sdpa_bwd", DataType::FLOAT, NodeAttributes::SdpaBackwardAttributes, bwd.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, fwd.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_bwd", DataType::FLOAT, NodeAttributes::SdpaBackwardAttributes, bwd.Union()));
 
-    auto graph = CreateGraphDirect(b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
     b.Finish(graph);
 
     r.graph = GetGraph(b.GetBufferPointer());
@@ -289,13 +352,14 @@ GraphResult buildSdpaFwdBwdFusedGraph()
 SynthesisResult runSynthesis(const GraphResult& gr, const std::set<int64_t>& outputUids)
 {
     const auto leafUids = gr.leafInputUids(outputUids);
-    auto inputs         = makeTensors(leafUids);
+    auto inputs = makeTensors(leafUids);
     std::mt19937 rng(42);
 
     SynthesisTracker tracker(leafUids, inputs);
     for(uint32_t i = 0; i < gr.graph->nodes()->size(); ++i)
     {
-        const SynthesisResult nodeResult = synthesizeNodeInputs(*gr.graph->nodes()->Get(i), tracker, rng);
+        const SynthesisResult nodeResult
+            = synthesizeNodeInputs(*gr.graph->nodes()->Get(i), tracker, rng);
         if(!nodeResult.filled)
         {
             return nodeResult;
@@ -310,7 +374,7 @@ SynthesisResult runSynthesis(const GraphResult& gr, const std::set<int64_t>& out
 
 TEST(TestSynthesizeInputs, SingleConvFwd)
 {
-    const auto gr     = buildConvFwdGraph();
+    const auto gr = buildConvFwdGraph();
     const auto result = runSynthesis(gr, {3});
 
     EXPECT_TRUE(result.filled) << result.reason;
@@ -318,7 +382,7 @@ TEST(TestSynthesizeInputs, SingleConvFwd)
 
 TEST(TestSynthesizeInputs, ConvPlusBiasFused)
 {
-    const auto gr     = buildConvBiasGraph();
+    const auto gr = buildConvBiasGraph();
     const auto result = runSynthesis(gr, {5});
 
     EXPECT_TRUE(result.filled) << result.reason;
@@ -326,7 +390,7 @@ TEST(TestSynthesizeInputs, ConvPlusBiasFused)
 
 TEST(TestSynthesizeInputs, ConvPlusBiasPlusReluFused)
 {
-    const auto gr     = buildConvBiasReluGraph();
+    const auto gr = buildConvBiasReluGraph();
     const auto result = runSynthesis(gr, {6});
 
     EXPECT_TRUE(result.filled) << result.reason;
@@ -334,7 +398,7 @@ TEST(TestSynthesizeInputs, ConvPlusBiasPlusReluFused)
 
 TEST(TestSynthesizeInputs, SdpaFwdNoStructuredOptionals)
 {
-    const auto gr     = buildSdpaFwdGraph();
+    const auto gr = buildSdpaFwdGraph();
     const auto result = runSynthesis(gr, {4});
 
     EXPECT_TRUE(result.filled) << result.reason;
@@ -342,7 +406,7 @@ TEST(TestSynthesizeInputs, SdpaFwdNoStructuredOptionals)
 
 TEST(TestSynthesizeInputs, SdpaFwdWithStructuredInputRefuses)
 {
-    const auto gr     = buildSdpaFwdWithStructuredGraph();
+    const auto gr = buildSdpaFwdWithStructuredGraph();
     const auto result = runSynthesis(gr, {4});
 
     EXPECT_FALSE(result.filled);
@@ -352,7 +416,7 @@ TEST(TestSynthesizeInputs, SdpaFwdWithStructuredInputRefuses)
 
 TEST(TestSynthesizeInputs, SdpaBwdStandaloneRefusesDerived)
 {
-    const auto gr     = buildSdpaBwdStandaloneGraph();
+    const auto gr = buildSdpaBwdStandaloneGraph();
     const auto result = runSynthesis(gr, {7, 8, 9});
 
     EXPECT_FALSE(result.filled);
@@ -361,7 +425,7 @@ TEST(TestSynthesizeInputs, SdpaBwdStandaloneRefusesDerived)
 
 TEST(TestSynthesizeInputs, SdpaFwdBwdFusedSucceeds)
 {
-    const auto gr     = buildSdpaFwdBwdFusedGraph();
+    const auto gr = buildSdpaFwdBwdFusedGraph();
     const auto result = runSynthesis(gr, {7, 8, 9});
 
     EXPECT_TRUE(result.filled) << result.reason;

From 476d4a0d847a05c673ef4c321cc90c305bb74552 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 14:10:08 -0400
Subject: [PATCH 13/18] Allow --gd to point directly at a bundle folder

deriveTestName() threw when a graph .json sat at the data root with no
folder above it. That blocked the natural "--gd .../graph_only_bundle"
invocation, where the user points the data root straight at a single
bundle's folder. Use the root folder's own name as the suite name in
that case instead of throwing, so the bundle is discovered as
{folder}.{stem}.

Update the unit test (JsonAtRootThrows -> JsonAtRootUsesFolderNameAsSuite)
to assert the new behavior.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../src/harness/golden/BundleDiscovery.hpp               | 8 +++++---
 .../integration-tests/tests/TestBundleDiscovery.cpp      | 9 ++++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp b/dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp
index 56ea6dc254ff..e2743934e540 100644
--- a/dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp
@@ -191,9 +191,11 @@ inline DerivedTestName deriveTestName(const std::filesystem::path& jsonPath,
 
     if(relativeDir.empty())
     {
-        throw std::runtime_error(
-            "Bundle .json must live in a sub-folder of the data root, not at the root itself: "
-            + jsonPath.string() + "; expected {folder}/{file}.json");
+        // --gd points directly at a bundle folder (the .json is at the root).
+        // Use the folder name as the suite so "--gd .../graph_only_bundle" works.
+        const std::string suite = sanitizeForGtest(bundleDir.filename().string());
+        const std::string test = sanitizeForGtest(jsonPath.stem().string());
+        return {suite, test};
     }
 
     std::string suite;
diff --git a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
index 36b4a1772ad8..6128f4f0ebc2 100644
--- a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
+++ b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
@@ -144,11 +144,14 @@ TEST_F(TestBundleDiscoveryFixture, TieredGoldenDataLayoutIsDiscovered)
     EXPECT_EQ(result.front().testName, "Small");
 }
 
-TEST_F(TestBundleDiscoveryFixture, JsonAtRootThrows)
+TEST_F(TestBundleDiscoveryFixture, JsonAtRootUsesFolderNameAsSuite)
 {
-    // A .json directly at the data root has no folder to form a suite -> throw.
+    // A .json directly at the data root uses the root folder name as suite.
     std::ofstream(_tempDir / "graph.json") << R"({"tensors": []})";
-    EXPECT_THROW(discoverBundles(_tempDir), std::runtime_error);
+    auto result = discoverBundles(_tempDir);
+    ASSERT_EQ(result.size(), 1u);
+    EXPECT_EQ(result[0].suiteName, sanitizeForGtest(_tempDir.filename().string()));
+    EXPECT_EQ(result[0].testName, "graph");
 }
 
 TEST_F(TestBundleDiscoveryFixture, EmptyLeafFolderWarnsAndSkips)

From da946304a828ea8fc1cc2296ecafe0177b65876c Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 16:17:32 -0400
Subject: [PATCH 14/18] Wire golden runner into TOML tolerance override +
 test_skips chain

Extract shared TomlGuards.hpp and wire all three harnesses through it.
Fix toleranceForNodeAttributes missing SdpaAttributes/SdpaBackwardAttributes.
Add SynthesisTracker::fillComputed + tensorAt for follow-up init unification.
Rename appendTensorDiff/appendFpDiff to writeTensorDiffReport/writeFpDiffReport.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/harness/GoldenReferenceCpu.hpp        |   6 +
 .../IntegrationGraphVerificationHarness.hpp   |  30 +-
 .../src/harness/TomlGuards.hpp                |  63 ++++
 ...raphGoldenReferenceVerificationHarness.cpp |  40 ++-
 ...raphGoldenReferenceVerificationHarness.hpp | 108 ++++---
 .../golden/input_init/SynthesisTracker.hpp    |  42 ++-
 .../integration-tests/tests/CMakeLists.txt    |   1 +
 .../tests/TestGoldenToleranceAndSkips.cpp     | 293 ++++++++++++++++++
 .../tests/TestSynthesisTracker.cpp            |  44 +++
 9 files changed, 553 insertions(+), 74 deletions(-)
 create mode 100644 dnn-providers/integration-tests/src/harness/TomlGuards.hpp
 create mode 100644 dnn-providers/integration-tests/tests/TestGoldenToleranceAndSkips.cpp

diff --git a/dnn-providers/integration-tests/src/harness/GoldenReferenceCpu.hpp b/dnn-providers/integration-tests/src/harness/GoldenReferenceCpu.hpp
index c36adc19fb62..1f21a1978487 100644
--- a/dnn-providers/integration-tests/src/harness/GoldenReferenceCpu.hpp
+++ b/dnn-providers/integration-tests/src/harness/GoldenReferenceCpu.hpp
@@ -17,6 +17,8 @@
 #include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
 #include <hipdnn_test_sdk/utilities/cpu_graph_executor/CpuReferenceGraphExecutor.hpp>
 
+#include "harness/TomlGuards.hpp"
+
 namespace hipdnn_integration_tests
 {
 
@@ -47,12 +49,16 @@ class TestGoldenReferenceCpu : public ::testing::TestWithParam<std::filesystem::
 
         _graphAndTensors = hipdnn_test_sdk::utilities::loadGraphAndTensors(path);
         _referenceOutputTensors = _graphAndTensors.extractAndClearOutputTensorData();
+
+        skipIfTomlMatched(currentTestName());
     }
 
     void goldenReferenceTestSuite(float absoluteTolerance, float relativeTolerance)
     {
         SKIP_IF_WINDOWS();
 
+        applyTomlToleranceOverride(currentTestName(), absoluteTolerance, relativeTolerance);
+
         auto tensorMap = _graphAndTensors.hostBufferMap();
 
         hipdnn_test_sdk::utilities::CpuReferenceGraphExecutor().execute(
diff --git a/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
index d884b3e2f50e..d5a49cd23fab 100644
--- a/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
@@ -29,6 +29,7 @@
 #include "harness/SharedHandle.hpp"
 #include "harness/SupportMatrixCollector.hpp"
 #include "harness/TestConfig.hpp"
+#include "harness/TomlGuards.hpp"
 
 namespace hipdnn_integration_tests
 {
@@ -57,16 +58,7 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
         ASSERT_EQ(hipInit(0), hipSuccess);
         ASSERT_EQ(hipGetDevice(&_deviceId), hipSuccess);
 
-        // Check for any engine specific test skips
-        if(auto* info = ::testing::UnitTest::GetInstance()->current_test_info(); info != nullptr)
-        {
-            const std::string testName = std::string(info->test_suite_name()) + "." + info->name();
-            if(auto skipReason = TestConfig::get().findSkipForTest(testName))
-            {
-                GTEST_SKIP() << "[arch " << TestConfig::get().getCurrentArch() << "] "
-                             << *skipReason;
-            }
-        }
+        skipIfTomlMatched(currentTestName());
     }
 
     void setTestCaseNote(std::string note)
@@ -253,25 +245,9 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
                            float absoluteTolerance,
                            float relativeTolerance)
     {
-        // Check for per-test tolerance override from TOML config
         float finalAtol = absoluteTolerance;
         float finalRtol = relativeTolerance;
-
-        auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
-        if(testInfo != nullptr)
-        {
-            std::string testName
-                = std::string(testInfo->test_suite_name()) + "." + testInfo->name();
-            auto override = TestConfig::get().findToleranceOverride(testName);
-            if(override.has_value())
-            {
-                finalAtol = override->atol;
-                finalRtol = override->rtol;
-                HIPDNN_PLUGIN_LOG_INFO("Tolerance override applied for " << testName
-                                                                         << ": atol=" << finalAtol
-                                                                         << " rtol=" << finalRtol);
-            }
-        }
+        applyTomlToleranceOverride(currentTestName(), finalAtol, finalRtol);
 
         // Since the graph can infer properties + Ids, we defer validator registration until right
         // before validation in verifyGraph
diff --git a/dnn-providers/integration-tests/src/harness/TomlGuards.hpp b/dnn-providers/integration-tests/src/harness/TomlGuards.hpp
new file mode 100644
index 000000000000..ccc098e663a8
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/TomlGuards.hpp
@@ -0,0 +1,63 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <optional>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <hipdnn_plugin_sdk/PluginLogging.hpp>
+
+#include "harness/TestConfig.hpp"
+
+namespace hipdnn_integration_tests
+{
+
+inline std::string currentTestName()
+{
+    auto* info = ::testing::UnitTest::GetInstance()->current_test_info();
+    if(info == nullptr)
+    {
+        return {};
+    }
+    return std::string(info->test_suite_name()) + "." + info->name();
+}
+
+inline bool applyTomlToleranceOverride(const std::string& testName, float& atol, float& rtol)
+{
+    if(testName.empty())
+    {
+        return false;
+    }
+    auto ovr = TestConfig::get().findToleranceOverride(testName);
+    if(!ovr)
+    {
+        return false;
+    }
+    atol = ovr->atol;
+    rtol = ovr->rtol;
+    HIPDNN_PLUGIN_LOG_INFO("Tolerance override applied for " << testName << ": atol=" << atol
+                                                             << " rtol=" << rtol);
+    return true;
+}
+
+inline std::optional<std::string> checkTomlSkip(const std::string& testName)
+{
+    if(testName.empty())
+    {
+        return std::nullopt;
+    }
+    return TestConfig::get().findSkipForTest(testName);
+}
+
+inline void skipIfTomlMatched(const std::string& testName)
+{
+    if(auto reason = checkTomlSkip(testName))
+    {
+        GTEST_SKIP() << "[arch " << TestConfig::get().getCurrentArch() << "] " << *reason;
+    }
+}
+
+} // namespace hipdnn_integration_tests
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
index d38ed8340580..7573827a040f 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -23,6 +23,7 @@
 #include "harness/ReferenceCapabilityError.hpp"
 #include "harness/SharedHandle.hpp"
 #include "harness/TestConfig.hpp"
+#include "harness/TomlGuards.hpp"
 #include "harness/golden/UnverifiableBundleReport.hpp"
 #include "harness/golden/input_init/SynthesizeInputs.hpp"
 #include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
@@ -505,6 +506,15 @@ void IntegrationGraphGoldenReferenceVerificationHarness::compareEach(OutputTenso
         float rtol = 0.0f;
         resolveTolerances(wrapper, dataType, atol, rtol);
 
+        const auto testName = currentTestName();
+        if(auto ovr = lookupToleranceOverride(testName))
+        {
+            atol = ovr->atol;
+            rtol = ovr->rtol;
+            HIPDNN_PLUGIN_LOG_INFO("Tolerance override applied for "
+                                   << testName << ": atol=" << atol << " rtol=" << rtol);
+        }
+
         compareOutputTensor(uid, *attrs, dataType, expectedTensor, actualTensor, atol, rtol);
     }
 }
@@ -547,12 +557,12 @@ void IntegrationGraphGoldenReferenceVerificationHarness::compareOutputTensor(
     {
         std::ostringstream report;
         report << reportHeader(uid, attrs, dataType, expected, atol, rtol);
-        appendTensorDiff(report, uid, attrs, dataType, expected, actual, atol, rtol);
+        writeTensorDiffReport(report, uid, attrs, dataType, expected, actual, atol, rtol);
         EXPECT_TRUE(false) << report.str();
     }
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::appendTensorDiff(
+void IntegrationGraphGoldenReferenceVerificationHarness::writeTensorDiffReport(
     std::ostream& os,
     int64_t uid,
     const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
@@ -569,16 +579,16 @@ void IntegrationGraphGoldenReferenceVerificationHarness::appendTensorDiff(
     switch(dataType)
     {
     case DT::FLOAT:
-        appendFpDiff<float>(os, uid, attrs, expected, actual, atol, rtol);
+        writeFpDiffReport<float>(os, uid, attrs, expected, actual, atol, rtol);
         return;
     case DT::HALF:
-        appendFpDiff<half>(os, uid, attrs, expected, actual, atol, rtol);
+        writeFpDiffReport<half>(os, uid, attrs, expected, actual, atol, rtol);
         return;
     case DT::BFLOAT16:
-        appendFpDiff<bfloat16>(os, uid, attrs, expected, actual, atol, rtol);
+        writeFpDiffReport<bfloat16>(os, uid, attrs, expected, actual, atol, rtol);
         return;
     case DT::DOUBLE:
-        appendFpDiff<double>(os, uid, attrs, expected, actual, atol, rtol);
+        writeFpDiffReport<double>(os, uid, attrs, expected, actual, atol, rtol);
         return;
     default:
         os << "  (no element-wise diff available for this data type)\n";
@@ -586,7 +596,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::appendTensorDiff(
 }
 
 template <typename T>
-void IntegrationGraphGoldenReferenceVerificationHarness::appendFpDiff(
+void IntegrationGraphGoldenReferenceVerificationHarness::writeFpDiffReport(
     std::ostream& os,
     int64_t uid,
     const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
@@ -675,6 +685,9 @@ float IntegrationGraphGoldenReferenceVerificationHarness::toleranceForNodeAttrib
         return tol::pointwise::getTolerance<T>();
     case NA::LayernormAttributes:
         return tol::layernorm::getTolerance<T>();
+    case NA::SdpaAttributes:
+    case NA::SdpaBackwardAttributes:
+        return tol::sdpa::getToleranceFwd<T>();
     default:
         return 1e-3f;
     }
@@ -735,4 +748,17 @@ void IntegrationGraphGoldenReferenceVerificationHarness::applyMetadataGuards() c
     }
 }
 
+std::optional<std::string> IntegrationGraphGoldenReferenceVerificationHarness::lookupSkip(
+    const std::string& testName) const
+{
+    return TestConfig::get().findSkipForTest(testName);
+}
+
+std::optional<ToleranceOverride>
+    IntegrationGraphGoldenReferenceVerificationHarness::lookupToleranceOverride(
+        const std::string& testName) const
+{
+    return TestConfig::get().findToleranceOverride(testName);
+}
+
 } // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
index bfa45b2fed09..345a7dfe8233 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
@@ -21,6 +21,7 @@
 
 #include "harness/IReferenceGraphExecutor.hpp"
 #include "harness/TestConfig.hpp"
+#include "harness/TomlGuards.hpp"
 #include "harness/golden/IntegrationTestBundle.hpp"
 
 namespace hipdnn_integration_tests::golden
@@ -53,6 +54,16 @@ using OutputTensors
 //     are mark*Modified().
 //   * Virtual (inter-node) tensors are allocated internally by each executor; the
 //     variant packs we build carry only real (input + output) tensors.
+//
+// TODO(ALMIOPEN-1969 follow-up): Unify graph-init with the non-golden harness.
+//   Stage 1 — Route non-golden ops whose initializeBundle() is plain randomize
+//             (conv, matmul, BN-inference, reduction, rmsnorm-fwd, layernorm,
+//             pointwise) through the synthesis switch. Zero behavioral change.
+//   Stage 2 — Migrate structured recipes one op at a time: copy the exact
+//             ranges/seeds/derivation from each non-golden subclass override
+//             into the corresponding fill function, using fillComputed/tensorAt
+//             for derived inputs. Delete each override once its fill fn works.
+//   Stage 3 — Both harnesses share one init pipeline via SynthesisTracker.
 class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Test
 {
 public:
@@ -81,6 +92,14 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
             GTEST_SKIP() << "No bundle set";
         }
 
+        {
+            const auto testName = currentTestName();
+            if(auto reason = lookupSkip(testName))
+            {
+                GTEST_SKIP() << "[arch " << TestConfig::get().getCurrentArch() << "] " << *reason;
+            }
+        }
+
         applyMetadataGuards();
     }
 
@@ -113,6 +132,15 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
     // TestConfig singleton, which is only initialized by the real test main.
     virtual void applyMetadataGuards() const;
 
+    // TOML-driven skip check. Default reads TestConfig::get().findSkipForTest().
+    // Override in tests to avoid the singleton.
+    virtual std::optional<std::string> lookupSkip(const std::string& testName) const;
+
+    // TOML-driven tolerance override. Default reads TestConfig::get().findToleranceOverride().
+    // Override in tests to inject controlled values.
+    virtual std::optional<ToleranceOverride>
+        lookupToleranceOverride(const std::string& testName) const;
+
 private:
     bool _requiresDevice;
     std::filesystem::path _bundlePath;
@@ -187,6 +215,32 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
     void markOutputsModified(OutputTensors& outputs) const;
     static void markOutputsModifiedFor(OutputTensors& outputs, bool device);
 
+    // ── tolerances ──────────────────────────────────────────────────────
+    // Two-level lookup: per-operation default from TestTolerances.hpp,
+    // then TOML per-engine override (if a [[tolerance_overrides]] filter
+    // matches the current gtest name).
+    //
+    //   resolveTolerances        entry point — sets atol/rtol for one output tensor
+    //     deriveDefaultTolerance  max tolerance across all graph nodes (priority 2)
+    //       toleranceForDataType    dispatches on DataType → typed template
+    //         toleranceForNodeAttributes<T>  maps NodeAttributes → TestTolerances.hpp
+    static void
+        resolveTolerances(const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
+                          hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+                          float& atol,
+                          float& rtol);
+
+    static float deriveDefaultTolerance(
+        const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
+        hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
+
+    static float toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
+                                      hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
+
+    template <typename T>
+    static float
+        toleranceForNodeAttributes(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType);
+
     // ── comparison ──────────────────────────────────────────────────────
     void compareAgainstGolden(OutputTensors& engineOutputs);
     void compareOutputs(OutputTensors& engineOutputs, OutputTensors& expected);
@@ -202,31 +256,7 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
                              float atol,
                              float rtol) const;
 
-    static void
-        appendTensorDiff(std::ostream& os,
-                         int64_t uid,
-                         const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
-                         hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
-                         hipdnn_data_sdk::utilities::ITensor& expected,
-                         hipdnn_data_sdk::utilities::ITensor& actual,
-                         float atol,
-                         float rtol);
-
-    template <typename T>
-    static void appendFpDiff(std::ostream& os,
-                             int64_t uid,
-                             const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
-                             hipdnn_data_sdk::utilities::ITensor& expected,
-                             hipdnn_data_sdk::utilities::ITensor& actual,
-                             float atol,
-                             float rtol);
-
     // ── reporting ───────────────────────────────────────────────────────
-    // Records the bundle path + reason in the process-wide
-    // UnverifiableBundleReport (printed as a summary after all tests),
-    // then GTEST_SKIP()s this test. The reason is a flat human-readable
-    // string — per-tensor details are concatenated into it by the caller
-    // (e.g., tracker.finish()), not stored as structured data.
     void skipUnverifiable(const std::string& reason);
     void recordRefError(const std::string& reason);
     static std::string refLabel(ReferenceExecutorType type);
@@ -243,23 +273,25 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
 
     static std::string dataTypeName(hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
 
-    // ── tolerances ──────────────────────────────────────────────────────
     static void
-        resolveTolerances(const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-                          hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
-                          float& atol,
-                          float& rtol);
+        writeTensorDiffReport(std::ostream& os,
+                              int64_t uid,
+                              const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+                              hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+                              hipdnn_data_sdk::utilities::ITensor& expected,
+                              hipdnn_data_sdk::utilities::ITensor& actual,
+                              float atol,
+                              float rtol);
 
     template <typename T>
-    static float
-        toleranceForNodeAttributes(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType);
-
-    static float deriveDefaultTolerance(
-        const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-        hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
-
-    static float toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
-                                      hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
+    static void
+        writeFpDiffReport(std::ostream& os,
+                          int64_t uid,
+                          const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+                          hipdnn_data_sdk::utilities::ITensor& expected,
+                          hipdnn_data_sdk::utilities::ITensor& actual,
+                          float atol,
+                          float rtol);
 };
 
 } // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
index 87dbe9fe8e4b..033877dbedb0 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
@@ -114,8 +114,16 @@ struct SynthesisResult
 //                generation. In a fused fwd+bwd graph the forward output flows
 //                to the backward input as a virtual tensor (not owned, silently
 //                skipped). In a standalone backward, the same tensor is a leaf
-//                input — markDerived records it, and finish() refuses because
-//                no forward pass produced it.
+//                input. Two ways to handle it:
+//                  * markDerived — record it and let finish() refuse (SKIP),
+//                    used when no recipe exists to produce a consistent value.
+//                  * fillComputed — the fill function runs the recipe itself
+//                    (e.g. a CPU forward pass to produce `o`/`stats` consistent
+//                    with the q/k/v it already filled FREE) and hands the
+//                    result to the tracker. This accounts for the input with
+//                    NO refusal, so finish() succeeds and the bundle runs.
+//                A recipe reads the already-filled FREE inputs via tensorAt()
+//                and writes the computed tensor back via fillComputed().
 //
 // finish() succeeds only when every owned leaf input was declared as some role
 // AND none were STRUCTURED or DERIVED. Undeclared inputs and refused inputs both
@@ -157,6 +165,36 @@ class SynthesisTracker
         _accounted.insert(uid);
     }
 
+    // Declares `uid` as DERIVED-and-produced — the fill function computed a
+    // consistent value itself (the recipe) and supplies it here. Copies the
+    // bytes into the leaf input and accounts for it with NO refusal, so a
+    // graph that would otherwise SKIP (markDerived) instead runs. `source`
+    // must have the same dtype/shape as the leaf input at `uid`.
+    void fillComputed(int64_t uid, const hipdnn_data_sdk::utilities::ITensor& source)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        auto& dst = *_inputs.at(uid);
+        const auto* src = const_cast<hipdnn_data_sdk::utilities::ITensor&>(source).rawHostData();
+        dst.fillWithData(src, source.elementSpace() * source.elementSize());
+        _accounted.insert(uid);
+    }
+
+    // Read access to an already-filled leaf input, so a recipe can compute a
+    // derived input from inputs filled earlier in the same node (e.g. read
+    // q/k/v to produce o/stats). Returns nullptr if `uid` is not an owned leaf
+    // (virtual, output, or absent-optional uid=0).
+    hipdnn_data_sdk::utilities::ITensor* tensorAt(int64_t uid)
+    {
+        if(!isOwned(uid))
+        {
+            return nullptr;
+        }
+        return _inputs.at(uid).get();
+    }
+
     // Declares `uid` as STRUCTURED — accounts for it but records a refusal.
     void markStructured(int64_t uid, const char* role)
     {
diff --git a/dnn-providers/integration-tests/tests/CMakeLists.txt b/dnn-providers/integration-tests/tests/CMakeLists.txt
index 705b0bee96e7..9f013992bd63 100644
--- a/dnn-providers/integration-tests/tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/tests/CMakeLists.txt
@@ -22,6 +22,7 @@ add_executable(hipdnn_integration_tests_unit_tests
     TestSynthesisTracker.cpp
     TestSynthesizeInputs.cpp
     TestVerificationModePaths.cpp
+    TestGoldenToleranceAndSkips.cpp
 )
 
 target_include_directories(hipdnn_integration_tests_unit_tests
diff --git a/dnn-providers/integration-tests/tests/TestGoldenToleranceAndSkips.cpp b/dnn-providers/integration-tests/tests/TestGoldenToleranceAndSkips.cpp
new file mode 100644
index 000000000000..47a21f98551d
--- /dev/null
+++ b/dnn-providers/integration-tests/tests/TestGoldenToleranceAndSkips.cpp
@@ -0,0 +1,293 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+// Tests the TOML tolerance override and test_skips wiring in the golden harness:
+//
+//   1. lookupToleranceOverride replaces atol/rtol when matched
+//   2. Per-op default (priority 2) is used when override returns nullopt
+//   3. lookupSkip skips the test when matched
+//   4. Test runs normally when lookupSkip returns nullopt
+
+#include <gtest/gtest-spi.h>
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include <hipdnn_test_sdk/utilities/FileUtilities.hpp>
+
+#include "harness/TestConfig.hpp"
+#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
+#include "harness/golden/IntegrationTestBundle.hpp"
+
+// NOLINTBEGIN(readability-identifier-naming)
+
+using namespace hipdnn_integration_tests;
+using namespace hipdnn_integration_tests::golden;
+
+namespace
+{
+
+using EngineStub = std::function<void(std::unordered_map<int64_t, void*>&)>;
+
+class ToleranceTestableHarness : public IntegrationGraphGoldenReferenceVerificationHarness
+{
+public:
+    ToleranceTestableHarness(EngineStub engineStub,
+                             std::optional<ToleranceOverride> tolOverride,
+                             std::optional<std::string> skipReason)
+        : IntegrationGraphGoldenReferenceVerificationHarness(/*requiresDevice=*/false)
+        , _engineStub(std::move(engineStub))
+        , _tolOverride(std::move(tolOverride))
+        , _skipReason(std::move(skipReason))
+    {
+    }
+
+    using IntegrationGraphGoldenReferenceVerificationHarness::SetUp;
+    using IntegrationGraphGoldenReferenceVerificationHarness::TestBody;
+
+protected:
+    VerificationMode getVerificationMode() const override
+    {
+        return VerificationMode::GOLDEN;
+    }
+
+    void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack) override
+    {
+        _engineStub(variantPack);
+    }
+
+    void runReferenceExecutor(ReferenceExecutorType /*type*/,
+                              std::unordered_map<int64_t, void*>& /*variantPack*/) override
+    {
+    }
+
+    std::unique_ptr<IReferenceGraphExecutor>
+        makeReferenceExecutor(ReferenceExecutorType /*type*/) override
+    {
+        return nullptr;
+    }
+
+    void applyMetadataGuards() const override {}
+
+    std::optional<std::string> lookupSkip(const std::string& /*testName*/) const override
+    {
+        return _skipReason;
+    }
+
+    std::optional<ToleranceOverride>
+        lookupToleranceOverride(const std::string& /*testName*/) const override
+    {
+        return _tolOverride;
+    }
+
+private:
+    EngineStub _engineStub;
+    std::optional<ToleranceOverride> _tolOverride;
+    std::optional<std::string> _skipReason;
+};
+
+class TestGoldenToleranceAndSkips : public ::testing::Test
+{
+protected:
+    std::optional<hipdnn_test_sdk::utilities::ScopedDirectory> _scopedDir;
+    std::filesystem::path _tempDir;
+
+    static constexpr float K_OUTPUT_VALUE = 3.5f;
+    static constexpr int64_t K_OUTPUT_UID = 5;
+    static constexpr size_t K_OUTPUT_ELEMS = 120;
+
+    void SetUp() override
+    {
+        auto path
+            = std::filesystem::temp_directory_path()
+              / ("tol_skip_test_"
+                 + std::to_string(::testing::UnitTest::GetInstance()->current_test_info()->line()));
+        std::filesystem::remove_all(path);
+        _scopedDir.emplace(path);
+        _tempDir = _scopedDir->path();
+    }
+
+    static void writeBundleFiles(const std::filesystem::path& dir,
+                                 const std::string& name,
+                                 float goldenValue)
+    {
+        std::filesystem::create_directories(dir);
+        std::ofstream(dir / (name + ".json"))
+            << R"({"nodes": [{"inputs": {"x_tensor_uid": 0, "mean_tensor_uid": 1, )"
+               R"("inv_variance_tensor_uid": 2, "scale_tensor_uid": 3, "bias_tensor_uid": 4}, )"
+               R"("outputs": {"y_tensor_uid": 5}, "type": "BatchnormInferenceAttributes", )"
+               R"("compute_data_type": "float", "name": ""}], "tensors": [)"
+               R"({"name": "", "uid": 0, "strides": [60, 20, 5, 1], "dims": [2, 3, 4, 5], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 1, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 2, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 3, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 4, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 5, "strides": [60, 20, 5, 1], "dims": [2, 3, 4, 5], )"
+               R"("data_type": "float", "virtual": false}], "io_data_type": "float", )"
+               R"("compute_data_type": "float", "intermediate_data_type": "float", "name": ""})";
+
+        std::ofstream(dir / (name + ".meta.json"))
+            << R"({"format_version": 1, "operation": "BatchnormInference"})";
+
+        const auto basePath = (dir / name).string();
+        const auto writeFloatBin = [&](int64_t uid, size_t elems, float value) {
+            const std::vector<float> data(elems, value);
+            std::ofstream out(basePath + ".tensor" + std::to_string(uid) + ".bin",
+                              std::ios::binary);
+            out.write(reinterpret_cast<const char*>(data.data()),
+                      static_cast<std::streamsize>(data.size() * sizeof(float)));
+        };
+
+        writeFloatBin(0, 120, 0.0f);
+        writeFloatBin(1, 3, 0.0f);
+        writeFloatBin(2, 3, 0.0f);
+        writeFloatBin(3, 3, 0.0f);
+        writeFloatBin(4, 3, 0.0f);
+        writeFloatBin(K_OUTPUT_UID, K_OUTPUT_ELEMS, goldenValue);
+    }
+
+    std::shared_ptr<IntegrationTestBundle> loadBundle(const std::string& name,
+                                                      float goldenValue) const
+    {
+        const auto dir = _tempDir / name;
+        writeBundleFiles(dir, name, goldenValue);
+        auto result = loadIntegrationTestBundle(dir / (name + ".json"));
+        EXPECT_TRUE(std::holds_alternative<IntegrationTestBundle>(result));
+        return std::make_shared<IntegrationTestBundle>(
+            std::move(std::get<IntegrationTestBundle>(result)));
+    }
+
+    static void writeOutput(std::unordered_map<int64_t, void*>& variantPack, float value)
+    {
+        auto* ptr = static_cast<float*>(variantPack.at(K_OUTPUT_UID));
+        std::fill(ptr, ptr + K_OUTPUT_ELEMS, value);
+    }
+
+    static bool anyFailed(const ::testing::TestPartResultArray& results)
+    {
+        for(int i = 0; i < results.size(); ++i)
+        {
+            if(results.GetTestPartResult(i).failed())
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static bool anySkipped(const ::testing::TestPartResultArray& results)
+    {
+        for(int i = 0; i < results.size(); ++i)
+        {
+            if(results.GetTestPartResult(i).skipped())
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static void runCapturing(std::shared_ptr<IntegrationTestBundle> bundle,
+                             EngineStub engineStub,
+                             std::optional<ToleranceOverride> tolOverride,
+                             std::optional<std::string> skipReason,
+                             ::testing::TestPartResultArray* results)
+    {
+        ToleranceTestableHarness harness(
+            std::move(engineStub), std::move(tolOverride), std::move(skipReason));
+        harness.setBundle(std::move(bundle), "tol-skip-test-bundle");
+
+        const ::testing::ScopedFakeTestPartResultReporter reporter(
+            ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, results);
+        harness.SetUp();
+        if(!anySkipped(*results))
+        {
+            harness.TestBody();
+        }
+    }
+};
+
+// Engine writes output that differs from golden by 0.05.
+// BN inference fp32 default tolerance is 2e-4 — so this FAILS without override.
+// A TOML override with atol=0.1 makes it pass.
+TEST_F(TestGoldenToleranceAndSkips, ToleranceOverrideApplied)
+{
+    constexpr float goldenValue = 1.0f;
+    constexpr float engineValue = 1.05f;
+    auto bundle = loadBundle("tol_override", goldenValue);
+
+    ::testing::TestPartResultArray results;
+    runCapturing(
+        bundle,
+        [](auto& vp) { writeOutput(vp, engineValue); },
+        ToleranceOverride{0.1f, 0.1f},
+        std::nullopt,
+        &results);
+
+    EXPECT_FALSE(anyFailed(results)) << "Should pass with the relaxed TOML override tolerance";
+}
+
+// Same scenario but lookupToleranceOverride returns nullopt.
+// The per-op default (BN inference fp32 = 2e-4) is used, so the 0.05 diff FAILS.
+TEST_F(TestGoldenToleranceAndSkips, DefaultToleranceUsedWhenNoOverride)
+{
+    constexpr float goldenValue = 1.0f;
+    constexpr float engineValue = 1.05f;
+    auto bundle = loadBundle("tol_default", goldenValue);
+
+    ::testing::TestPartResultArray results;
+    runCapturing(
+        bundle,
+        [](auto& vp) { writeOutput(vp, engineValue); },
+        std::nullopt,
+        std::nullopt,
+        &results);
+
+    EXPECT_TRUE(anyFailed(results)) << "Should fail with the tight per-op default tolerance";
+}
+
+// lookupSkip returns a reason string — SetUp() should GTEST_SKIP.
+TEST_F(TestGoldenToleranceAndSkips, SkipApplied)
+{
+    auto bundle = loadBundle("skip_test", 1.0f);
+
+    ::testing::TestPartResultArray results;
+    runCapturing(
+        bundle,
+        [](auto& /*vp*/) {},
+        std::nullopt,
+        std::string("known failure on gfx1100"),
+        &results);
+
+    EXPECT_TRUE(anySkipped(results)) << "Test should be skipped when lookupSkip returns a reason";
+}
+
+// lookupSkip returns nullopt — test runs normally (and passes because engine matches golden).
+TEST_F(TestGoldenToleranceAndSkips, NoSkipRunsNormally)
+{
+    constexpr float value = 1.0f;
+    auto bundle = loadBundle("no_skip", value);
+
+    ::testing::TestPartResultArray results;
+    runCapturing(
+        bundle, [](auto& vp) { writeOutput(vp, value); }, std::nullopt, std::nullopt, &results);
+
+    EXPECT_FALSE(anyFailed(results)) << "Test should run and pass when no skip is set";
+    EXPECT_FALSE(anySkipped(results)) << "Test should not be skipped when lookupSkip is nullopt";
+}
+
+} // namespace
+
+// NOLINTEND(readability-identifier-naming)
diff --git a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
index 9f016866b2c8..af361e7a1308 100644
--- a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
+++ b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
@@ -100,6 +100,50 @@ TEST(TestSynthesisTracker, DerivedInputFails)
     EXPECT_NE(result.reason.find("derived"), std::string::npos);
 }
 
+// fillComputed accounts for a derived input with NO refusal -> ok(), and the
+// computed bytes are copied into the leaf input.
+TEST(TestSynthesisTracker, FillComputedSucceedsAndCopies)
+{
+    auto inputs = makeTensors({1, 2});
+    const std::vector<int64_t> owned = {1, 2};
+    std::mt19937 rng(42);
+
+    // A source tensor holding a known value, matching the leaf's dtype/shape.
+    auto source = std::make_unique<hipdnn_data_sdk::utilities::Tensor<float>>(
+        std::vector<int64_t>{2, 3}, std::vector<int64_t>{3, 1});
+    source->fillTensorWithValue(7.5f);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.fillComputed(2, *source); // the "recipe" result
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+
+    // The bytes landed in the leaf input.
+    const auto* data = static_cast<const float*>(inputs.at(2)->rawHostData());
+    for(size_t i = 0; i < inputs.at(2)->elementCount(); ++i)
+    {
+        EXPECT_FLOAT_EQ(data[i], 7.5f);
+    }
+}
+
+// tensorAt returns an owned leaf input so a recipe can read already-filled
+// values, and returns nullptr for non-owned uids.
+TEST(TestSynthesisTracker, TensorAtReadsOwnedAndNullsNonOwned)
+{
+    auto inputs = makeTensors({1});
+    const std::vector<int64_t> owned = {1};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+
+    EXPECT_EQ(tracker.tensorAt(1), inputs.at(1).get());
+    EXPECT_EQ(tracker.tensorAt(0), nullptr); // absent-optional
+    EXPECT_EQ(tracker.tensorAt(99), nullptr); // not owned
+}
+
 // uid 0 (absent optional tensor) is silently ignored, not treated as owned.
 TEST(TestSynthesisTracker, ZeroUidIgnored)
 {

From 75bf232fc5e9eb2aabec055d706d1be26713d576 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 17:08:53 -0400
Subject: [PATCH 15/18] Remove fillComputed/tensorAt from SynthesisTracker
 (deferred to input-init unification branch)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../golden/input_init/SynthesisTracker.hpp    | 42 +-----------------
 .../tests/TestSynthesisTracker.cpp            | 44 -------------------
 2 files changed, 2 insertions(+), 84 deletions(-)

diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
index 033877dbedb0..05890f5ee6e8 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
@@ -114,16 +114,8 @@ struct SynthesisResult
 //                generation. In a fused fwd+bwd graph the forward output flows
 //                to the backward input as a virtual tensor (not owned, silently
 //                skipped). In a standalone backward, the same tensor is a leaf
-//                input. Two ways to handle it:
-//                  * markDerived — record it and let finish() refuse (SKIP),
-//                    used when no recipe exists to produce a consistent value.
-//                  * fillComputed — the fill function runs the recipe itself
-//                    (e.g. a CPU forward pass to produce `o`/`stats` consistent
-//                    with the q/k/v it already filled FREE) and hands the
-//                    result to the tracker. This accounts for the input with
-//                    NO refusal, so finish() succeeds and the bundle runs.
-//                A recipe reads the already-filled FREE inputs via tensorAt()
-//                and writes the computed tensor back via fillComputed().
+//                input. markDerived records it and lets finish() refuse (SKIP),
+//                used when no recipe exists to produce a consistent value.
 //
 // finish() succeeds only when every owned leaf input was declared as some role
 // AND none were STRUCTURED or DERIVED. Undeclared inputs and refused inputs both
@@ -165,36 +157,6 @@ class SynthesisTracker
         _accounted.insert(uid);
     }
 
-    // Declares `uid` as DERIVED-and-produced — the fill function computed a
-    // consistent value itself (the recipe) and supplies it here. Copies the
-    // bytes into the leaf input and accounts for it with NO refusal, so a
-    // graph that would otherwise SKIP (markDerived) instead runs. `source`
-    // must have the same dtype/shape as the leaf input at `uid`.
-    void fillComputed(int64_t uid, const hipdnn_data_sdk::utilities::ITensor& source)
-    {
-        if(!isOwned(uid))
-        {
-            return;
-        }
-        auto& dst = *_inputs.at(uid);
-        const auto* src = const_cast<hipdnn_data_sdk::utilities::ITensor&>(source).rawHostData();
-        dst.fillWithData(src, source.elementSpace() * source.elementSize());
-        _accounted.insert(uid);
-    }
-
-    // Read access to an already-filled leaf input, so a recipe can compute a
-    // derived input from inputs filled earlier in the same node (e.g. read
-    // q/k/v to produce o/stats). Returns nullptr if `uid` is not an owned leaf
-    // (virtual, output, or absent-optional uid=0).
-    hipdnn_data_sdk::utilities::ITensor* tensorAt(int64_t uid)
-    {
-        if(!isOwned(uid))
-        {
-            return nullptr;
-        }
-        return _inputs.at(uid).get();
-    }
-
     // Declares `uid` as STRUCTURED — accounts for it but records a refusal.
     void markStructured(int64_t uid, const char* role)
     {
diff --git a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
index af361e7a1308..9f016866b2c8 100644
--- a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
+++ b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
@@ -100,50 +100,6 @@ TEST(TestSynthesisTracker, DerivedInputFails)
     EXPECT_NE(result.reason.find("derived"), std::string::npos);
 }
 
-// fillComputed accounts for a derived input with NO refusal -> ok(), and the
-// computed bytes are copied into the leaf input.
-TEST(TestSynthesisTracker, FillComputedSucceedsAndCopies)
-{
-    auto inputs = makeTensors({1, 2});
-    const std::vector<int64_t> owned = {1, 2};
-    std::mt19937 rng(42);
-
-    // A source tensor holding a known value, matching the leaf's dtype/shape.
-    auto source = std::make_unique<hipdnn_data_sdk::utilities::Tensor<float>>(
-        std::vector<int64_t>{2, 3}, std::vector<int64_t>{3, 1});
-    source->fillTensorWithValue(7.5f);
-
-    SynthesisTracker tracker(owned, inputs);
-    tracker.fillFree(1, -1.f, 1.f, rng);
-    tracker.fillComputed(2, *source); // the "recipe" result
-
-    const auto result = tracker.finish("TestOp");
-    EXPECT_TRUE(result.filled);
-
-    // The bytes landed in the leaf input.
-    const auto* data = static_cast<const float*>(inputs.at(2)->rawHostData());
-    for(size_t i = 0; i < inputs.at(2)->elementCount(); ++i)
-    {
-        EXPECT_FLOAT_EQ(data[i], 7.5f);
-    }
-}
-
-// tensorAt returns an owned leaf input so a recipe can read already-filled
-// values, and returns nullptr for non-owned uids.
-TEST(TestSynthesisTracker, TensorAtReadsOwnedAndNullsNonOwned)
-{
-    auto inputs = makeTensors({1});
-    const std::vector<int64_t> owned = {1};
-    std::mt19937 rng(42);
-
-    SynthesisTracker tracker(owned, inputs);
-    tracker.fillFree(1, -1.f, 1.f, rng);
-
-    EXPECT_EQ(tracker.tensorAt(1), inputs.at(1).get());
-    EXPECT_EQ(tracker.tensorAt(0), nullptr); // absent-optional
-    EXPECT_EQ(tracker.tensorAt(99), nullptr); // not owned
-}
-
 // uid 0 (absent optional tensor) is silently ignored, not treated as owned.
 TEST(TestSynthesisTracker, ZeroUidIgnored)
 {

From 50a8bd7dfb78a0a8f7fd9e0643d97cd14a11fb0d Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 17:19:26 -0400
Subject: [PATCH 16/18] Replace golden harness virtual tolerance/skip seam with
 direct TomlGuards calls

Both harnesses now use the same shared free functions (skipIfTomlMatched,
applyTomlToleranceOverride) from TomlGuards.hpp. The TOML lookup logic is
already unit-tested in TestTestSettings.cpp.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ...raphGoldenReferenceVerificationHarness.cpp |  22 +-
 ...raphGoldenReferenceVerificationHarness.hpp |  17 +-
 .../integration-tests/tests/CMakeLists.txt    |   1 -
 .../tests/TestGoldenToleranceAndSkips.cpp     | 293 ------------------
 4 files changed, 2 insertions(+), 331 deletions(-)
 delete mode 100644 dnn-providers/integration-tests/tests/TestGoldenToleranceAndSkips.cpp

diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
index 7573827a040f..c4fca01301cc 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
@@ -506,14 +506,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::compareEach(OutputTenso
         float rtol = 0.0f;
         resolveTolerances(wrapper, dataType, atol, rtol);
 
-        const auto testName = currentTestName();
-        if(auto ovr = lookupToleranceOverride(testName))
-        {
-            atol = ovr->atol;
-            rtol = ovr->rtol;
-            HIPDNN_PLUGIN_LOG_INFO("Tolerance override applied for "
-                                   << testName << ": atol=" << atol << " rtol=" << rtol);
-        }
+        applyTomlToleranceOverride(currentTestName(), atol, rtol);
 
         compareOutputTensor(uid, *attrs, dataType, expectedTensor, actualTensor, atol, rtol);
     }
@@ -748,17 +741,4 @@ void IntegrationGraphGoldenReferenceVerificationHarness::applyMetadataGuards() c
     }
 }
 
-std::optional<std::string> IntegrationGraphGoldenReferenceVerificationHarness::lookupSkip(
-    const std::string& testName) const
-{
-    return TestConfig::get().findSkipForTest(testName);
-}
-
-std::optional<ToleranceOverride>
-    IntegrationGraphGoldenReferenceVerificationHarness::lookupToleranceOverride(
-        const std::string& testName) const
-{
-    return TestConfig::get().findToleranceOverride(testName);
-}
-
 } // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
index 345a7dfe8233..6388716e390c 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
@@ -92,13 +92,7 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
             GTEST_SKIP() << "No bundle set";
         }
 
-        {
-            const auto testName = currentTestName();
-            if(auto reason = lookupSkip(testName))
-            {
-                GTEST_SKIP() << "[arch " << TestConfig::get().getCurrentArch() << "] " << *reason;
-            }
-        }
+        skipIfTomlMatched(currentTestName());
 
         applyMetadataGuards();
     }
@@ -132,15 +126,6 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
     // TestConfig singleton, which is only initialized by the real test main.
     virtual void applyMetadataGuards() const;
 
-    // TOML-driven skip check. Default reads TestConfig::get().findSkipForTest().
-    // Override in tests to avoid the singleton.
-    virtual std::optional<std::string> lookupSkip(const std::string& testName) const;
-
-    // TOML-driven tolerance override. Default reads TestConfig::get().findToleranceOverride().
-    // Override in tests to inject controlled values.
-    virtual std::optional<ToleranceOverride>
-        lookupToleranceOverride(const std::string& testName) const;
-
 private:
     bool _requiresDevice;
     std::filesystem::path _bundlePath;
diff --git a/dnn-providers/integration-tests/tests/CMakeLists.txt b/dnn-providers/integration-tests/tests/CMakeLists.txt
index 9f013992bd63..705b0bee96e7 100644
--- a/dnn-providers/integration-tests/tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/tests/CMakeLists.txt
@@ -22,7 +22,6 @@ add_executable(hipdnn_integration_tests_unit_tests
     TestSynthesisTracker.cpp
     TestSynthesizeInputs.cpp
     TestVerificationModePaths.cpp
-    TestGoldenToleranceAndSkips.cpp
 )
 
 target_include_directories(hipdnn_integration_tests_unit_tests
diff --git a/dnn-providers/integration-tests/tests/TestGoldenToleranceAndSkips.cpp b/dnn-providers/integration-tests/tests/TestGoldenToleranceAndSkips.cpp
deleted file mode 100644
index 47a21f98551d..000000000000
--- a/dnn-providers/integration-tests/tests/TestGoldenToleranceAndSkips.cpp
+++ /dev/null
@@ -1,293 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-// Tests the TOML tolerance override and test_skips wiring in the golden harness:
-//
-//   1. lookupToleranceOverride replaces atol/rtol when matched
-//   2. Per-op default (priority 2) is used when override returns nullopt
-//   3. lookupSkip skips the test when matched
-//   4. Test runs normally when lookupSkip returns nullopt
-
-#include <gtest/gtest-spi.h>
-#include <gtest/gtest.h>
-
-#include <cstring>
-#include <filesystem>
-#include <fstream>
-#include <functional>
-#include <memory>
-#include <optional>
-#include <string>
-#include <vector>
-
-#include <hipdnn_test_sdk/utilities/FileUtilities.hpp>
-
-#include "harness/TestConfig.hpp"
-#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
-#include "harness/golden/IntegrationTestBundle.hpp"
-
-// NOLINTBEGIN(readability-identifier-naming)
-
-using namespace hipdnn_integration_tests;
-using namespace hipdnn_integration_tests::golden;
-
-namespace
-{
-
-using EngineStub = std::function<void(std::unordered_map<int64_t, void*>&)>;
-
-class ToleranceTestableHarness : public IntegrationGraphGoldenReferenceVerificationHarness
-{
-public:
-    ToleranceTestableHarness(EngineStub engineStub,
-                             std::optional<ToleranceOverride> tolOverride,
-                             std::optional<std::string> skipReason)
-        : IntegrationGraphGoldenReferenceVerificationHarness(/*requiresDevice=*/false)
-        , _engineStub(std::move(engineStub))
-        , _tolOverride(std::move(tolOverride))
-        , _skipReason(std::move(skipReason))
-    {
-    }
-
-    using IntegrationGraphGoldenReferenceVerificationHarness::SetUp;
-    using IntegrationGraphGoldenReferenceVerificationHarness::TestBody;
-
-protected:
-    VerificationMode getVerificationMode() const override
-    {
-        return VerificationMode::GOLDEN;
-    }
-
-    void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack) override
-    {
-        _engineStub(variantPack);
-    }
-
-    void runReferenceExecutor(ReferenceExecutorType /*type*/,
-                              std::unordered_map<int64_t, void*>& /*variantPack*/) override
-    {
-    }
-
-    std::unique_ptr<IReferenceGraphExecutor>
-        makeReferenceExecutor(ReferenceExecutorType /*type*/) override
-    {
-        return nullptr;
-    }
-
-    void applyMetadataGuards() const override {}
-
-    std::optional<std::string> lookupSkip(const std::string& /*testName*/) const override
-    {
-        return _skipReason;
-    }
-
-    std::optional<ToleranceOverride>
-        lookupToleranceOverride(const std::string& /*testName*/) const override
-    {
-        return _tolOverride;
-    }
-
-private:
-    EngineStub _engineStub;
-    std::optional<ToleranceOverride> _tolOverride;
-    std::optional<std::string> _skipReason;
-};
-
-class TestGoldenToleranceAndSkips : public ::testing::Test
-{
-protected:
-    std::optional<hipdnn_test_sdk::utilities::ScopedDirectory> _scopedDir;
-    std::filesystem::path _tempDir;
-
-    static constexpr float K_OUTPUT_VALUE = 3.5f;
-    static constexpr int64_t K_OUTPUT_UID = 5;
-    static constexpr size_t K_OUTPUT_ELEMS = 120;
-
-    void SetUp() override
-    {
-        auto path
-            = std::filesystem::temp_directory_path()
-              / ("tol_skip_test_"
-                 + std::to_string(::testing::UnitTest::GetInstance()->current_test_info()->line()));
-        std::filesystem::remove_all(path);
-        _scopedDir.emplace(path);
-        _tempDir = _scopedDir->path();
-    }
-
-    static void writeBundleFiles(const std::filesystem::path& dir,
-                                 const std::string& name,
-                                 float goldenValue)
-    {
-        std::filesystem::create_directories(dir);
-        std::ofstream(dir / (name + ".json"))
-            << R"({"nodes": [{"inputs": {"x_tensor_uid": 0, "mean_tensor_uid": 1, )"
-               R"("inv_variance_tensor_uid": 2, "scale_tensor_uid": 3, "bias_tensor_uid": 4}, )"
-               R"("outputs": {"y_tensor_uid": 5}, "type": "BatchnormInferenceAttributes", )"
-               R"("compute_data_type": "float", "name": ""}], "tensors": [)"
-               R"({"name": "", "uid": 0, "strides": [60, 20, 5, 1], "dims": [2, 3, 4, 5], )"
-               R"("data_type": "float", "virtual": false}, )"
-               R"({"name": "", "uid": 1, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
-               R"("data_type": "float", "virtual": false}, )"
-               R"({"name": "", "uid": 2, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
-               R"("data_type": "float", "virtual": false}, )"
-               R"({"name": "", "uid": 3, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
-               R"("data_type": "float", "virtual": false}, )"
-               R"({"name": "", "uid": 4, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
-               R"("data_type": "float", "virtual": false}, )"
-               R"({"name": "", "uid": 5, "strides": [60, 20, 5, 1], "dims": [2, 3, 4, 5], )"
-               R"("data_type": "float", "virtual": false}], "io_data_type": "float", )"
-               R"("compute_data_type": "float", "intermediate_data_type": "float", "name": ""})";
-
-        std::ofstream(dir / (name + ".meta.json"))
-            << R"({"format_version": 1, "operation": "BatchnormInference"})";
-
-        const auto basePath = (dir / name).string();
-        const auto writeFloatBin = [&](int64_t uid, size_t elems, float value) {
-            const std::vector<float> data(elems, value);
-            std::ofstream out(basePath + ".tensor" + std::to_string(uid) + ".bin",
-                              std::ios::binary);
-            out.write(reinterpret_cast<const char*>(data.data()),
-                      static_cast<std::streamsize>(data.size() * sizeof(float)));
-        };
-
-        writeFloatBin(0, 120, 0.0f);
-        writeFloatBin(1, 3, 0.0f);
-        writeFloatBin(2, 3, 0.0f);
-        writeFloatBin(3, 3, 0.0f);
-        writeFloatBin(4, 3, 0.0f);
-        writeFloatBin(K_OUTPUT_UID, K_OUTPUT_ELEMS, goldenValue);
-    }
-
-    std::shared_ptr<IntegrationTestBundle> loadBundle(const std::string& name,
-                                                      float goldenValue) const
-    {
-        const auto dir = _tempDir / name;
-        writeBundleFiles(dir, name, goldenValue);
-        auto result = loadIntegrationTestBundle(dir / (name + ".json"));
-        EXPECT_TRUE(std::holds_alternative<IntegrationTestBundle>(result));
-        return std::make_shared<IntegrationTestBundle>(
-            std::move(std::get<IntegrationTestBundle>(result)));
-    }
-
-    static void writeOutput(std::unordered_map<int64_t, void*>& variantPack, float value)
-    {
-        auto* ptr = static_cast<float*>(variantPack.at(K_OUTPUT_UID));
-        std::fill(ptr, ptr + K_OUTPUT_ELEMS, value);
-    }
-
-    static bool anyFailed(const ::testing::TestPartResultArray& results)
-    {
-        for(int i = 0; i < results.size(); ++i)
-        {
-            if(results.GetTestPartResult(i).failed())
-            {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    static bool anySkipped(const ::testing::TestPartResultArray& results)
-    {
-        for(int i = 0; i < results.size(); ++i)
-        {
-            if(results.GetTestPartResult(i).skipped())
-            {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    static void runCapturing(std::shared_ptr<IntegrationTestBundle> bundle,
-                             EngineStub engineStub,
-                             std::optional<ToleranceOverride> tolOverride,
-                             std::optional<std::string> skipReason,
-                             ::testing::TestPartResultArray* results)
-    {
-        ToleranceTestableHarness harness(
-            std::move(engineStub), std::move(tolOverride), std::move(skipReason));
-        harness.setBundle(std::move(bundle), "tol-skip-test-bundle");
-
-        const ::testing::ScopedFakeTestPartResultReporter reporter(
-            ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, results);
-        harness.SetUp();
-        if(!anySkipped(*results))
-        {
-            harness.TestBody();
-        }
-    }
-};
-
-// Engine writes output that differs from golden by 0.05.
-// BN inference fp32 default tolerance is 2e-4 — so this FAILS without override.
-// A TOML override with atol=0.1 makes it pass.
-TEST_F(TestGoldenToleranceAndSkips, ToleranceOverrideApplied)
-{
-    constexpr float goldenValue = 1.0f;
-    constexpr float engineValue = 1.05f;
-    auto bundle = loadBundle("tol_override", goldenValue);
-
-    ::testing::TestPartResultArray results;
-    runCapturing(
-        bundle,
-        [](auto& vp) { writeOutput(vp, engineValue); },
-        ToleranceOverride{0.1f, 0.1f},
-        std::nullopt,
-        &results);
-
-    EXPECT_FALSE(anyFailed(results)) << "Should pass with the relaxed TOML override tolerance";
-}
-
-// Same scenario but lookupToleranceOverride returns nullopt.
-// The per-op default (BN inference fp32 = 2e-4) is used, so the 0.05 diff FAILS.
-TEST_F(TestGoldenToleranceAndSkips, DefaultToleranceUsedWhenNoOverride)
-{
-    constexpr float goldenValue = 1.0f;
-    constexpr float engineValue = 1.05f;
-    auto bundle = loadBundle("tol_default", goldenValue);
-
-    ::testing::TestPartResultArray results;
-    runCapturing(
-        bundle,
-        [](auto& vp) { writeOutput(vp, engineValue); },
-        std::nullopt,
-        std::nullopt,
-        &results);
-
-    EXPECT_TRUE(anyFailed(results)) << "Should fail with the tight per-op default tolerance";
-}
-
-// lookupSkip returns a reason string — SetUp() should GTEST_SKIP.
-TEST_F(TestGoldenToleranceAndSkips, SkipApplied)
-{
-    auto bundle = loadBundle("skip_test", 1.0f);
-
-    ::testing::TestPartResultArray results;
-    runCapturing(
-        bundle,
-        [](auto& /*vp*/) {},
-        std::nullopt,
-        std::string("known failure on gfx1100"),
-        &results);
-
-    EXPECT_TRUE(anySkipped(results)) << "Test should be skipped when lookupSkip returns a reason";
-}
-
-// lookupSkip returns nullopt — test runs normally (and passes because engine matches golden).
-TEST_F(TestGoldenToleranceAndSkips, NoSkipRunsNormally)
-{
-    constexpr float value = 1.0f;
-    auto bundle = loadBundle("no_skip", value);
-
-    ::testing::TestPartResultArray results;
-    runCapturing(
-        bundle, [](auto& vp) { writeOutput(vp, value); }, std::nullopt, std::nullopt, &results);
-
-    EXPECT_FALSE(anyFailed(results)) << "Test should run and pass when no skip is set";
-    EXPECT_FALSE(anySkipped(results)) << "Test should not be skipped when lookupSkip is nullopt";
-}
-
-} // namespace
-
-// NOLINTEND(readability-identifier-naming)

From fcdc9efa64b3c332e60462b764b928c807151c4f Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 22:04:12 -0400
Subject: [PATCH 17/18] Unify graph/bundle harness input-init and rename
 golden->bundle

Harness unification (ALMIOPEN-1969 follow-up):

- Share input synthesis between both harnesses: move SynthesisTracker
  and SynthesizeInputs to harness/input_init/ (namespace
  hipdnn_integration_tests). The non-golden harness now drives its
  initializeBundle() through the same synthesis switch, with a random
  [-1,1] fallback. Remove 4 of 7 non-golden initializeBundle overrides
  whose ranges/seeds now match the shared fill functions; 3 remain
  (fused-graph range conflicts and a large-values stress test).

- Rename harness/golden/ -> harness/bundle/ and namespace
  hipdnn_integration_tests::golden -> ::bundle. "golden" was inaccurate:
  the harness verifies bundles via golden data OR a GPU/CPU reference,
  and most bundles ship no golden output. Rename the class
  IntegrationGraphGoldenReferenceVerificationHarness ->
  IntegrationBundleVerificationHarness (parallel to its sibling
  IntegrationGraphVerificationHarness). Fix stale "golden" log/report
  strings that mislabeled all bundles or shared comparison paths. The
  golden verification *mode* and golden output *data* vocabulary, and
  the --golden-data-dir CLI flag, are retained (accurate / external).

- Drop redundant hipInit/hipGetDevice/_deviceId from the graph harness:
  HIP initializes lazily and getSharedHandle()->hipdnnCreate() already
  does it before any graph runs; _deviceId was write-only.

- Split bundle metadata guards: the VRAM check applies to every bundle
  the engine runs, but the GPU-arch lock only matters when comparing
  against golden output values (arch-specific). Gate the arch check on
  hasGoldenOutputs so an inputs-only bundle verified against a local
  reference is not wrongly skipped on a different arch.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../integration-tests/CMakeLists.txt          |   2 +-
 .../IntegrationGraphVerificationHarness.hpp   |  80 ++++++++++--
 .../{golden => bundle}/BundleDiscovery.hpp    |   4 +-
 .../{golden => bundle}/BundleRegistration.hpp |  12 +-
 .../IntegrationBundleVerificationHarness.cpp} | 114 +++++++++---------
 .../IntegrationBundleVerificationHarness.hpp} |  24 ++--
 .../IntegrationTestBundle.hpp                 |   4 +-
 .../UnverifiableBundleReport.hpp              |   4 +-
 .../input_init/SynthesisTracker.hpp           |   4 +-
 .../input_init/SynthesizeInputs.hpp           |  25 ++--
 .../IntegrationGpuBatchnormBackward.cpp       |  23 ----
 ...uBatchnormForwardInferenceWithVariance.cpp |  18 ---
 ...IntegrationGpuBatchnormForwardTraining.cpp |  35 ------
 ...nGpuBatchnormFwdInferenceVarianceActiv.cpp |  19 ---
 dnn-providers/integration-tests/src/main.cpp  |   8 +-
 .../integration-tests/tests/CMakeLists.txt    |   4 +-
 .../tests/TestBundleDiscovery.cpp             |   6 +-
 ....cpp => TestBundleVerificationHarness.cpp} |  16 +--
 .../tests/TestSynthesisTracker.cpp            |   4 +-
 .../tests/TestSynthesizeInputs.cpp            |   4 +-
 .../tests/TestVerificationModePaths.cpp       |  14 +--
 .../tests/TestVerificationPaths.cpp           |   4 +-
 22 files changed, 199 insertions(+), 229 deletions(-)
 rename dnn-providers/integration-tests/src/harness/{golden => bundle}/BundleDiscovery.hpp (99%)
 rename dnn-providers/integration-tests/src/harness/{golden => bundle}/BundleRegistration.hpp (94%)
 rename dnn-providers/integration-tests/src/harness/{golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp => bundle/IntegrationBundleVerificationHarness.cpp} (84%)
 rename dnn-providers/integration-tests/src/harness/{golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp => bundle/IntegrationBundleVerificationHarness.hpp} (92%)
 rename dnn-providers/integration-tests/src/harness/{golden => bundle}/IntegrationTestBundle.hpp (99%)
 rename dnn-providers/integration-tests/src/harness/{golden => bundle}/UnverifiableBundleReport.hpp (97%)
 rename dnn-providers/integration-tests/src/harness/{golden => }/input_init/SynthesisTracker.hpp (99%)
 rename dnn-providers/integration-tests/src/harness/{golden => }/input_init/SynthesizeInputs.hpp (97%)
 rename dnn-providers/integration-tests/tests/{TestGoldenVerificationHarness.cpp => TestBundleVerificationHarness.cpp} (94%)

diff --git a/dnn-providers/integration-tests/CMakeLists.txt b/dnn-providers/integration-tests/CMakeLists.txt
index 1f2dd08974af..6daf3bc367c6 100644
--- a/dnn-providers/integration-tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/CMakeLists.txt
@@ -128,7 +128,7 @@ set(INTEGRATION_TESTS_EXE hipdnn_integration_tests)
 
 add_executable(${INTEGRATION_TESTS_EXE}
     src/main.cpp
-    src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+    src/harness/bundle/IntegrationBundleVerificationHarness.cpp
 )
 
 add_subdirectory(src/integration_tests)
diff --git a/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
index d5a49cd23fab..4187b3022a38 100644
--- a/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
@@ -30,6 +30,7 @@
 #include "harness/SupportMatrixCollector.hpp"
 #include "harness/TestConfig.hpp"
 #include "harness/TomlGuards.hpp"
+#include "harness/input_init/SynthesizeInputs.hpp"
 
 namespace hipdnn_integration_tests
 {
@@ -42,7 +43,6 @@ template <typename DataType, typename TestCaseType>
 class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<TestCaseType>
 {
 protected:
-    int _deviceId = 0;
     std::string _testCaseNote;
     std::string _testCaseLayout;
     std::unordered_map<int64_t, std::string> _tensorIdToNameMap;
@@ -54,10 +54,9 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
     {
         SKIP_IF_NO_DEVICES();
 
-        // Initialize HIP
-        ASSERT_EQ(hipInit(0), hipSuccess);
-        ASSERT_EQ(hipGetDevice(&_deviceId), hipSuccess);
-
+        // HIP initializes lazily on first runtime use; the shared hipdnn handle
+        // (getSharedHandle -> hipdnnCreate) does this before any graph executes,
+        // so no explicit hipInit is needed here.
         skipIfTomlMatched(currentTestName());
     }
 
@@ -308,17 +307,80 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
         });
     }
 
-    virtual void initializeBundle([[maybe_unused]] const hipdnn_frontend::graph::Graph& graph,
+    virtual void initializeBundle(const hipdnn_frontend::graph::Graph& graph,
                                   hipdnn_test_sdk::utilities::GraphTensorBundle& bundle,
                                   unsigned int seed)
     {
         bundle.sentinelFillOutputTensors();
 
-        for(auto& tensorPair : bundle.tensors)
+        auto [serialized, serErr] = graph.to_binary();
+        if(serErr.code != hipdnn_frontend::ErrorCode::OK || serialized.empty())
+        {
+            initializeBundleFallback(bundle, seed);
+            return;
+        }
+
+        const auto* fb = hipdnn_flatbuffers_sdk::data_objects::GetGraph(serialized.data());
+        if(fb == nullptr || fb->nodes() == nullptr)
+        {
+            initializeBundleFallback(bundle, seed);
+            return;
+        }
+
+        std::vector<int64_t> leafInputUids;
+        InputTensorMap inputs;
+        for(auto& [uid, tensor] : bundle.tensors)
+        {
+            if(!bundle.isOutput(uid))
+            {
+                leafInputUids.push_back(uid);
+                inputs[uid] = std::move(tensor);
+            }
+        }
+
+        std::mt19937 rng(seed);
+        SynthesisTracker tracker(leafInputUids, inputs);
+
+        bool synthesisOk = true;
+        for(const auto* node : *fb->nodes())
+        {
+            if(node == nullptr)
+            {
+                continue;
+            }
+            auto result = synthesizeNodeInputs(*node, tracker, rng);
+            if(!result.filled)
+            {
+                synthesisOk = false;
+                break;
+            }
+        }
+
+        if(synthesisOk)
+        {
+            auto finalResult = tracker.finish("synthesis");
+            synthesisOk = finalResult.filled;
+        }
+
+        for(auto& [uid, tensor] : inputs)
+        {
+            bundle.tensors[uid] = std::move(tensor);
+        }
+
+        if(!synthesisOk)
+        {
+            initializeBundleFallback(bundle, seed);
+        }
+    }
+
+    void initializeBundleFallback(hipdnn_test_sdk::utilities::GraphTensorBundle& bundle,
+                                  unsigned int seed)
+    {
+        for(auto& [uid, tensor] : bundle.tensors)
         {
-            if(!bundle.isOutput(tensorPair.first))
+            if(!bundle.isOutput(uid))
             {
-                bundle.randomizeTensor(tensorPair.first, -1.0f, 1.0f, seed);
+                bundle.randomizeTensor(uid, -1.0f, 1.0f, seed);
             }
         }
     }
diff --git a/dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp b/dnn-providers/integration-tests/src/harness/bundle/BundleDiscovery.hpp
similarity index 99%
rename from dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp
rename to dnn-providers/integration-tests/src/harness/bundle/BundleDiscovery.hpp
index e2743934e540..800c1288d4ea 100644
--- a/dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/BundleDiscovery.hpp
@@ -14,7 +14,7 @@
 
 #include <hipdnn_plugin_sdk/PluginLogging.hpp>
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests::bundle
 {
 
 // Naming types, kept together. DerivedTestName is the output of deriveTestName()
@@ -272,4 +272,4 @@ inline std::vector<DiscoveredBundle> discoverBundles(const std::filesystem::path
     return bundles;
 }
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/golden/BundleRegistration.hpp b/dnn-providers/integration-tests/src/harness/bundle/BundleRegistration.hpp
similarity index 94%
rename from dnn-providers/integration-tests/src/harness/golden/BundleRegistration.hpp
rename to dnn-providers/integration-tests/src/harness/bundle/BundleRegistration.hpp
index 5f45fc361f5a..6cfbc4e74322 100644
--- a/dnn-providers/integration-tests/src/harness/golden/BundleRegistration.hpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/BundleRegistration.hpp
@@ -15,10 +15,10 @@
 #include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
 
 #include "harness/TestConfig.hpp"
-#include "harness/golden/BundleDiscovery.hpp"
-#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
+#include "harness/bundle/BundleDiscovery.hpp"
+#include "harness/bundle/IntegrationBundleVerificationHarness.hpp"
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests::bundle
 {
 
 namespace detail
@@ -58,7 +58,7 @@ inline void registerBundles(const std::vector<LoadedBundle>& bundles)
             __FILE__,
             __LINE__,
             [loaded = bundle.bundle, path = bundle.jsonPath]() -> ::testing::Test* {
-                auto* test = new IntegrationGraphGoldenReferenceVerificationHarness(
+                auto* test = new IntegrationBundleVerificationHarness(
                     /*requiresDevice=*/true);
                 test->setBundle(loaded, path);
                 return test;
@@ -159,7 +159,7 @@ inline void registerBundleTests()
 
     detail::registerBundles(bundles);
 
-    HIPDNN_PLUGIN_LOG_INFO("Registered " << bundles.size() << " golden bundle test(s)");
+    HIPDNN_PLUGIN_LOG_INFO("Registered " << bundles.size() << " bundle test(s)");
 }
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp
similarity index 84%
rename from dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
rename to dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp
index c4fca01301cc..7e1faadda6d0 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp
@@ -1,7 +1,7 @@
 // Copyright © Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 
-#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
+#include "harness/bundle/IntegrationBundleVerificationHarness.hpp"
 
 #include <algorithm>
 #include <ostream>
@@ -24,16 +24,16 @@
 #include "harness/SharedHandle.hpp"
 #include "harness/TestConfig.hpp"
 #include "harness/TomlGuards.hpp"
-#include "harness/golden/UnverifiableBundleReport.hpp"
-#include "harness/golden/input_init/SynthesizeInputs.hpp"
+#include "harness/bundle/UnverifiableBundleReport.hpp"
 #include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
+#include "harness/input_init/SynthesizeInputs.hpp"
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests::bundle
 {
 
 // ---- virtual defaults ------------------------------------------------------
 
-void IntegrationGraphGoldenReferenceVerificationHarness::executeGraphThroughEngine(
+void IntegrationBundleVerificationHarness::executeGraphThroughEngine(
     std::unordered_map<int64_t, void*>& variantPack)
 {
     auto handle = getSharedHandle();
@@ -89,7 +89,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::executeGraphThroughEngi
     ASSERT_TRUE(result.is_good()) << result.get_message();
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::runReferenceExecutor(
+void IntegrationBundleVerificationHarness::runReferenceExecutor(
     ReferenceExecutorType type, std::unordered_map<int64_t, void*>& variantPack)
 {
     auto executor = makeReferenceExecutor(type);
@@ -97,8 +97,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runReferenceExecutor(
 }
 
 std::unique_ptr<IReferenceGraphExecutor>
-    IntegrationGraphGoldenReferenceVerificationHarness::makeReferenceExecutor(
-        ReferenceExecutorType type)
+    IntegrationBundleVerificationHarness::makeReferenceExecutor(ReferenceExecutorType type)
 {
     switch(type)
     {
@@ -113,12 +112,12 @@ std::unique_ptr<IReferenceGraphExecutor>
 
 // ---- top-level dispatch ----------------------------------------------------
 
-VerificationMode IntegrationGraphGoldenReferenceVerificationHarness::getVerificationMode() const
+VerificationMode IntegrationBundleVerificationHarness::getVerificationMode() const
 {
     return TestConfig::get().getVerificationMode();
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::runComparison()
+void IntegrationBundleVerificationHarness::runComparison()
 {
     if(_bundle->outputTensorUids.empty())
     {
@@ -168,7 +167,7 @@ void skipEngineCouldNotRun(const std::filesystem::path& bundlePath, const std::s
 }
 } // namespace
 
-std::optional<OutputTensors> IntegrationGraphGoldenReferenceVerificationHarness::runEngineOrSkip()
+std::optional<OutputTensors> IntegrationBundleVerificationHarness::runEngineOrSkip()
 {
     std::string error;
     auto engineOutputs = runEngineCapturingOutputs(error);
@@ -179,7 +178,7 @@ std::optional<OutputTensors> IntegrationGraphGoldenReferenceVerificationHarness:
     return engineOutputs;
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::runGoldenMode()
+void IntegrationBundleVerificationHarness::runGoldenMode()
 {
     if(!_bundle->hasGoldenOutputs)
     {
@@ -194,8 +193,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runGoldenMode()
     compareAgainstGolden(*engineOutputs);
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::runExplicitRefMode(
-    ReferenceExecutorType type)
+void IntegrationBundleVerificationHarness::runExplicitRefMode(ReferenceExecutorType type)
 {
     auto engineOutputs = runEngineOrSkip();
     if(!engineOutputs)
@@ -224,7 +222,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runExplicitRefMode(
     }
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::runAutoMode()
+void IntegrationBundleVerificationHarness::runAutoMode()
 {
     auto engineOutputs = runEngineOrSkip();
     if(!engineOutputs)
@@ -289,7 +287,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::runAutoMode()
 
 // ---- inputs ----------------------------------------------------------------
 
-bool IntegrationGraphGoldenReferenceVerificationHarness::ensureInputsAvailable()
+bool IntegrationBundleVerificationHarness::ensureInputsAvailable()
 {
     if(_bundle->tensors.has_value())
     {
@@ -298,7 +296,7 @@ bool IntegrationGraphGoldenReferenceVerificationHarness::ensureInputsAvailable()
     return synthesizeInputs();
 }
 
-bool IntegrationGraphGoldenReferenceVerificationHarness::synthesizeInputs()
+bool IntegrationBundleVerificationHarness::synthesizeInputs()
 {
     const auto wrapper = _bundle->graphWrapper();
     const auto& tensorAttrMap = wrapper.getTensorMap();
@@ -354,7 +352,7 @@ bool IntegrationGraphGoldenReferenceVerificationHarness::synthesizeInputs()
 // make an unwritten output indistinguishable from a legitimately-computed zero,
 // so engine and reference could silently agree on garbage (both untouched zeros)
 // and the comparison would vacuously pass.
-OutputTensors IntegrationGraphGoldenReferenceVerificationHarness::allocateSentinelOutputs() const
+OutputTensors IntegrationBundleVerificationHarness::allocateSentinelOutputs() const
 {
     const auto wrapper = _bundle->graphWrapper();
     const auto& tensorAttrMap = wrapper.getTensorMap();
@@ -369,8 +367,8 @@ OutputTensors IntegrationGraphGoldenReferenceVerificationHarness::allocateSentin
 }
 
 std::unordered_map<int64_t, void*>
-    IntegrationGraphGoldenReferenceVerificationHarness::buildVariantPack(OutputTensors& outputs,
-                                                                         bool useDevice) const
+    IntegrationBundleVerificationHarness::buildVariantPack(OutputTensors& outputs,
+                                                           bool useDevice) const
 {
     std::unordered_map<int64_t, void*> variantPack;
     const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
@@ -392,8 +390,7 @@ std::unordered_map<int64_t, void*>
 }
 
 std::optional<OutputTensors>
-    IntegrationGraphGoldenReferenceVerificationHarness::runEngineCapturingOutputs(
-        std::string& error)
+    IntegrationBundleVerificationHarness::runEngineCapturingOutputs(std::string& error)
 {
     OutputTensors engineOutputs = allocateSentinelOutputs();
     auto variantPack = buildVariantPack(engineOutputs, /*useDevice=*/_requiresDevice);
@@ -422,9 +419,9 @@ std::optional<OutputTensors>
     return engineOutputs;
 }
 
-IntegrationGraphGoldenReferenceVerificationHarness::RefRunResult
-    IntegrationGraphGoldenReferenceVerificationHarness::runReferenceCapturingOutputs(
-        ReferenceExecutorType type, OutputTensors& refOutputs)
+IntegrationBundleVerificationHarness::RefRunResult
+    IntegrationBundleVerificationHarness::runReferenceCapturingOutputs(ReferenceExecutorType type,
+                                                                       OutputTensors& refOutputs)
 {
     refOutputs = allocateSentinelOutputs();
     const bool useDevice = _requiresDevice && (type == ReferenceExecutorType::GPU);
@@ -447,14 +444,13 @@ IntegrationGraphGoldenReferenceVerificationHarness::RefRunResult
     return {RefStatus::RAN, {}};
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::markOutputsModified(
-    OutputTensors& outputs) const
+void IntegrationBundleVerificationHarness::markOutputsModified(OutputTensors& outputs) const
 {
     markOutputsModifiedFor(outputs, _requiresDevice);
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::markOutputsModifiedFor(
-    OutputTensors& outputs, bool device)
+void IntegrationBundleVerificationHarness::markOutputsModifiedFor(OutputTensors& outputs,
+                                                                  bool device)
 {
     for(auto& [uid, tensor] : outputs)
     {
@@ -471,16 +467,15 @@ void IntegrationGraphGoldenReferenceVerificationHarness::markOutputsModifiedFor(
 
 // ---- comparison ------------------------------------------------------------
 
-void IntegrationGraphGoldenReferenceVerificationHarness::compareAgainstGolden(
-    OutputTensors& engineOutputs)
+void IntegrationBundleVerificationHarness::compareAgainstGolden(OutputTensors& engineOutputs)
 {
     compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
         return *_bundle->tensors->at(uid);
     });
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::compareOutputs(
-    OutputTensors& engineOutputs, OutputTensors& expected)
+void IntegrationBundleVerificationHarness::compareOutputs(OutputTensors& engineOutputs,
+                                                          OutputTensors& expected)
 {
     compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
         return *expected.at(uid);
@@ -488,8 +483,8 @@ void IntegrationGraphGoldenReferenceVerificationHarness::compareOutputs(
 }
 
 template <typename ExpectedLookup>
-void IntegrationGraphGoldenReferenceVerificationHarness::compareEach(OutputTensors& engineOutputs,
-                                                                     ExpectedLookup expectedFor)
+void IntegrationBundleVerificationHarness::compareEach(OutputTensors& engineOutputs,
+                                                       ExpectedLookup expectedFor)
 {
     auto wrapper = _bundle->graphWrapper();
     const auto& tensorAttrMap = wrapper.getTensorMap();
@@ -514,27 +509,27 @@ void IntegrationGraphGoldenReferenceVerificationHarness::compareEach(OutputTenso
 
 // ---- reporting helpers -----------------------------------------------------
 
-void IntegrationGraphGoldenReferenceVerificationHarness::skipUnverifiable(const std::string& reason)
+void IntegrationBundleVerificationHarness::skipUnverifiable(const std::string& reason)
 {
     UnverifiableBundleReport::get().record(
         _bundlePath.string(), reason, UnverifiableSeverity::UNVERIFIABLE);
     GTEST_SKIP() << "Unverifiable: " << reason << " (" << _bundlePath << ")";
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::recordRefError(const std::string& reason)
+void IntegrationBundleVerificationHarness::recordRefError(const std::string& reason)
 {
     UnverifiableBundleReport::get().record(
         _bundlePath.string(), reason, UnverifiableSeverity::REF_ERROR);
 }
 
-std::string IntegrationGraphGoldenReferenceVerificationHarness::refLabel(ReferenceExecutorType type)
+std::string IntegrationBundleVerificationHarness::refLabel(ReferenceExecutorType type)
 {
     return type == ReferenceExecutorType::GPU ? "GPU reference" : "CPU reference";
 }
 
 // ---- comparison + tolerance machinery --------------------------------------
 
-void IntegrationGraphGoldenReferenceVerificationHarness::compareOutputTensor(
+void IntegrationBundleVerificationHarness::compareOutputTensor(
     int64_t uid,
     const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
     hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
@@ -555,7 +550,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::compareOutputTensor(
     }
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::writeTensorDiffReport(
+void IntegrationBundleVerificationHarness::writeTensorDiffReport(
     std::ostream& os,
     int64_t uid,
     const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
@@ -589,7 +584,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::writeTensorDiffReport(
 }
 
 template <typename T>
-void IntegrationGraphGoldenReferenceVerificationHarness::writeFpDiffReport(
+void IntegrationBundleVerificationHarness::writeFpDiffReport(
     std::ostream& os,
     int64_t uid,
     const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
@@ -603,14 +598,14 @@ void IntegrationGraphGoldenReferenceVerificationHarness::writeFpDiffReport(
     hipdnn_test_sdk::utilities::printTensorDiffSummary(os, labelFor(uid, attrs), summary);
 }
 
-std::string IntegrationGraphGoldenReferenceVerificationHarness::labelFor(
+std::string IntegrationBundleVerificationHarness::labelFor(
     int64_t uid, const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs)
 {
     const auto* name = attrs.name();
     return (name != nullptr && !name->empty()) ? name->str() : ("uid=" + std::to_string(uid));
 }
 
-std::string IntegrationGraphGoldenReferenceVerificationHarness::reportHeader(
+std::string IntegrationBundleVerificationHarness::reportHeader(
     int64_t uid,
     const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
     hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
@@ -619,7 +614,7 @@ std::string IntegrationGraphGoldenReferenceVerificationHarness::reportHeader(
     float rtol) const
 {
     std::ostringstream os;
-    os << "\nGolden comparison FAILED\n"
+    os << "\nBundle output comparison FAILED\n"
        << "  Bundle: " << _bundlePath << "\n"
        << "  Tensor: " << labelFor(uid, attrs) << " (UID " << uid << ", output)\n"
        << "  Shape:  " << hipdnn_test_sdk::utilities::StreamVec(expected.dims()) << "  "
@@ -628,13 +623,13 @@ std::string IntegrationGraphGoldenReferenceVerificationHarness::reportHeader(
     return os.str();
 }
 
-std::string IntegrationGraphGoldenReferenceVerificationHarness::dataTypeName(
+std::string IntegrationBundleVerificationHarness::dataTypeName(
     hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
 {
     return hipdnn_flatbuffers_sdk::data_objects::EnumNameDataType(dataType);
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::resolveTolerances(
+void IntegrationBundleVerificationHarness::resolveTolerances(
     const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
     hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
     float& atol,
@@ -646,7 +641,7 @@ void IntegrationGraphGoldenReferenceVerificationHarness::resolveTolerances(
 }
 
 template <typename T>
-float IntegrationGraphGoldenReferenceVerificationHarness::toleranceForNodeAttributes(
+float IntegrationBundleVerificationHarness::toleranceForNodeAttributes(
     hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType)
 {
     using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
@@ -686,7 +681,7 @@ float IntegrationGraphGoldenReferenceVerificationHarness::toleranceForNodeAttrib
     }
 }
 
-float IntegrationGraphGoldenReferenceVerificationHarness::deriveDefaultTolerance(
+float IntegrationBundleVerificationHarness::deriveDefaultTolerance(
     const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
     hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
 {
@@ -705,7 +700,7 @@ float IntegrationGraphGoldenReferenceVerificationHarness::deriveDefaultTolerance
     return found ? maxTolerance : 1e-3f;
 }
 
-float IntegrationGraphGoldenReferenceVerificationHarness::toleranceForDataType(
+float IntegrationBundleVerificationHarness::toleranceForDataType(
     hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
     hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
 {
@@ -726,19 +721,30 @@ float IntegrationGraphGoldenReferenceVerificationHarness::toleranceForDataType(
     }
 }
 
-void IntegrationGraphGoldenReferenceVerificationHarness::applyMetadataGuards() const
+void IntegrationBundleVerificationHarness::applyMetadataGuards() const
 {
+    // VRAM is an execution-feasibility guard: the engine allocates the same
+    // buffers and runs the same graph regardless of how its output is verified,
+    // so this applies to every bundle (golden or reference-verified).
     if(auto reason = hipdnn_test_sdk::utilities::checkVramRequirement(
            _bundle->metadata, TestConfig::get().getCurrentDeviceVramMb()))
     {
         GTEST_SKIP() << *reason;
     }
 
-    if(auto reason = hipdnn_test_sdk::utilities::checkArchCompatibility(
-           _bundle->metadata, TestConfig::get().getCurrentArch()))
+    // Arch-lock only matters for golden data: golden output VALUES are
+    // numerically arch-specific (AITER / GPU-ref generated). Inputs are not
+    // arch-locked. When there is no golden data the engine output is verified
+    // against a reference executor run on THIS device, so the bundle's recorded
+    // arch is irrelevant and must not gate the test.
+    if(_bundle->hasGoldenOutputs)
     {
-        GTEST_SKIP() << *reason;
+        if(auto reason = hipdnn_test_sdk::utilities::checkArchCompatibility(
+               _bundle->metadata, TestConfig::get().getCurrentArch()))
+        {
+            GTEST_SKIP() << *reason;
+        }
     }
 }
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp
similarity index 92%
rename from dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
rename to dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp
index 6388716e390c..b097de7a4ad7 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp
@@ -22,9 +22,9 @@
 #include "harness/IReferenceGraphExecutor.hpp"
 #include "harness/TestConfig.hpp"
 #include "harness/TomlGuards.hpp"
-#include "harness/golden/IntegrationTestBundle.hpp"
+#include "harness/bundle/IntegrationTestBundle.hpp"
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests::bundle
 {
 
 // Output tensors, keyed by uid. Used both for the engine's computed "actual"
@@ -55,19 +55,15 @@ using OutputTensors
 //   * Virtual (inter-node) tensors are allocated internally by each executor; the
 //     variant packs we build carry only real (input + output) tensors.
 //
-// TODO(ALMIOPEN-1969 follow-up): Unify graph-init with the non-golden harness.
-//   Stage 1 — Route non-golden ops whose initializeBundle() is plain randomize
-//             (conv, matmul, BN-inference, reduction, rmsnorm-fwd, layernorm,
-//             pointwise) through the synthesis switch. Zero behavioral change.
-//   Stage 2 — Migrate structured recipes one op at a time: copy the exact
-//             ranges/seeds/derivation from each non-golden subclass override
-//             into the corresponding fill function, using fillComputed/tensorAt
-//             for derived inputs. Delete each override once its fill fn works.
-//   Stage 3 — Both harnesses share one init pipeline via SynthesisTracker.
-class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Test
+// NOTE: Stages 1-3 of init unification are done (ALMIOPEN-1969 follow-up).
+//   Both harnesses share SynthesisTracker + SynthesizeInputs from harness/input_init/.
+//   Remaining: 3 non-golden overrides kept for fused-graph range conflicts or
+//   specialized stress tests (BN backward activ, BN fwd training activ,
+//   conv backward weights large-values).
+class IntegrationBundleVerificationHarness : public ::testing::Test
 {
 public:
-    explicit IntegrationGraphGoldenReferenceVerificationHarness(bool requiresDevice)
+    explicit IntegrationBundleVerificationHarness(bool requiresDevice)
         : _requiresDevice(requiresDevice)
     {
     }
@@ -279,4 +275,4 @@ class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Tes
                           float rtol);
 };
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp b/dnn-providers/integration-tests/src/harness/bundle/IntegrationTestBundle.hpp
similarity index 99%
rename from dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
rename to dnn-providers/integration-tests/src/harness/bundle/IntegrationTestBundle.hpp
index 84529b4060d6..9bf691ae06e3 100644
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/IntegrationTestBundle.hpp
@@ -21,7 +21,7 @@
 #include <hipdnn_test_sdk/utilities/BundleMetadata.hpp>
 #include <hipdnn_test_sdk/utilities/LoadGraphAndTensors.hpp>
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests::bundle
 {
 
 // Loaded tensors keyed by tensor UID. Holds every tensor declared by the graph —
@@ -325,4 +325,4 @@ inline LoadResult loadIntegrationTestBundle(const std::filesystem::path& jsonPat
     return bundle;
 }
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/golden/UnverifiableBundleReport.hpp b/dnn-providers/integration-tests/src/harness/bundle/UnverifiableBundleReport.hpp
similarity index 97%
rename from dnn-providers/integration-tests/src/harness/golden/UnverifiableBundleReport.hpp
rename to dnn-providers/integration-tests/src/harness/bundle/UnverifiableBundleReport.hpp
index d4f7ed908ad3..5ebe1902ba5b 100644
--- a/dnn-providers/integration-tests/src/harness/golden/UnverifiableBundleReport.hpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/UnverifiableBundleReport.hpp
@@ -9,7 +9,7 @@
 #include <utility>
 #include <vector>
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests::bundle
 {
 
 // Why a bundle could not be verified. The two severities are printed in separate
@@ -122,4 +122,4 @@ class UnverifiableBundleReport
     std::vector<Record> _records;
 };
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp b/dnn-providers/integration-tests/src/harness/input_init/SynthesisTracker.hpp
similarity index 99%
rename from dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
rename to dnn-providers/integration-tests/src/harness/input_init/SynthesisTracker.hpp
index 05890f5ee6e8..3a5ca814438e 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesisTracker.hpp
+++ b/dnn-providers/integration-tests/src/harness/input_init/SynthesisTracker.hpp
@@ -15,7 +15,7 @@
 #include <hipdnn_data_sdk/utilities/Tensor.hpp>
 #include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests
 {
 
 // Pre-allocated input tensors keyed by uid, handed to a fill function to populate.
@@ -227,4 +227,4 @@ class SynthesisTracker
     std::vector<std::string> _refusals;
 };
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests
diff --git a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp b/dnn-providers/integration-tests/src/harness/input_init/SynthesizeInputs.hpp
similarity index 97%
rename from dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
rename to dnn-providers/integration-tests/src/harness/input_init/SynthesizeInputs.hpp
index df5bc348f24d..1094cb5c1806 100644
--- a/dnn-providers/integration-tests/src/harness/golden/input_init/SynthesizeInputs.hpp
+++ b/dnn-providers/integration-tests/src/harness/input_init/SynthesizeInputs.hpp
@@ -3,9 +3,9 @@
 
 #pragma once
 
-#include "harness/golden/input_init/SynthesisTracker.hpp"
+#include "harness/input_init/SynthesisTracker.hpp"
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests
 {
 
 // ── Per-op fill functions ─────────────────────────────────────────────────────
@@ -107,8 +107,9 @@ inline SynthesisResult
         return SynthesisResult::unsupported("not BatchnormInferenceAttributesVarianceExt");
     }
     tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    tracker.fillFree(a->mean_tensor_uid(), -0.1f, 0.1f, rng);
-    tracker.fillFree(a->variance_tensor_uid(), 0.5f, 1.5f, rng);
+    tracker.fillFree(a->mean_tensor_uid(), -1.0f, 1.0f, rng);
+    // Variance must be non-negative
+    tracker.fillFree(a->variance_tensor_uid(), 0.1f, 1.0f, rng);
     tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
     tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
     tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
@@ -128,11 +129,11 @@ inline SynthesisResult
         return SynthesisResult::unsupported("not BatchnormAttributes");
     }
     tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
-    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
-    tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -2.0f, 2.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -2.0f, 2.0f, rng);
     tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
-    tracker.fillFree(a->prev_running_mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
-    tracker.fillFree(a->prev_running_variance_tensor_uid().value_or(0), 0.5f, 1.5f, rng);
+    tracker.fillFree(a->prev_running_mean_tensor_uid().value_or(0), -2.0f, 2.0f, rng);
+    tracker.fillFree(a->prev_running_variance_tensor_uid().value_or(0), -2.0f, 2.0f, rng);
     tracker.fillFree(a->momentum_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
 
     if(a->peer_stats_tensor_uid() != nullptr)
@@ -157,11 +158,11 @@ inline SynthesisResult
     {
         return SynthesisResult::unsupported("not BatchnormBackwardAttributes");
     }
-    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->dy_tensor_uid(), -0.1f, 0.1f, rng);
     tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
     tracker.fillFree(a->mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
-    tracker.fillFree(a->inv_variance_tensor_uid().value_or(0), 0.5f, 1.5f, rng);
-    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->inv_variance_tensor_uid().value_or(0), 1.9f, 2.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -0.1f, 0.1f, rng);
 
     if(a->peer_stats_tensor_uid() != nullptr)
     {
@@ -491,4 +492,4 @@ inline SynthesisResult synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_o
     }
 }
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests
diff --git a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormBackward.cpp b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormBackward.cpp
index fca914087f87..22289c0c4a02 100644
--- a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormBackward.cpp
+++ b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormBackward.cpp
@@ -127,29 +127,6 @@ class BatchnormBackward : public IntegrationGraphVerificationHarness<DataType, B
     }
 
 protected:
-    void initializeBundle([[maybe_unused]] const graph::Graph& graph,
-                          GraphTensorBundle& bundle,
-                          unsigned int seed) override
-    {
-        bundle.sentinelFillOutputTensors();
-
-        bundle.tensors.at(BatchnormBwdTensorIds::X_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BatchnormBwdTensorIds::DY_UID)
-            ->fillTensorWithRandomValues(-0.1f, 0.1f, seed);
-        bundle.tensors.at(BatchnormBwdTensorIds::SCALE_UID)
-            ->fillTensorWithRandomValues(-0.1f, 0.1f, seed);
-
-        if(!CalcStats)
-        {
-            bundle.tensors.at(BatchnormBwdTensorIds::MEAN_UID)
-                ->fillTensorWithRandomValues(-0.1f, 0.1f, seed);
-
-            bundle.tensors.at(BatchnormBwdTensorIds::INV_VARIANCE_UID)
-                ->fillTensorWithRandomValues(1.9f, 2.0f, seed);
-        }
-    }
-
     void runGraphTest() override
     {
         const auto& testCase = this->GetParam();
diff --git a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardInferenceWithVariance.cpp b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardInferenceWithVariance.cpp
index 46738378aadf..353948aabf8d 100644
--- a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardInferenceWithVariance.cpp
+++ b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardInferenceWithVariance.cpp
@@ -117,24 +117,6 @@ class BatchnormForwardInferenceWithVariance
     }
 
 protected:
-    void initializeBundle([[maybe_unused]] const graph::Graph& graph,
-                          GraphTensorBundle& bundle,
-                          unsigned int seed) override
-    {
-        bundle.sentinelFillOutputTensors();
-
-        bundle.tensors.at(BnInfVarTensorIds::X_UID)->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarTensorIds::MEAN_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        // Variance must be non-negative; use positive range
-        bundle.tensors.at(BnInfVarTensorIds::VARIANCE_UID)
-            ->fillTensorWithRandomValues(0.1f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarTensorIds::SCALE_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarTensorIds::BIAS_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-    }
-
     void runGraphTest() override
     {
         const auto& testCase = this->GetParam();
diff --git a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardTraining.cpp b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardTraining.cpp
index 979ad80feb5a..f50cc84e4d88 100644
--- a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardTraining.cpp
+++ b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardTraining.cpp
@@ -208,41 +208,6 @@ class BatchnormForwardTraining
     }
 
 protected:
-    void initializeBundle([[maybe_unused]] const graph::Graph& graph,
-                          GraphTensorBundle& bundle,
-                          unsigned int seed) override
-    {
-        bundle.sentinelFillOutputTensors();
-
-        // Note: Epsilon and momentum are pass-by-value (set via set_value()), not buffers
-
-        // X input: default range
-        bundle.tensors.at(BatchnormFwdTrainingTensorIds::X_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-
-        // Scale and bias: -2.0 to 2.0 to match MIOpen
-        bundle.tensors.at(BatchnormFwdTrainingTensorIds::SCALE_UID)
-            ->fillTensorWithRandomValues(-2.0f, 2.0f, seed + 1);
-        bundle.tensors.at(BatchnormFwdTrainingTensorIds::BIAS_UID)
-            ->fillTensorWithRandomValues(-2.0f, 2.0f, seed + 2);
-
-        // Running mean: only initialize PREV (input), leave NEXT (output) with sentinel
-        if(bundle.tensors.find(BatchnormFwdTrainingTensorIds::PREV_RUNNING_MEAN_UID)
-           != bundle.tensors.end())
-        {
-            bundle.tensors.at(BatchnormFwdTrainingTensorIds::PREV_RUNNING_MEAN_UID)
-                ->fillTensorWithRandomValues(-2.0f, 2.0f, seed + 1000);
-        }
-
-        // Running variance: only initialize PREV (input), leave NEXT (output) with sentinel
-        if(bundle.tensors.find(BatchnormFwdTrainingTensorIds::PREV_RUNNING_VARIANCE_UID)
-           != bundle.tensors.end())
-        {
-            bundle.tensors.at(BatchnormFwdTrainingTensorIds::PREV_RUNNING_VARIANCE_UID)
-                ->fillTensorWithRandomValues(-2.0f, 2.0f, seed + 2000);
-        }
-    }
-
     void runGraphTest() override
     {
         const auto& testCase = this->GetParam();
diff --git a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormFwdInferenceVarianceActiv.cpp b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormFwdInferenceVarianceActiv.cpp
index 3273a926df23..d136d07e90ce 100644
--- a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormFwdInferenceVarianceActiv.cpp
+++ b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormFwdInferenceVarianceActiv.cpp
@@ -149,25 +149,6 @@ class BatchnormFwdInferenceVarianceActiv
     }
 
 protected:
-    void initializeBundle([[maybe_unused]] const graph::Graph& graph,
-                          GraphTensorBundle& bundle,
-                          unsigned int seed) override
-    {
-        bundle.sentinelFillOutputTensors();
-
-        bundle.tensors.at(BnInfVarActivTensorIds::X_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarActivTensorIds::MEAN_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        // Variance must be non-negative; use positive range
-        bundle.tensors.at(BnInfVarActivTensorIds::VARIANCE_UID)
-            ->fillTensorWithRandomValues(0.1f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarActivTensorIds::SCALE_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarActivTensorIds::BIAS_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-    }
-
     void runGraphTest() override
     {
         const auto& testCase = this->GetParam();
diff --git a/dnn-providers/integration-tests/src/main.cpp b/dnn-providers/integration-tests/src/main.cpp
index 9cb995f90a93..a2b787ca18f7 100644
--- a/dnn-providers/integration-tests/src/main.cpp
+++ b/dnn-providers/integration-tests/src/main.cpp
@@ -21,8 +21,8 @@
 #include "harness/SharedHandle.hpp"
 #include "harness/SupportMatrixCollector.hpp"
 #include "harness/TestConfig.hpp"
-#include "harness/golden/BundleRegistration.hpp"
-#include "harness/golden/UnverifiableBundleReport.hpp"
+#include "harness/bundle/BundleRegistration.hpp"
+#include "harness/bundle/UnverifiableBundleReport.hpp"
 
 namespace
 {
@@ -292,13 +292,13 @@ int main(int argc, char** argv) noexcept
             return 1;
         }
 
-        hipdnn_integration_tests::golden::registerBundleTests();
+        hipdnn_integration_tests::bundle::registerBundleTests();
 
         const int result = RUN_ALL_TESTS();
 
         // Print bundles that ended without a verdict (no oracle / reference bug).
         // Informational only — these SKIP, so they do not affect `result`.
-        hipdnn_integration_tests::golden::UnverifiableBundleReport::get().print();
+        hipdnn_integration_tests::bundle::UnverifiableBundleReport::get().print();
 
         // Generate support matrix if requested
         if(hipdnn_integration_tests::SupportMatrixCollector::get().isEnabled())
diff --git a/dnn-providers/integration-tests/tests/CMakeLists.txt b/dnn-providers/integration-tests/tests/CMakeLists.txt
index 705b0bee96e7..9bd2c2970847 100644
--- a/dnn-providers/integration-tests/tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/tests/CMakeLists.txt
@@ -3,7 +3,7 @@
 
 add_executable(hipdnn_integration_tests_unit_tests
     main.cpp
-    ../src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.cpp
+    ../src/harness/bundle/IntegrationBundleVerificationHarness.cpp
     TestArchMatch.cpp
     TestBundleMetadata.cpp
     TestGraphDescription.cpp
@@ -18,7 +18,7 @@ add_executable(hipdnn_integration_tests_unit_tests
     TestReferenceGraphExecutorFactory.cpp
     TestBundleDiscovery.cpp
     TestVerificationPaths.cpp
-    TestGoldenVerificationHarness.cpp
+    TestBundleVerificationHarness.cpp
     TestSynthesisTracker.cpp
     TestSynthesizeInputs.cpp
     TestVerificationModePaths.cpp
diff --git a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
index 6128f4f0ebc2..513dfd4be142 100644
--- a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
+++ b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
@@ -12,10 +12,10 @@
 #include <hipdnn_test_sdk/utilities/FileUtilities.hpp>
 #include <hipdnn_test_sdk/utilities/LoadGraphAndTensors.hpp>
 
-#include "harness/golden/BundleDiscovery.hpp"
-#include "harness/golden/IntegrationTestBundle.hpp"
+#include "harness/bundle/BundleDiscovery.hpp"
+#include "harness/bundle/IntegrationTestBundle.hpp"
 
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests::bundle;
 
 // NOLINTBEGIN(readability-identifier-naming)
 
diff --git a/dnn-providers/integration-tests/tests/TestGoldenVerificationHarness.cpp b/dnn-providers/integration-tests/tests/TestBundleVerificationHarness.cpp
similarity index 94%
rename from dnn-providers/integration-tests/tests/TestGoldenVerificationHarness.cpp
rename to dnn-providers/integration-tests/tests/TestBundleVerificationHarness.cpp
index 95dcfb6ea887..cffe75ed9383 100644
--- a/dnn-providers/integration-tests/tests/TestGoldenVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/tests/TestBundleVerificationHarness.cpp
@@ -1,7 +1,7 @@
 // Copyright © Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 
-// Unit tests for IntegrationGraphGoldenReferenceVerificationHarness's core
+// Unit tests for IntegrationBundleVerificationHarness's core
 // contract: how it translates an executor's behaviour into a GTest outcome.
 //
 //   executor throws (unsupported graph) -> SKIP
@@ -27,12 +27,12 @@
 
 #include <hipdnn_test_sdk/utilities/FileUtilities.hpp>
 
-#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
-#include "harness/golden/IntegrationTestBundle.hpp"
+#include "harness/bundle/IntegrationBundleVerificationHarness.hpp"
+#include "harness/bundle/IntegrationTestBundle.hpp"
 
 // NOLINTBEGIN(readability-identifier-naming)
 
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests::bundle;
 
 namespace
 {
@@ -40,19 +40,19 @@ namespace
 // Exposes the harness's protected SetUp/TestBody so a test can drive the full
 // lifecycle directly, and overrides executeGraphThroughEngine with a stub so the
 // tests run on CPU-only CI without a real GPU engine.
-class TestableHarness : public IntegrationGraphGoldenReferenceVerificationHarness
+class TestableHarness : public IntegrationBundleVerificationHarness
 {
 public:
     using StubFunc = std::function<void(std::unordered_map<int64_t, void*>&)>;
 
     explicit TestableHarness(StubFunc stub)
-        : IntegrationGraphGoldenReferenceVerificationHarness(/*requiresDevice=*/false)
+        : IntegrationBundleVerificationHarness(/*requiresDevice=*/false)
         , _stub(std::move(stub))
     {
     }
 
-    using IntegrationGraphGoldenReferenceVerificationHarness::SetUp;
-    using IntegrationGraphGoldenReferenceVerificationHarness::TestBody;
+    using IntegrationBundleVerificationHarness::SetUp;
+    using IntegrationBundleVerificationHarness::TestBody;
 
 protected:
     void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack) override
diff --git a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
index 9f016866b2c8..ac6e3e930d7c 100644
--- a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
+++ b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
@@ -10,11 +10,11 @@
 
 #include <hipdnn_data_sdk/utilities/Tensor.hpp>
 
-#include "harness/golden/input_init/SynthesisTracker.hpp"
+#include "harness/input_init/SynthesisTracker.hpp"
 
 // NOLINTBEGIN(readability-identifier-naming)
 
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests;
 
 namespace
 {
diff --git a/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp b/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
index feba60a0769c..c9de5c4c1405 100644
--- a/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
+++ b/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
@@ -12,12 +12,12 @@
 #include <hipdnn_data_sdk/utilities/Tensor.hpp>
 #include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
 
-#include "harness/golden/input_init/SynthesizeInputs.hpp"
+#include "harness/input_init/SynthesizeInputs.hpp"
 
 // NOLINTBEGIN(readability-identifier-naming)
 
 using namespace hipdnn_flatbuffers_sdk::data_objects;
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests;
 
 namespace
 {
diff --git a/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp b/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
index b2e6becab76a..be39945b972d 100644
--- a/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
+++ b/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
@@ -25,13 +25,13 @@
 
 #include "harness/ReferenceCapabilityError.hpp"
 #include "harness/TestConfig.hpp"
-#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
-#include "harness/golden/IntegrationTestBundle.hpp"
+#include "harness/bundle/IntegrationBundleVerificationHarness.hpp"
+#include "harness/bundle/IntegrationTestBundle.hpp"
 
 // NOLINTBEGIN(readability-identifier-naming)
 
 using namespace hipdnn_integration_tests;
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests::bundle;
 
 namespace
 {
@@ -39,19 +39,19 @@ namespace
 using EngineStub = std::function<void(std::unordered_map<int64_t, void*>&)>;
 using RefStub = std::function<void(ReferenceExecutorType, std::unordered_map<int64_t, void*>&)>;
 
-class ModeTestableHarness : public IntegrationGraphGoldenReferenceVerificationHarness
+class ModeTestableHarness : public IntegrationBundleVerificationHarness
 {
 public:
     ModeTestableHarness(VerificationMode mode, EngineStub engineStub, RefStub refStub)
-        : IntegrationGraphGoldenReferenceVerificationHarness(/*requiresDevice=*/false)
+        : IntegrationBundleVerificationHarness(/*requiresDevice=*/false)
         , _mode(mode)
         , _engineStub(std::move(engineStub))
         , _refStub(std::move(refStub))
     {
     }
 
-    using IntegrationGraphGoldenReferenceVerificationHarness::SetUp;
-    using IntegrationGraphGoldenReferenceVerificationHarness::TestBody;
+    using IntegrationBundleVerificationHarness::SetUp;
+    using IntegrationBundleVerificationHarness::TestBody;
 
 protected:
     VerificationMode getVerificationMode() const override
diff --git a/dnn-providers/integration-tests/tests/TestVerificationPaths.cpp b/dnn-providers/integration-tests/tests/TestVerificationPaths.cpp
index 36a54fcbb5ec..b2d7ecb5473d 100644
--- a/dnn-providers/integration-tests/tests/TestVerificationPaths.cpp
+++ b/dnn-providers/integration-tests/tests/TestVerificationPaths.cpp
@@ -12,12 +12,12 @@
 #include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
 #include <hipdnn_test_sdk/utilities/cpu_graph_executor/CpuReferenceGraphExecutor.hpp>
 
-#include "harness/golden/BundleDiscovery.hpp"
+#include "harness/bundle/BundleDiscovery.hpp"
 #include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
 
 // NOLINTBEGIN(readability-identifier-naming)
 
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests::bundle;
 
 namespace
 {

From d16d91f718281e6d877f4470513ffdbd5bf2d022 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 23 Jun 2026 23:11:41 -0400
Subject: [PATCH 18/18] Unify tolerance resolution into a shared policy-based
 resolver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both harnesses resolved comparison tolerance with duplicated, divergent
code. Replace both with one shared resolver
(harness/tolerance/ToleranceResolver.hpp) where the aggregation policy is a
plain function (GraphWrapper, dtype) -> float, selectable per caller.

What changed:
- New resolveTolerance(wrapper, dtype, testName, atol, rtol, policy=max):
  derives a default via the chosen aggregation policy, then applies the TOML
  per-test override (highest priority) in ONE place. Single tolerance entry
  point for both harnesses; the override is no longer applied separately in
  either, so the layering order lives here alone.
- Two policies, each a free function (no enum/switch; the policy IS the
  function, C++17 function pointer):
    * maxAcrossNodes  — loosest per-node tolerance; conservative envelope,
      never tighter than any node so it cannot cause a false failure.
    * outputOpTolerance — tolerance of the last non-Pointwise op (the
      output-producing op); reproduces the graph harness's historical
      getTolerance() behavior.
- Wiring preserves existing behavior on both sides:
    * Graph harness getTolerance() -> outputOpTolerance (its prior policy),
      now expressed through the shared resolver instead of a private
      dynamic_cast switch. It also serializes via to_binary() and reads the
      output dtype from the flatbuffer, so it shares the resolver keyed on the
      same representation the bundle harness uses.
    * Bundle harness -> maxAcrossNodes (its prior policy), unchanged.
  The two policies agree for the common one-real-op + activation case
  (activation is Pointwise -> skipped; the single real op is both loosest and
  last), so they differ only on multi-real-op fusions, which carry explicit
  overrides today.
- Deletes the duplicate per-op tolerance switch that existed in each harness
  (one on dynamic_cast of live nodes, one on the flatbuffer NodeAttributes
  enum); both now share one flatbuffer-keyed lookup.
- warnIfMultipleOutputs() guards the single-output assumption: every current
  policy reduces over the whole graph, not the per-output subgraph, so
  multi-output graphs are flagged loudly (deferred fix, ALMIOPEN-2216).

Why output-op for the graph harness (not max): a mechanical migration of the
C++ graph tests should not change their tolerances. outputOpTolerance keeps the
exact prior numbers. Note this is a heuristic, not a principled tight bound
(it attributes the whole output tolerance to one op); it is kept for migration
parity. The principled tighten path is the future DynamicTolerances upgrade
(see TODO in ToleranceResolver.hpp and ALMIOPEN-2216); FP4 also needs sub-bf16
entries in the dtype switch (currently falls through to 1e-3).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../IntegrationGraphVerificationHarness.hpp   | 134 ++++------
 .../IntegrationBundleVerificationHarness.cpp  | 100 +------
 .../IntegrationBundleVerificationHarness.hpp  |  27 +-
 .../harness/tolerance/ToleranceResolver.hpp   | 250 ++++++++++++++++++
 4 files changed, 314 insertions(+), 197 deletions(-)
 create mode 100644 dnn-providers/integration-tests/src/harness/tolerance/ToleranceResolver.hpp

diff --git a/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
index 4187b3022a38..ff06e24d1fb5 100644
--- a/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
@@ -11,8 +11,6 @@
 #include <hipdnn_frontend/Graph.hpp>
 #include <hipdnn_frontend/Utilities.hpp>
 #include <hipdnn_frontend/attributes/TensorAttributes.hpp>
-#include <hipdnn_frontend/node/RMSNormNode.hpp>
-#include <hipdnn_frontend/node/ReductionNode.hpp>
 #include <hipdnn_plugin_sdk/PluginLogging.hpp>
 #include <hipdnn_test_sdk/utilities/CpuFpReferenceMiopenRmsValidation.hpp>
 #include <hipdnn_test_sdk/utilities/CpuFpReferenceValidation.hpp>
@@ -31,6 +29,7 @@
 #include "harness/TestConfig.hpp"
 #include "harness/TomlGuards.hpp"
 #include "harness/input_init/SynthesizeInputs.hpp"
+#include "harness/tolerance/ToleranceResolver.hpp"
 
 namespace hipdnn_integration_tests
 {
@@ -72,41 +71,64 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
 
     virtual void runGraphTest() = 0;
 
-    // Determine tolerance for an output tensor based on the graph and
-    // configured tolerance mode for the engine.
+    // Determine the FINAL tolerance for an output tensor: an aggregation-policy
+    // default plus the TOML per-test override, both via
+    // harness/tolerance/ToleranceResolver.hpp. The resolver is keyed on the
+    // serialized flatbuffer graph: we serialize with to_binary() — the same
+    // pattern initializeBundle() already uses — and read the output tensor's dtype
+    // from the flatbuffer.
+    //
+    // Policy = outputOpTolerance (the last non-Pointwise op), which reproduces
+    // this harness's historical getTolerance() behavior so the C++ graph tests
+    // keep their exact tolerances. (The bundle harness uses the maxAcrossNodes
+    // default; the two agree for the common one-real-op + activation case.) The
+    // returned value is already overridden, so registerValidator stores it as-is.
     float getTolerance(const hipdnn_frontend::graph::Graph& graph,
                        const std::shared_ptr<hipdnn_frontend::graph::TensorAttributes>& output)
     {
         ToleranceMode mode = TestConfig::get().getToleranceMode();
+        if(mode != ToleranceMode::DEFAULT)
+        {
+            ADD_FAILURE() << "getTolerance: unhandled tolerance mode";
+            return 0.0f;
+        }
 
-        if(mode == ToleranceMode::DEFAULT)
+        auto [serialized, serErr] = graph.to_binary();
+        if(serErr.code != hipdnn_frontend::ErrorCode::OK || serialized.empty())
         {
-            // We determine the tolerance based on the last non-PointwiseNode
-            // (the root op). This will be gradually updated to use dynamic
-            // calculation as possible; eventually, the tolerance will be
-            // entirely dynamically determined in the default case.
-            //
-            // NOTE: after validate(), the graph's sub-nodes are in topological order.
-            const hipdnn_frontend::graph::INode* rootOp = nullptr;
-            graph.visit([&](const hipdnn_frontend::graph::INode& node) {
-                if(dynamic_cast<const hipdnn_frontend::graph::PointwiseNode*>(&node) == nullptr
-                   && dynamic_cast<const hipdnn_frontend::graph::Graph*>(&node) == nullptr)
-                {
-                    rootOp = &node;
-                }
-            });
+            ADD_FAILURE() << "getTolerance: graph serialization failed";
+            return 0.0f;
+        }
 
-            if(rootOp == nullptr)
-            {
-                ADD_FAILURE() << "getTolerance: no root op found in graph";
-                return 0.0f;
-            }
+        const auto wrapper
+            = hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper::fromSerializedBlob(
+                serialized.data(), serialized.size());
+        if(!wrapper.isValid())
+        {
+            ADD_FAILURE() << "getTolerance: serialized graph failed verification";
+            return 0.0f;
+        }
 
-            return toleranceForNode(*rootOp, output->get_data_type());
+        const auto& tensorMap = wrapper.getTensorMap();
+        const auto it = tensorMap.find(output->get_uid());
+        if(it == tensorMap.end())
+        {
+            ADD_FAILURE() << "getTolerance: output tensor uid " << output->get_uid()
+                          << " not found in serialized graph";
+            return 0.0f;
         }
 
-        ADD_FAILURE() << "getTolerance: unhandled tolerance mode";
-        return 0.0f;
+        float atol = 0.0f;
+        float rtol = 0.0f;
+        tolerance::resolveTolerance(wrapper,
+                                    it->second->data_type(),
+                                    currentTestName(),
+                                    atol,
+                                    rtol,
+                                    tolerance::outputOpTolerance);
+        // getTolerance's single-float contract predates split atol/rtol; under the
+        // current resolver the two are equal (same default, same override).
+        return atol;
     }
 
     void verifyGraph(hipdnn_frontend::graph::Graph& graph, unsigned int seed)
@@ -192,6 +214,9 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
             << "At least one output tensor id must be specified for "
                "validation.";
 
+        tolerance::warnIfMultipleOutputs(gpuBundle.outputTensorIds.size(),
+                                         "IntegrationGraphVerificationHarness");
+
         HIPDNN_PLUGIN_LOG_INFO("Validating " << gpuBundle.outputTensorIds.size()
                                              << " output tensors");
 
@@ -244,9 +269,10 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
                            float absoluteTolerance,
                            float relativeTolerance)
     {
-        float finalAtol = absoluteTolerance;
-        float finalRtol = relativeTolerance;
-        applyTomlToleranceOverride(currentTestName(), finalAtol, finalRtol);
+        // Tolerances arrive already resolved (default + TOML override) from
+        // getTolerance via ToleranceResolver; no override is applied here.
+        const float finalAtol = absoluteTolerance;
+        const float finalRtol = relativeTolerance;
 
         // Since the graph can infer properties + Ids, we defer validator registration until right
         // before validation in verifyGraph
@@ -385,54 +411,6 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
         }
     }
 
-    static float toleranceForNode(const hipdnn_frontend::graph::INode& node,
-                                  hipdnn_frontend::DataType dataType)
-    {
-        switch(dataType)
-        {
-        case hipdnn_frontend::DataType::FLOAT:
-            return toleranceForNodeTyped<float>(node);
-        case hipdnn_frontend::DataType::HALF:
-            return toleranceForNodeTyped<half>(node);
-        case hipdnn_frontend::DataType::BFLOAT16:
-            return toleranceForNodeTyped<bfloat16>(node);
-        default:
-            ADD_FAILURE() << "toleranceForNode: unsupported data type";
-            return 0.0f;
-        }
-    }
-
-    template <typename T>
-    static float toleranceForNodeTyped(const hipdnn_frontend::graph::INode& node)
-    {
-        namespace fe = hipdnn_frontend::graph;
-        using namespace hipdnn_test_sdk::utilities;
-
-        if(dynamic_cast<const fe::ConvolutionFpropNode*>(&node) != nullptr)
-            return static_cast<float>(conv::getToleranceFwd<T>());
-        if(dynamic_cast<const fe::ConvolutionDgradNode*>(&node) != nullptr)
-            return static_cast<float>(conv::getToleranceBwd<T>());
-        if(dynamic_cast<const fe::ConvolutionWgradNode*>(&node) != nullptr)
-            return static_cast<float>(conv::getToleranceWrw<T>());
-        if(dynamic_cast<const fe::BatchnormInferenceNodeVarianceExt*>(&node) != nullptr)
-            return static_cast<float>(batchnorm::getToleranceInferenceWithVariance<T>());
-        if(dynamic_cast<const fe::BatchnormInferenceNode*>(&node) != nullptr)
-            return static_cast<float>(batchnorm::getToleranceInference<T>());
-        if(dynamic_cast<const fe::BatchnormNode*>(&node) != nullptr)
-            return static_cast<float>(batchnorm::getToleranceTraining<T>());
-        if(dynamic_cast<const fe::BatchnormBackwardNode*>(&node) != nullptr)
-            return static_cast<float>(batchnorm::getToleranceBackward<T>());
-        if(dynamic_cast<const fe::MatmulNode*>(&node) != nullptr)
-            return static_cast<float>(matmul::getTolerance<T>());
-        if(dynamic_cast<const fe::ReductionNode*>(&node) != nullptr)
-            return static_cast<float>(reduction::getTolerance<T>());
-        if(dynamic_cast<const fe::RMSNormNode*>(&node) != nullptr)
-            return static_cast<float>(rmsnorm::getTolerance<T>());
-
-        ADD_FAILURE() << "toleranceForNodeTyped: unsupported node type";
-        return 0.0f;
-    }
-
     void executeGpuGraph(hipdnnHandle_t handle,
                          hipdnn_frontend::graph::Graph& graph,
                          hipdnn_test_sdk::utilities::GraphTensorBundle& bundle)
diff --git a/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp
index 7e1faadda6d0..c5ebb2507f31 100644
--- a/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp
@@ -27,6 +27,7 @@
 #include "harness/bundle/UnverifiableBundleReport.hpp"
 #include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
 #include "harness/input_init/SynthesizeInputs.hpp"
+#include "harness/tolerance/ToleranceResolver.hpp"
 
 namespace hipdnn_integration_tests::bundle
 {
@@ -489,6 +490,9 @@ void IntegrationBundleVerificationHarness::compareEach(OutputTensors& engineOutp
     auto wrapper = _bundle->graphWrapper();
     const auto& tensorAttrMap = wrapper.getTensorMap();
 
+    tolerance::warnIfMultipleOutputs(_bundle->outputTensorUids.size(),
+                                     "IntegrationBundleVerificationHarness");
+
     for(const int64_t uid : _bundle->outputTensorUids)
     {
         auto& actualTensor = *engineOutputs.at(uid);
@@ -499,9 +503,7 @@ void IntegrationBundleVerificationHarness::compareEach(OutputTensors& engineOutp
 
         float atol = 0.0f;
         float rtol = 0.0f;
-        resolveTolerances(wrapper, dataType, atol, rtol);
-
-        applyTomlToleranceOverride(currentTestName(), atol, rtol);
+        tolerance::resolveTolerance(wrapper, dataType, currentTestName(), atol, rtol);
 
         compareOutputTensor(uid, *attrs, dataType, expectedTensor, actualTensor, atol, rtol);
     }
@@ -629,98 +631,6 @@ std::string IntegrationBundleVerificationHarness::dataTypeName(
     return hipdnn_flatbuffers_sdk::data_objects::EnumNameDataType(dataType);
 }
 
-void IntegrationBundleVerificationHarness::resolveTolerances(
-    const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-    hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
-    float& atol,
-    float& rtol)
-{
-    const float defaultTolerance = deriveDefaultTolerance(wrapper, dataType);
-    atol = defaultTolerance;
-    rtol = defaultTolerance;
-}
-
-template <typename T>
-float IntegrationBundleVerificationHarness::toleranceForNodeAttributes(
-    hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType)
-{
-    using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
-    namespace tol = hipdnn_test_sdk::utilities;
-
-    switch(attrType)
-    {
-    case NA::ConvolutionFwdAttributes:
-        return tol::conv::getToleranceFwd<T>();
-    case NA::ConvolutionBwdAttributes:
-        return tol::conv::getToleranceBwd<T>();
-    case NA::ConvolutionWrwAttributes:
-        return tol::conv::getToleranceWrw<T>();
-    case NA::BatchnormInferenceAttributes:
-        return tol::batchnorm::getToleranceInference<T>();
-    case NA::BatchnormInferenceAttributesVarianceExt:
-        return tol::batchnorm::getToleranceInferenceWithVariance<T>();
-    case NA::BatchnormAttributes:
-        return tol::batchnorm::getToleranceTraining<T>();
-    case NA::BatchnormBackwardAttributes:
-        return tol::batchnorm::getToleranceBackward<T>();
-    case NA::MatmulAttributes:
-        return tol::matmul::getTolerance<T>();
-    case NA::ReductionAttributes:
-        return tol::reduction::getTolerance<T>();
-    case NA::RMSNormAttributes:
-        return tol::rmsnorm::getTolerance<T>();
-    case NA::PointwiseAttributes:
-        return tol::pointwise::getTolerance<T>();
-    case NA::LayernormAttributes:
-        return tol::layernorm::getTolerance<T>();
-    case NA::SdpaAttributes:
-    case NA::SdpaBackwardAttributes:
-        return tol::sdpa::getToleranceFwd<T>();
-    default:
-        return 1e-3f;
-    }
-}
-
-float IntegrationBundleVerificationHarness::deriveDefaultTolerance(
-    const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-    hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
-{
-    const auto nodeCount = wrapper.nodeCount();
-
-    bool found = false;
-    float maxTolerance = 0.0f;
-    for(uint32_t i = 0; i < nodeCount; ++i)
-    {
-        const auto attrType = wrapper.getNode(i).attributes_type();
-        const float nodeTolerance = toleranceForDataType(attrType, dataType);
-        maxTolerance = found ? std::max(maxTolerance, nodeTolerance) : nodeTolerance;
-        found = true;
-    }
-
-    return found ? maxTolerance : 1e-3f;
-}
-
-float IntegrationBundleVerificationHarness::toleranceForDataType(
-    hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
-    hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
-{
-    using DT = hipdnn_flatbuffers_sdk::data_objects::DataType;
-    using hipdnn_data_sdk::types::bfloat16;
-    using hipdnn_data_sdk::types::half;
-
-    switch(dataType)
-    {
-    case DT::FLOAT:
-        return toleranceForNodeAttributes<float>(attrType);
-    case DT::HALF:
-        return toleranceForNodeAttributes<half>(attrType);
-    case DT::BFLOAT16:
-        return toleranceForNodeAttributes<bfloat16>(attrType);
-    default:
-        return 1e-3f;
-    }
-}
-
 void IntegrationBundleVerificationHarness::applyMetadataGuards() const
 {
     // VRAM is an execution-feasibility guard: the engine allocates the same
diff --git a/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp
index b097de7a4ad7..65a28997d2b0 100644
--- a/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp
@@ -197,30 +197,9 @@ class IntegrationBundleVerificationHarness : public ::testing::Test
     static void markOutputsModifiedFor(OutputTensors& outputs, bool device);
 
     // ── tolerances ──────────────────────────────────────────────────────
-    // Two-level lookup: per-operation default from TestTolerances.hpp,
-    // then TOML per-engine override (if a [[tolerance_overrides]] filter
-    // matches the current gtest name).
-    //
-    //   resolveTolerances        entry point — sets atol/rtol for one output tensor
-    //     deriveDefaultTolerance  max tolerance across all graph nodes (priority 2)
-    //       toleranceForDataType    dispatches on DataType → typed template
-    //         toleranceForNodeAttributes<T>  maps NodeAttributes → TestTolerances.hpp
-    static void
-        resolveTolerances(const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-                          hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
-                          float& atol,
-                          float& rtol);
-
-    static float deriveDefaultTolerance(
-        const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-        hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
-
-    static float toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
-                                      hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
-
-    template <typename T>
-    static float
-        toleranceForNodeAttributes(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType);
+    // Default tolerance derivation (max-across-nodes, per-op/per-dtype lookup)
+    // is shared with the graph harness via harness/tolerance/ToleranceResolver.hpp;
+    // the TOML per-test override is layered on top in compareEach.
 
     // ── comparison ──────────────────────────────────────────────────────
     void compareAgainstGolden(OutputTensors& engineOutputs);
diff --git a/dnn-providers/integration-tests/src/harness/tolerance/ToleranceResolver.hpp b/dnn-providers/integration-tests/src/harness/tolerance/ToleranceResolver.hpp
new file mode 100644
index 000000000000..2d28d835c5f5
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/tolerance/ToleranceResolver.hpp
@@ -0,0 +1,250 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include <hipdnn_data_sdk/types.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
+#include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
+#include <hipdnn_plugin_sdk/PluginLogging.hpp>
+#include <hipdnn_test_sdk/utilities/TestTolerances.hpp>
+
+#include "harness/TomlGuards.hpp"
+
+// Shared default-tolerance resolution for both verification harnesses
+// (ALMIOPEN-2216). Both the programmatic graph harness and the bundle harness
+// reduce to the same question — "given a serialized graph and an output dtype,
+// what default atol/rtol should the comparison use?" — so the policy lives here,
+// keyed on the flatbuffer GraphWrapper, which is the common representation: the
+// bundle harness already holds one, and the graph harness obtains it via
+// Graph::to_binary().
+//
+// This header owns POLICY only; the per-operation / per-dtype tolerance NUMBERS
+// stay in hipdnn_test_sdk TestTolerances.hpp and are read, never modified.
+//
+// TODO(dynamic tolerance): the per-op tolerance source here is the FIXED table
+// (TestTolerances.hpp). The codebase already ships a dynamic, shape/dtype-aware
+// model — hipdnn_test_sdk DynamicTolerances.hpp + per-op DynamicTolerances{Matmul,
+// Conv,BatchNorm,LayerNorm,RMSNorm,Sdpa,Pointwise}.hpp and
+// pointwise/PointwiseErrorClassification.hpp — already wired into other test
+// fixtures (conv gpu-ref, sdpa backward, cpu-executor plan tests). RFC 0011
+// §"Tolerance Framework" / "Future Work #1" defines the upgrade: replace the
+// fixed level-3 default with DynamicTolerances, keyed on graph properties
+// (op, dtype, tensor dims), without changing the three-level chain or this
+// aggregation policy. When promoting, add a sibling aggregation function that
+// routes through the existing DynamicTolerances functions instead of
+// TestTolerances.hpp, and pass it to resolveTolerance; also add
+// sub-bf16 dtypes (FP4) which the current DataType switch lacks (falls through to
+// 1e-3). See ALMIOPEN-2216.
+//
+// Two policy decisions are encoded here, each kept independently evolvable:
+//
+//   * Aggregation = max-across-nodes. The output tolerance is the loosest
+//     per-node tolerance in the graph. This is the conservative envelope: it can
+//     be too loose on a long fused chain but is never too tight, so it never
+//     manufactures a false failure. Root-op-only selection (the graph harness's
+//     prior heuristic) is unsafe — an upstream high-K / low-precision node
+//     dominates the error, so picking the "root" can under-tolerance and fail a
+//     correct kernel. A principled alternative (analytic error propagation along
+//     the producer chain) is the documented future upgrade; it needs per-op
+//     condition-number models and is deferred.
+//
+//   * dtype key = the OUTPUT tensor's dtype (passed in by the caller). Truly
+//     per-node dtype keying — each node keyed on its own output-edge dtype — only
+//     differs from this in mixed-I/O fused graphs, and recovering a node's
+//     output dtype needs a per-op tensor-UID extractor (the flatbuffer Node
+//     carries only compute_data_type, not its I/O tensors). That extractor is
+//     the same machinery the per-output subgraph walk needs, so per-node dtype is
+//     deferred together with multi-output support (ALMIOPEN-2216).
+//
+// resolveTolerance() is the single entry point for both harnesses: it derives
+// the max-across-nodes default and then applies the TOML per-test override (the
+// highest-priority layer) in one place, so neither harness applies the override
+// separately and the layering order lives here alone.
+
+namespace hipdnn_integration_tests::tolerance
+{
+
+namespace fb = hipdnn_flatbuffers_sdk::flatbuffer_utilities;
+namespace data = hipdnn_flatbuffers_sdk::data_objects;
+
+// Per-op tolerance for one node attribute type, at a fixed element type T.
+// Maps a flatbuffer NodeAttributes tag onto the corresponding TestTolerances.hpp
+// entry. Unknown ops fall back to a conservative 1e-3.
+template <typename T>
+inline float toleranceForNodeAttributes(data::NodeAttributes attrType)
+{
+    using NA = data::NodeAttributes;
+    namespace tol = hipdnn_test_sdk::utilities;
+
+    switch(attrType)
+    {
+    case NA::ConvolutionFwdAttributes:
+        return tol::conv::getToleranceFwd<T>();
+    case NA::ConvolutionBwdAttributes:
+        return tol::conv::getToleranceBwd<T>();
+    case NA::ConvolutionWrwAttributes:
+        return tol::conv::getToleranceWrw<T>();
+    case NA::BatchnormInferenceAttributes:
+        return tol::batchnorm::getToleranceInference<T>();
+    case NA::BatchnormInferenceAttributesVarianceExt:
+        return tol::batchnorm::getToleranceInferenceWithVariance<T>();
+    case NA::BatchnormAttributes:
+        return tol::batchnorm::getToleranceTraining<T>();
+    case NA::BatchnormBackwardAttributes:
+        return tol::batchnorm::getToleranceBackward<T>();
+    case NA::MatmulAttributes:
+        return tol::matmul::getTolerance<T>();
+    case NA::ReductionAttributes:
+        return tol::reduction::getTolerance<T>();
+    case NA::RMSNormAttributes:
+        return tol::rmsnorm::getTolerance<T>();
+    case NA::PointwiseAttributes:
+        return tol::pointwise::getTolerance<T>();
+    case NA::LayernormAttributes:
+        return tol::layernorm::getTolerance<T>();
+    case NA::SdpaAttributes:
+    case NA::SdpaBackwardAttributes:
+        return tol::sdpa::getToleranceFwd<T>();
+    default:
+        return 1e-3f;
+    }
+}
+
+// Dispatch the element-type template on a runtime DataType.
+inline float toleranceForNode(data::NodeAttributes attrType, data::DataType dataType)
+{
+    using DT = data::DataType;
+    using hipdnn_data_sdk::types::bfloat16;
+    using hipdnn_data_sdk::types::half;
+
+    switch(dataType)
+    {
+    case DT::FLOAT:
+        return toleranceForNodeAttributes<float>(attrType);
+    case DT::HALF:
+        return toleranceForNodeAttributes<half>(attrType);
+    case DT::BFLOAT16:
+        return toleranceForNodeAttributes<bfloat16>(attrType);
+    default:
+        return 1e-3f;
+    }
+}
+
+// An aggregation policy reduces the per-node tolerances of a graph to one
+// default tolerance for an output. It is just a function (graph, dtype) -> float;
+// new policies are added as new functions, and resolveTolerance() takes the
+// chosen one as a parameter. No enum/switch — the policy IS the function.
+using AggregationPolicy = float (*)(const fb::GraphWrapper&, data::DataType);
+
+// Conservative policy (the default): max-across-nodes — the loosest per-node
+// tolerance in the graph. Never tighter than any single node, so it cannot
+// manufacture a false failure; for a fused output (which genuinely accumulates
+// error from every op on its chain) the loosest contributing op is the correct
+// floor. Returns 1e-3 for a graph with no nodes.
+inline float maxAcrossNodes(const fb::GraphWrapper& wrapper, data::DataType dataType)
+{
+    const auto nodeCount = wrapper.nodeCount();
+
+    bool found = false;
+    float maxTolerance = 0.0f;
+    for(uint32_t i = 0; i < nodeCount; ++i)
+    {
+        const auto attrType = wrapper.getNode(i).attributes_type();
+        const float nodeTolerance = toleranceForNode(attrType, dataType);
+        maxTolerance = found ? std::max(maxTolerance, nodeTolerance) : nodeTolerance;
+        found = true;
+    }
+
+    return found ? maxTolerance : 1e-3f;
+}
+
+// Output-op policy: the tolerance of the last non-Pointwise node in topological
+// order — i.e. the op that produces the graph's output. This reproduces the
+// graph harness's historical getTolerance() behavior so the C++ graph tests keep
+// their exact tolerances as they migrate. It is tighter than maxAcrossNodes only
+// on fused chains whose loosest op is NOT the output op; for the common case
+// (one real op + activation) the two policies are identical, since the activation
+// is Pointwise (skipped) and the single real op is both loosest and last.
+//
+// NOTE: this is a heuristic, not a principled tight bound — it attributes the
+// whole output's tolerance to one op and ignores upstream error accumulation.
+// Kept only for migration parity; max remains the default everywhere else, and
+// the principled tighten path is the future DynamicTolerances upgrade. Falls back
+// to maxAcrossNodes if every node is Pointwise (no clear producing op).
+inline float outputOpTolerance(const fb::GraphWrapper& wrapper, data::DataType dataType)
+{
+    const auto nodeCount = wrapper.nodeCount();
+
+    bool foundRoot = false;
+    data::NodeAttributes rootAttr = data::NodeAttributes::NONE;
+    for(uint32_t i = 0; i < nodeCount; ++i)
+    {
+        const auto attrType = wrapper.getNode(i).attributes_type();
+        if(attrType != data::NodeAttributes::PointwiseAttributes)
+        {
+            rootAttr = attrType; // last non-Pointwise wins (topological order)
+            foundRoot = true;
+        }
+    }
+
+    if(!foundRoot)
+    {
+        return maxAcrossNodes(wrapper, dataType);
+    }
+    return toleranceForNode(rootAttr, dataType);
+}
+
+// Future policies live here as sibling functions, e.g.:
+//   float propagatedBound(wrapper, dtype);    // analytic error propagation
+//   float dynamic(wrapper, dtype);            // wired to DynamicTolerances.hpp
+// Each is added without touching resolveTolerance or any caller — pass it in.
+
+// Warn (once per call site) when a graph has more than one output tensor.
+//
+// Every current aggregation policy reduces over the WHOLE graph, not the subgraph
+// that produces a given output: maxAcrossNodes takes the loosest of all nodes,
+// outputOpTolerance takes the single last non-Pointwise node. For a multi-output
+// graph neither is scoped to the output being toleranced, so a tolerance may be
+// attributed from an unrelated branch. The precise fix (per-output subgraph
+// scoping) is deferred together with per-node dtype keying (ALMIOPEN-2216),
+// because both need a per-op tensor-UID extractor. Until then we surface the
+// imprecision loudly rather than letting it pass silently.
+inline void warnIfMultipleOutputs(std::size_t outputCount, const char* context)
+{
+    if(outputCount > 1)
+    {
+        HIPDNN_PLUGIN_LOG_WARN(context
+                               << ": graph has " << outputCount
+                               << " output tensors; tolerance is reduced over the whole graph, not "
+                                  "the per-output subgraph (deferred, ALMIOPEN-2216)");
+    }
+}
+
+// Resolve the FINAL absolute/relative tolerance for an output tensor of the
+// given dtype: the chosen aggregation policy's default (max-across-nodes unless
+// overridden), then the TOML per-test override (highest priority) applied on top.
+// This is the single tolerance entry point for both harnesses — neither applies
+// the override separately, so the layering order (default -> override) lives in
+// exactly one place. The aggregation policy is a parameter (default
+// maxAcrossNodes) so a caller can select a different policy without any change
+// here.
+inline void resolveTolerance(const fb::GraphWrapper& wrapper,
+                             data::DataType dataType,
+                             const std::string& testName,
+                             float& atol,
+                             float& rtol,
+                             AggregationPolicy aggregate = maxAcrossNodes)
+{
+    const float defaultTolerance = aggregate(wrapper, dataType);
+    atol = defaultTolerance;
+    rtol = defaultTolerance;
+    applyTomlToleranceOverride(testName, atol, rtol);
+}
+
+} // namespace hipdnn_integration_tests::tolerance