diff --git a/dnn-providers/integration-tests/CMakeLists.txt b/dnn-providers/integration-tests/CMakeLists.txt
index c47dcbdd529f..6daf3bc367c6 100644
--- a/dnn-providers/integration-tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/CMakeLists.txt
@@ -128,6 +128,7 @@ set(INTEGRATION_TESTS_EXE hipdnn_integration_tests)
 
 add_executable(${INTEGRATION_TESTS_EXE}
     src/main.cpp
+    src/harness/bundle/IntegrationBundleVerificationHarness.cpp
 )
 
 add_subdirectory(src/integration_tests)
diff --git a/dnn-providers/integration-tests/src/harness/CpuReferenceGraphExecutorAdapter.hpp b/dnn-providers/integration-tests/src/harness/CpuReferenceGraphExecutorAdapter.hpp
index ff80def310de..3c16ff9f04c0 100644
--- a/dnn-providers/integration-tests/src/harness/CpuReferenceGraphExecutorAdapter.hpp
+++ b/dnn-providers/integration-tests/src/harness/CpuReferenceGraphExecutorAdapter.hpp
@@ -3,9 +3,13 @@
 
 #pragma once
 
+#include <stdexcept>
+#include <string>
+
 #include <hipdnn_test_sdk/utilities/cpu_graph_executor/CpuReferenceGraphExecutor.hpp>
 
 #include "IReferenceGraphExecutor.hpp"
+#include "ReferenceCapabilityError.hpp"
 
 namespace hipdnn_integration_tests
 {
@@ -17,7 +21,25 @@ class CpuReferenceGraphExecutorAdapter : public IReferenceGraphExecutor
                  size_t size,
                  const std::unordered_map<int64_t, void*>& variantPack) override
     {
-        _executor.execute(graphBuffer, size, variantPack);
+        // The shared test_sdk CPU executor throws a plain std::runtime_error for
+        // BOTH "no plan for this op" (capability miss, case A) and a genuine
+        // runtime failure (case C) — it does not distinguish them by type. We
+        // cannot tell them apart here, so we conservatively translate every throw
+        // into a ReferenceCapabilityError (case A), carrying the original message
+        // so a real failure still surfaces in the unverifiable report. Net effect:
+        // a CPU-ref crash routes as "couldn't run" rather than a hard FAIL. The
+        // GPU executor (our code) keeps full A-vs-C fidelity by throwing the right
+        // type at the source.
+        try
+        {
+            _executor.execute(graphBuffer, size, variantPack);
+        }
+        catch(const std::exception& e)
+        {
+            throw ReferenceCapabilityError(std::string("CPU reference executor could not run "
+                                                       "this graph: ")
+                                           + e.what());
+        }
     }
 
     bool requiresDeviceMemory() const override
diff --git a/dnn-providers/integration-tests/src/harness/GoldenReferenceCpu.hpp b/dnn-providers/integration-tests/src/harness/GoldenReferenceCpu.hpp
index c36adc19fb62..1f21a1978487 100644
--- a/dnn-providers/integration-tests/src/harness/GoldenReferenceCpu.hpp
+++ b/dnn-providers/integration-tests/src/harness/GoldenReferenceCpu.hpp
@@ -17,6 +17,8 @@
 #include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
 #include <hipdnn_test_sdk/utilities/cpu_graph_executor/CpuReferenceGraphExecutor.hpp>
 
+#include "harness/TomlGuards.hpp"
+
 namespace hipdnn_integration_tests
 {
 
@@ -47,12 +49,16 @@ class TestGoldenReferenceCpu : public ::testing::TestWithParam<std::filesystem::
 
         _graphAndTensors = hipdnn_test_sdk::utilities::loadGraphAndTensors(path);
         _referenceOutputTensors = _graphAndTensors.extractAndClearOutputTensorData();
+
+        skipIfTomlMatched(currentTestName());
     }
 
     void goldenReferenceTestSuite(float absoluteTolerance, float relativeTolerance)
     {
         SKIP_IF_WINDOWS();
 
+        applyTomlToleranceOverride(currentTestName(), absoluteTolerance, relativeTolerance);
+
         auto tensorMap = _graphAndTensors.hostBufferMap();
 
         hipdnn_test_sdk::utilities::CpuReferenceGraphExecutor().execute(
diff --git a/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
index d884b3e2f50e..ff06e24d1fb5 100644
--- a/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
+++ b/dnn-providers/integration-tests/src/harness/IntegrationGraphVerificationHarness.hpp
@@ -11,8 +11,6 @@
 #include <hipdnn_frontend/Graph.hpp>
 #include <hipdnn_frontend/Utilities.hpp>
 #include <hipdnn_frontend/attributes/TensorAttributes.hpp>
-#include <hipdnn_frontend/node/RMSNormNode.hpp>
-#include <hipdnn_frontend/node/ReductionNode.hpp>
 #include <hipdnn_plugin_sdk/PluginLogging.hpp>
 #include <hipdnn_test_sdk/utilities/CpuFpReferenceMiopenRmsValidation.hpp>
 #include <hipdnn_test_sdk/utilities/CpuFpReferenceValidation.hpp>
@@ -29,6 +27,9 @@
 #include "harness/SharedHandle.hpp"
 #include "harness/SupportMatrixCollector.hpp"
 #include "harness/TestConfig.hpp"
+#include "harness/TomlGuards.hpp"
+#include "harness/input_init/SynthesizeInputs.hpp"
+#include "harness/tolerance/ToleranceResolver.hpp"
 
 namespace hipdnn_integration_tests
 {
@@ -41,7 +42,6 @@ template <typename DataType, typename TestCaseType>
 class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<TestCaseType>
 {
 protected:
-    int _deviceId = 0;
     std::string _testCaseNote;
     std::string _testCaseLayout;
     std::unordered_map<int64_t, std::string> _tensorIdToNameMap;
@@ -53,20 +53,10 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
     {
         SKIP_IF_NO_DEVICES();
 
-        // Initialize HIP
-        ASSERT_EQ(hipInit(0), hipSuccess);
-        ASSERT_EQ(hipGetDevice(&_deviceId), hipSuccess);
-
-        // Check for any engine specific test skips
-        if(auto* info = ::testing::UnitTest::GetInstance()->current_test_info(); info != nullptr)
-        {
-            const std::string testName = std::string(info->test_suite_name()) + "." + info->name();
-            if(auto skipReason = TestConfig::get().findSkipForTest(testName))
-            {
-                GTEST_SKIP() << "[arch " << TestConfig::get().getCurrentArch() << "] "
-                             << *skipReason;
-            }
-        }
+        // HIP initializes lazily on first runtime use; the shared hipdnn handle
+        // (getSharedHandle -> hipdnnCreate) does this before any graph executes,
+        // so no explicit hipInit is needed here.
+        skipIfTomlMatched(currentTestName());
     }
 
     void setTestCaseNote(std::string note)
@@ -81,41 +71,64 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
 
     virtual void runGraphTest() = 0;
 
-    // Determine tolerance for an output tensor based on the graph and
-    // configured tolerance mode for the engine.
+    // Determine the FINAL tolerance for an output tensor: an aggregation-policy
+    // default plus the TOML per-test override, both via
+    // harness/tolerance/ToleranceResolver.hpp. The resolver is keyed on the
+    // serialized flatbuffer graph: we serialize with to_binary() — the same
+    // pattern initializeBundle() already uses — and read the output tensor's dtype
+    // from the flatbuffer.
+    //
+    // Policy = outputOpTolerance (the last non-Pointwise op), which reproduces
+    // this harness's historical getTolerance() behavior so the C++ graph tests
+    // keep their exact tolerances. (The bundle harness uses the maxAcrossNodes
+    // default; the two agree for the common one-real-op + activation case.) The
+    // returned value is already overridden, so registerValidator stores it as-is.
     float getTolerance(const hipdnn_frontend::graph::Graph& graph,
                        const std::shared_ptr<hipdnn_frontend::graph::TensorAttributes>& output)
     {
         ToleranceMode mode = TestConfig::get().getToleranceMode();
+        if(mode != ToleranceMode::DEFAULT)
+        {
+            ADD_FAILURE() << "getTolerance: unhandled tolerance mode";
+            return 0.0f;
+        }
 
-        if(mode == ToleranceMode::DEFAULT)
+        auto [serialized, serErr] = graph.to_binary();
+        if(serErr.code != hipdnn_frontend::ErrorCode::OK || serialized.empty())
         {
-            // We determine the tolerance based on the last non-PointwiseNode
-            // (the root op). This will be gradually updated to use dynamic
-            // calculation as possible; eventually, the tolerance will be
-            // entirely dynamically determined in the default case.
-            //
-            // NOTE: after validate(), the graph's sub-nodes are in topological order.
-            const hipdnn_frontend::graph::INode* rootOp = nullptr;
-            graph.visit([&](const hipdnn_frontend::graph::INode& node) {
-                if(dynamic_cast<const hipdnn_frontend::graph::PointwiseNode*>(&node) == nullptr
-                   && dynamic_cast<const hipdnn_frontend::graph::Graph*>(&node) == nullptr)
-                {
-                    rootOp = &node;
-                }
-            });
+            ADD_FAILURE() << "getTolerance: graph serialization failed";
+            return 0.0f;
+        }
 
-            if(rootOp == nullptr)
-            {
-                ADD_FAILURE() << "getTolerance: no root op found in graph";
-                return 0.0f;
-            }
+        const auto wrapper
+            = hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper::fromSerializedBlob(
+                serialized.data(), serialized.size());
+        if(!wrapper.isValid())
+        {
+            ADD_FAILURE() << "getTolerance: serialized graph failed verification";
+            return 0.0f;
+        }
 
-            return toleranceForNode(*rootOp, output->get_data_type());
+        const auto& tensorMap = wrapper.getTensorMap();
+        const auto it = tensorMap.find(output->get_uid());
+        if(it == tensorMap.end())
+        {
+            ADD_FAILURE() << "getTolerance: output tensor uid " << output->get_uid()
+                          << " not found in serialized graph";
+            return 0.0f;
         }
 
-        ADD_FAILURE() << "getTolerance: unhandled tolerance mode";
-        return 0.0f;
+        float atol = 0.0f;
+        float rtol = 0.0f;
+        tolerance::resolveTolerance(wrapper,
+                                    it->second->data_type(),
+                                    currentTestName(),
+                                    atol,
+                                    rtol,
+                                    tolerance::outputOpTolerance);
+        // getTolerance's single-float contract predates split atol/rtol; under the
+        // current resolver the two are equal (same default, same override).
+        return atol;
     }
 
     void verifyGraph(hipdnn_frontend::graph::Graph& graph, unsigned int seed)
@@ -201,6 +214,9 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
             << "At least one output tensor id must be specified for "
                "validation.";
 
+        tolerance::warnIfMultipleOutputs(gpuBundle.outputTensorIds.size(),
+                                         "IntegrationGraphVerificationHarness");
+
         HIPDNN_PLUGIN_LOG_INFO("Validating " << gpuBundle.outputTensorIds.size()
                                              << " output tensors");
 
@@ -253,25 +269,10 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
                            float absoluteTolerance,
                            float relativeTolerance)
     {
-        // Check for per-test tolerance override from TOML config
-        float finalAtol = absoluteTolerance;
-        float finalRtol = relativeTolerance;
-
-        auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
-        if(testInfo != nullptr)
-        {
-            std::string testName
-                = std::string(testInfo->test_suite_name()) + "." + testInfo->name();
-            auto override = TestConfig::get().findToleranceOverride(testName);
-            if(override.has_value())
-            {
-                finalAtol = override->atol;
-                finalRtol = override->rtol;
-                HIPDNN_PLUGIN_LOG_INFO("Tolerance override applied for " << testName
-                                                                         << ": atol=" << finalAtol
-                                                                         << " rtol=" << finalRtol);
-            }
-        }
+        // Tolerances arrive already resolved (default + TOML override) from
+        // getTolerance via ToleranceResolver; no override is applied here.
+        const float finalAtol = absoluteTolerance;
+        const float finalRtol = relativeTolerance;
 
         // Since the graph can infer properties + Ids, we defer validator registration until right
         // before validation in verifyGraph
@@ -332,67 +333,82 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
         });
     }
 
-    virtual void initializeBundle([[maybe_unused]] const hipdnn_frontend::graph::Graph& graph,
+    virtual void initializeBundle(const hipdnn_frontend::graph::Graph& graph,
                                   hipdnn_test_sdk::utilities::GraphTensorBundle& bundle,
                                   unsigned int seed)
     {
         bundle.sentinelFillOutputTensors();
 
-        for(auto& tensorPair : bundle.tensors)
+        auto [serialized, serErr] = graph.to_binary();
+        if(serErr.code != hipdnn_frontend::ErrorCode::OK || serialized.empty())
         {
-            if(!bundle.isOutput(tensorPair.first))
+            initializeBundleFallback(bundle, seed);
+            return;
+        }
+
+        const auto* fb = hipdnn_flatbuffers_sdk::data_objects::GetGraph(serialized.data());
+        if(fb == nullptr || fb->nodes() == nullptr)
+        {
+            initializeBundleFallback(bundle, seed);
+            return;
+        }
+
+        std::vector<int64_t> leafInputUids;
+        InputTensorMap inputs;
+        for(auto& [uid, tensor] : bundle.tensors)
+        {
+            if(!bundle.isOutput(uid))
             {
-                bundle.randomizeTensor(tensorPair.first, -1.0f, 1.0f, seed);
+                leafInputUids.push_back(uid);
+                inputs[uid] = std::move(tensor);
             }
         }
-    }
 
-    static float toleranceForNode(const hipdnn_frontend::graph::INode& node,
-                                  hipdnn_frontend::DataType dataType)
-    {
-        switch(dataType)
+        std::mt19937 rng(seed);
+        SynthesisTracker tracker(leafInputUids, inputs);
+
+        bool synthesisOk = true;
+        for(const auto* node : *fb->nodes())
         {
-        case hipdnn_frontend::DataType::FLOAT:
-            return toleranceForNodeTyped<float>(node);
-        case hipdnn_frontend::DataType::HALF:
-            return toleranceForNodeTyped<half>(node);
-        case hipdnn_frontend::DataType::BFLOAT16:
-            return toleranceForNodeTyped<bfloat16>(node);
-        default:
-            ADD_FAILURE() << "toleranceForNode: unsupported data type";
-            return 0.0f;
+            if(node == nullptr)
+            {
+                continue;
+            }
+            auto result = synthesizeNodeInputs(*node, tracker, rng);
+            if(!result.filled)
+            {
+                synthesisOk = false;
+                break;
+            }
+        }
+
+        if(synthesisOk)
+        {
+            auto finalResult = tracker.finish("synthesis");
+            synthesisOk = finalResult.filled;
+        }
+
+        for(auto& [uid, tensor] : inputs)
+        {
+            bundle.tensors[uid] = std::move(tensor);
+        }
+
+        if(!synthesisOk)
+        {
+            initializeBundleFallback(bundle, seed);
         }
     }
 
-    template <typename T>
-    static float toleranceForNodeTyped(const hipdnn_frontend::graph::INode& node)
+    void initializeBundleFallback(hipdnn_test_sdk::utilities::GraphTensorBundle& bundle,
+                                  unsigned int seed)
     {
-        namespace fe = hipdnn_frontend::graph;
-        using namespace hipdnn_test_sdk::utilities;
-
-        if(dynamic_cast<const fe::ConvolutionFpropNode*>(&node) != nullptr)
-            return static_cast<float>(conv::getToleranceFwd<T>());
-        if(dynamic_cast<const fe::ConvolutionDgradNode*>(&node) != nullptr)
-            return static_cast<float>(conv::getToleranceBwd<T>());
-        if(dynamic_cast<const fe::ConvolutionWgradNode*>(&node) != nullptr)
-            return static_cast<float>(conv::getToleranceWrw<T>());
-        if(dynamic_cast<const fe::BatchnormInferenceNodeVarianceExt*>(&node) != nullptr)
-            return static_cast<float>(batchnorm::getToleranceInferenceWithVariance<T>());
-        if(dynamic_cast<const fe::BatchnormInferenceNode*>(&node) != nullptr)
-            return static_cast<float>(batchnorm::getToleranceInference<T>());
-        if(dynamic_cast<const fe::BatchnormNode*>(&node) != nullptr)
-            return static_cast<float>(batchnorm::getToleranceTraining<T>());
-        if(dynamic_cast<const fe::BatchnormBackwardNode*>(&node) != nullptr)
-            return static_cast<float>(batchnorm::getToleranceBackward<T>());
-        if(dynamic_cast<const fe::MatmulNode*>(&node) != nullptr)
-            return static_cast<float>(matmul::getTolerance<T>());
-        if(dynamic_cast<const fe::ReductionNode*>(&node) != nullptr)
-            return static_cast<float>(reduction::getTolerance<T>());
-        if(dynamic_cast<const fe::RMSNormNode*>(&node) != nullptr)
-            return static_cast<float>(rmsnorm::getTolerance<T>());
-
-        ADD_FAILURE() << "toleranceForNodeTyped: unsupported node type";
-        return 0.0f;
+        for(auto& [uid, tensor] : bundle.tensors)
+        {
+            if(!bundle.isOutput(uid))
+            {
+                bundle.randomizeTensor(uid, -1.0f, 1.0f, seed);
+            }
+        }
     }
 
     void executeGpuGraph(hipdnnHandle_t handle,
diff --git a/dnn-providers/integration-tests/src/harness/ReferenceCapabilityError.hpp b/dnn-providers/integration-tests/src/harness/ReferenceCapabilityError.hpp
new file mode 100644
index 000000000000..869664ba843e
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/ReferenceCapabilityError.hpp
@@ -0,0 +1,40 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+namespace hipdnn_integration_tests
+{
+
+// Signals "this reference executor has no plan for this op" — a CAPABILITY MISS,
+// not a runtime failure. The golden-verification harness distinguishes three
+// reference outcomes:
+//
+//   A  capability miss  — ref cannot run this op   -> ReferenceCapabilityError
+//   B  disagreement     — ref ran, output != engine-> mismatch at compare time
+//   C  runtime error    — ref CAN run it but threw -> any other std::exception
+//
+// In `auto` mode a case-A miss falls through to the next reference; in explicit
+// gpu/cpu mode it SKIPs. A case-C error is loud (auto: fall through + loud
+// report; explicit / end-of-auto: FAIL). Throwing the right type at the source
+// is what lets the harness tell A from C.
+//
+// Deriving from std::runtime_error keeps existing `catch(const std::exception&)`
+// / `catch(const std::runtime_error&)` call sites working unchanged.
+//
+// NOTE: the GPU reference executor (our code) throws this directly at its
+// capability-miss sites. The CPU reference executor lives in the shared test_sdk
+// library and throws a plain std::runtime_error for BOTH A and C; the
+// CpuReferenceGraphExecutorAdapter translates that into a ReferenceCapabilityError
+// (it cannot tell A from C, so it conservatively treats every test_sdk throw as a
+// capability miss and carries the original message for the report).
+class ReferenceCapabilityError : public std::runtime_error
+{
+public:
+    using std::runtime_error::runtime_error;
+};
+
+} // namespace hipdnn_integration_tests
diff --git a/dnn-providers/integration-tests/src/harness/TestConfig.hpp b/dnn-providers/integration-tests/src/harness/TestConfig.hpp
index e9ed9b27c106..8abcd8b1adae 100644
--- a/dnn-providers/integration-tests/src/harness/TestConfig.hpp
+++ b/dnn-providers/integration-tests/src/harness/TestConfig.hpp
@@ -35,6 +35,84 @@ enum class ReferenceExecutorType
     GPU,
 };
 
+// How a bundle's engine output is verified (RFC 0010 §4.4). This governs the
+// BUNDLE tests only and is independent of ReferenceExecutorType (which governs
+// the parameterized tests' choice of which ref executor to exercise).
+//
+//   AUTO   — per-test fallback: golden -> GPU ref -> CPU ref -> SKIP+report
+//   GOLDEN — golden data only; SKIP if a bundle has no golden outputs
+//   GPU    — ignore golden; compare engine against the GPU reference executor
+//   CPU    — ignore golden; compare engine against the CPU reference executor
+enum class VerificationMode
+{
+    AUTO,
+    GOLDEN,
+    GPU,
+    CPU,
+};
+
+// Parse a verification-mode string (case-insensitive) into the enum. Throws
+// std::runtime_error on an unrecognized value. Shared by the CLI flag parser and
+// the env-var fallback so both accept exactly the same spellings.
+inline VerificationMode parseVerificationMode(std::string value)
+{
+    std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) {
+        return static_cast<char>(std::tolower(c));
+    });
+    if(value == "auto")
+    {
+        return VerificationMode::AUTO;
+    }
+    if(value == "golden")
+    {
+        return VerificationMode::GOLDEN;
+    }
+    if(value == "gpu")
+    {
+        return VerificationMode::GPU;
+    }
+    if(value == "cpu")
+    {
+        return VerificationMode::CPU;
+    }
+    throw std::runtime_error("Invalid verification mode '" + value
+                             + "'; expected 'auto', 'golden', 'gpu', or 'cpu'");
+}
+
+// Resolve verification mode: CLI value wins, then env var, then nullopt (caller
+// defaults to AUTO). Factored out of TestConfig::initialize() so the resolution
+// logic is independently testable.
+inline std::optional<VerificationMode>
+    resolveVerificationMode(std::optional<VerificationMode> cliValue)
+{
+    if(cliValue.has_value())
+    {
+        return cliValue;
+    }
+    auto envVal = hipdnn_data_sdk::utilities::getEnv("HIPDNN_TEST_VERIFICATION_MODE");
+    if(!envVal.empty())
+    {
+        return parseVerificationMode(envVal);
+    }
+    return std::nullopt;
+}
+
+// Resolve golden data dir: CLI value wins, then env var, then nullopt.
+inline std::optional<std::filesystem::path>
+    resolveGoldenDataDir(std::optional<std::filesystem::path> cliValue)
+{
+    if(cliValue.has_value())
+    {
+        return cliValue;
+    }
+    auto envVal = hipdnn_data_sdk::utilities::getEnv("HIPDNN_TEST_GOLDEN_DATA_DIR");
+    if(!envVal.empty())
+    {
+        return std::filesystem::path(envVal);
+    }
+    return std::nullopt;
+}
+
 // Singleton class for storing CLI-based test configuration.
 // All arguments are independently optional:
 //   - articlePath: omit to use hipDNN's default plugin discovery
@@ -64,7 +142,8 @@ class TestConfig
                            std::optional<ReferenceExecutorType> referenceExecutorType
                            = std::nullopt,
                            bool allowBundles = false,
-                           std::optional<std::filesystem::path> goldenDataDir = std::nullopt)
+                           std::optional<std::filesystem::path> goldenDataDir = std::nullopt,
+                           std::optional<VerificationMode> verificationMode = std::nullopt)
     {
         TestConfig& instance = get();
         if(instance._initialized)
@@ -118,15 +197,8 @@ class TestConfig
             }
         }
 
-        instance._goldenDataDir = std::move(goldenDataDir);
-        if(!instance._goldenDataDir.has_value())
-        {
-            auto envVal = hipdnn_data_sdk::utilities::getEnv("HIPDNN_TEST_GOLDEN_DATA_DIR");
-            if(!envVal.empty())
-            {
-                instance._goldenDataDir = std::filesystem::path(envVal);
-            }
-        }
+        instance._goldenDataDir = resolveGoldenDataDir(std::move(goldenDataDir));
+        instance._verificationMode = resolveVerificationMode(verificationMode);
 
         // Detect device 0's gfx arch and VRAM once at startup. Used by
         // [[test_skips]] and golden-ref metadata guards (arch/VRAM checks).
@@ -291,6 +363,14 @@ class TestConfig
         return _goldenDataDir.value();
     }
 
+    // Bundle verification mode. Resolved once at init: CLI flag >
+    // HIPDNN_TEST_VERIFICATION_MODE env var > AUTO default.
+    VerificationMode getVerificationMode() const
+    {
+        throwIfNotInitialized();
+        return _verificationMode.value_or(VerificationMode::AUTO);
+    }
+
 private:
     TestConfig() = default;
 
@@ -307,6 +387,7 @@ class TestConfig
     std::optional<TestSettings> _testSettings;
     std::optional<ReferenceExecutorType> _referenceExecutorType;
     std::optional<std::filesystem::path> _goldenDataDir;
+    std::optional<VerificationMode> _verificationMode;
     std::string _currentArch;
     std::size_t _currentDeviceVramMb = 0;
     std::string _currentPlatform;
diff --git a/dnn-providers/integration-tests/src/harness/TomlGuards.hpp b/dnn-providers/integration-tests/src/harness/TomlGuards.hpp
new file mode 100644
index 000000000000..ccc098e663a8
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/TomlGuards.hpp
@@ -0,0 +1,63 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <optional>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <hipdnn_plugin_sdk/PluginLogging.hpp>
+
+#include "harness/TestConfig.hpp"
+
+namespace hipdnn_integration_tests
+{
+
+inline std::string currentTestName()
+{
+    auto* info = ::testing::UnitTest::GetInstance()->current_test_info();
+    if(info == nullptr)
+    {
+        return {};
+    }
+    return std::string(info->test_suite_name()) + "." + info->name();
+}
+
+inline bool applyTomlToleranceOverride(const std::string& testName, float& atol, float& rtol)
+{
+    if(testName.empty())
+    {
+        return false;
+    }
+    auto ovr = TestConfig::get().findToleranceOverride(testName);
+    if(!ovr)
+    {
+        return false;
+    }
+    atol = ovr->atol;
+    rtol = ovr->rtol;
+    HIPDNN_PLUGIN_LOG_INFO("Tolerance override applied for " << testName << ": atol=" << atol
+                                                             << " rtol=" << rtol);
+    return true;
+}
+
+inline std::optional<std::string> checkTomlSkip(const std::string& testName)
+{
+    if(testName.empty())
+    {
+        return std::nullopt;
+    }
+    return TestConfig::get().findSkipForTest(testName);
+}
+
+inline void skipIfTomlMatched(const std::string& testName)
+{
+    if(auto reason = checkTomlSkip(testName))
+    {
+        GTEST_SKIP() << "[arch " << TestConfig::get().getCurrentArch() << "] " << *reason;
+    }
+}
+
+} // namespace hipdnn_integration_tests
diff --git a/dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp b/dnn-providers/integration-tests/src/harness/bundle/BundleDiscovery.hpp
similarity index 96%
rename from dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp
rename to dnn-providers/integration-tests/src/harness/bundle/BundleDiscovery.hpp
index 56ea6dc254ff..800c1288d4ea 100644
--- a/dnn-providers/integration-tests/src/harness/golden/BundleDiscovery.hpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/BundleDiscovery.hpp
@@ -14,7 +14,7 @@
 
 #include <hipdnn_plugin_sdk/PluginLogging.hpp>
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests::bundle
 {
 
 // Naming types, kept together. DerivedTestName is the output of deriveTestName()
@@ -191,9 +191,11 @@ inline DerivedTestName deriveTestName(const std::filesystem::path& jsonPath,
 
     if(relativeDir.empty())
     {
-        throw std::runtime_error(
-            "Bundle .json must live in a sub-folder of the data root, not at the root itself: "
-            + jsonPath.string() + "; expected {folder}/{file}.json");
+        // --gd points directly at a bundle folder (the .json is at the root).
+        // Use the folder name as the suite so "--gd .../graph_only_bundle" works.
+        const std::string suite = sanitizeForGtest(bundleDir.filename().string());
+        const std::string test = sanitizeForGtest(jsonPath.stem().string());
+        return {suite, test};
     }
 
     std::string suite;
@@ -270,4 +272,4 @@ inline std::vector<DiscoveredBundle> discoverBundles(const std::filesystem::path
     return bundles;
 }
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/golden/BundleRegistration.hpp b/dnn-providers/integration-tests/src/harness/bundle/BundleRegistration.hpp
similarity index 94%
rename from dnn-providers/integration-tests/src/harness/golden/BundleRegistration.hpp
rename to dnn-providers/integration-tests/src/harness/bundle/BundleRegistration.hpp
index 5f45fc361f5a..6cfbc4e74322 100644
--- a/dnn-providers/integration-tests/src/harness/golden/BundleRegistration.hpp
+++ b/dnn-providers/integration-tests/src/harness/bundle/BundleRegistration.hpp
@@ -15,10 +15,10 @@
 #include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
 
 #include "harness/TestConfig.hpp"
-#include "harness/golden/BundleDiscovery.hpp"
-#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
+#include "harness/bundle/BundleDiscovery.hpp"
+#include "harness/bundle/IntegrationBundleVerificationHarness.hpp"
 
-namespace hipdnn_integration_tests::golden
+namespace hipdnn_integration_tests::bundle
 {
 
 namespace detail
@@ -58,7 +58,7 @@ inline void registerBundles(const std::vector<LoadedBundle>& bundles)
             __FILE__,
             __LINE__,
             [loaded = bundle.bundle, path = bundle.jsonPath]() -> ::testing::Test* {
-                auto* test = new IntegrationGraphGoldenReferenceVerificationHarness(
+                auto* test = new IntegrationBundleVerificationHarness(
                     /*requiresDevice=*/true);
                 test->setBundle(loaded, path);
                 return test;
@@ -159,7 +159,7 @@ inline void registerBundleTests()
 
     detail::registerBundles(bundles);
 
-    HIPDNN_PLUGIN_LOG_INFO("Registered " << bundles.size() << " golden bundle test(s)");
+    HIPDNN_PLUGIN_LOG_INFO("Registered " << bundles.size() << " bundle test(s)");
 }
 
-} // namespace hipdnn_integration_tests::golden
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp
new file mode 100644
index 000000000000..c5ebb2507f31
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.cpp
@@ -0,0 +1,660 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "harness/bundle/IntegrationBundleVerificationHarness.hpp"
+
+#include <algorithm>
+#include <ostream>
+#include <random>
+#include <set>
+#include <sstream>
+
+#include <hipdnn_data_sdk/utilities/Workspace.hpp>
+#include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
+#include <hipdnn_frontend/Graph.hpp>
+#include <hipdnn_test_sdk/utilities/BundleMetadata.hpp>
+#include <hipdnn_test_sdk/utilities/CpuFpReferenceValidation.hpp>
+#include <hipdnn_test_sdk/utilities/FlatbufferDatatypeMapping.hpp>
+#include <hipdnn_test_sdk/utilities/TensorDiff.hpp>
+#include <hipdnn_test_sdk/utilities/TestTolerances.hpp>
+#include <hipdnn_test_sdk/utilities/detail/FlatbufferTensorAttributesUtils.hpp>
+
+#include "harness/CpuReferenceGraphExecutorAdapter.hpp"
+#include "harness/ReferenceCapabilityError.hpp"
+#include "harness/SharedHandle.hpp"
+#include "harness/TestConfig.hpp"
+#include "harness/TomlGuards.hpp"
+#include "harness/bundle/UnverifiableBundleReport.hpp"
+#include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
+#include "harness/input_init/SynthesizeInputs.hpp"
+#include "harness/tolerance/ToleranceResolver.hpp"
+
+namespace hipdnn_integration_tests::bundle
+{
+
+// ---- virtual defaults ------------------------------------------------------
+
+void IntegrationBundleVerificationHarness::executeGraphThroughEngine(
+    std::unordered_map<int64_t, void*>& variantPack)
+{
+    auto handle = getSharedHandle();
+
+    const std::vector<uint8_t> graphBytes(
+        _bundle->graphBuffer.data(), _bundle->graphBuffer.data() + _bundle->graphBuffer.size());
+
+    hipdnn_frontend::graph::Graph graph;
+    auto err = graph.from_binary(handle, graphBytes);
+    ASSERT_TRUE(err.is_good()) << "from_binary failed: " << err.get_message();
+
+    std::vector<int64_t> engineIds;
+    auto status = graph.get_ranked_engine_ids(engineIds);
+
+    const auto graphSummary = [&] {
+        return std::to_string(_bundle->outputTensorUids.size()) + " output tensor(s), "
+               + std::to_string(engineIds.size()) + " ranked engine(s)";
+    };
+
+    if(TestConfig::get().hasEngineName())
+    {
+        int64_t targetEngineId = TestConfig::get().getEngineId();
+        if(status.is_bad()
+           || std::find(engineIds.begin(), engineIds.end(), targetEngineId) == engineIds.end())
+        {
+            throw std::runtime_error("Engine " + std::string(TestConfig::get().getEngineName())
+                                     + " does not support this graph (" + graphSummary() + ")");
+        }
+        graph.set_preferred_engine_id_ext(targetEngineId);
+    }
+    else
+    {
+        if(status.is_bad() || engineIds.empty())
+        {
+            throw std::runtime_error("No engine supports this graph (" + graphSummary() + ")");
+        }
+    }
+
+    auto result = graph.create_execution_plans();
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+    result = graph.check_support();
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+    result = graph.build_plans();
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+
+    int64_t workspaceSize = 0;
+    result = graph.get_workspace_size(workspaceSize);
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+    ASSERT_GE(workspaceSize, 0);
+    const hipdnn_data_sdk::utilities::Workspace workspace(static_cast<size_t>(workspaceSize));
+
+    result = graph.execute(handle, variantPack, workspace.get());
+    ASSERT_TRUE(result.is_good()) << result.get_message();
+}
+
+void IntegrationBundleVerificationHarness::runReferenceExecutor(
+    ReferenceExecutorType type, std::unordered_map<int64_t, void*>& variantPack)
+{
+    auto executor = makeReferenceExecutor(type);
+    executor->execute(_bundle->graphBuffer.data(), _bundle->graphBuffer.size(), variantPack);
+}
+
+std::unique_ptr<IReferenceGraphExecutor>
+    IntegrationBundleVerificationHarness::makeReferenceExecutor(ReferenceExecutorType type)
+{
+    switch(type)
+    {
+    case ReferenceExecutorType::CPU:
+        return std::make_unique<CpuReferenceGraphExecutorAdapter>();
+    case ReferenceExecutorType::GPU:
+        return std::make_unique<gpu_graph_executor::GpuReferenceGraphExecutor>();
+    default:
+        throw std::runtime_error("Unknown reference executor type");
+    }
+}
+
+// ---- top-level dispatch ----------------------------------------------------
+
+VerificationMode IntegrationBundleVerificationHarness::getVerificationMode() const
+{
+    return TestConfig::get().getVerificationMode();
+}
+
+void IntegrationBundleVerificationHarness::runComparison()
+{
+    if(_bundle->outputTensorUids.empty())
+    {
+        skipUnverifiable("bundle has no output tensors to compare");
+        return;
+    }
+
+    if(!ensureInputsAvailable())
+    {
+        return;
+    }
+
+    switch(getVerificationMode())
+    {
+    case VerificationMode::GOLDEN:
+        runGoldenMode();
+        return;
+    case VerificationMode::GPU:
+        runExplicitRefMode(ReferenceExecutorType::GPU);
+        return;
+    case VerificationMode::CPU:
+        runExplicitRefMode(ReferenceExecutorType::CPU);
+        return;
+    case VerificationMode::AUTO:
+        runAutoMode();
+        return;
+    default:
+        FAIL() << "Unknown verification mode";
+        return;
+    }
+}
+
+namespace
+{
+// GTEST_SKIP() expands to `return;`, so it can only be used from a void-returning
+// function. This wrapper records the skip (and its message) and returns from
+// itself; the skip state persists for the caller, which then returns nullopt.
+void skipEngineCouldNotRun(const std::filesystem::path& bundlePath, const std::string& error)
+{
+    std::ostringstream msg;
+    msg << "Engine could not execute bundle " << bundlePath;
+    if(!error.empty())
+    {
+        msg << ": " << error;
+    }
+    GTEST_SKIP() << msg.str();
+}
+} // namespace
+
+std::optional<OutputTensors> IntegrationBundleVerificationHarness::runEngineOrSkip()
+{
+    std::string error;
+    auto engineOutputs = runEngineCapturingOutputs(error);
+    if(!engineOutputs && !::testing::Test::HasFatalFailure())
+    {
+        skipEngineCouldNotRun(_bundlePath, error);
+    }
+    return engineOutputs;
+}
+
+void IntegrationBundleVerificationHarness::runGoldenMode()
+{
+    if(!_bundle->hasGoldenOutputs)
+    {
+        skipUnverifiable("no golden data (verification-mode=golden)");
+        return;
+    }
+    auto engineOutputs = runEngineOrSkip();
+    if(!engineOutputs)
+    {
+        return;
+    }
+    compareAgainstGolden(*engineOutputs);
+}
+
+void IntegrationBundleVerificationHarness::runExplicitRefMode(ReferenceExecutorType type)
+{
+    auto engineOutputs = runEngineOrSkip();
+    if(!engineOutputs)
+    {
+        return;
+    }
+
+    OutputTensors refOutputs;
+    const RefRunResult result = runReferenceCapturingOutputs(type, refOutputs);
+    switch(result.status)
+    {
+    case RefStatus::CAPABILITY_MISS:
+        skipUnverifiable(refLabel(type) + " cannot run this op: " + result.message);
+        return;
+    case RefStatus::RUNTIME_ERROR:
+        recordRefError(refLabel(type) + " errored: " + result.message);
+        FAIL() << refLabel(type) << " errored (verification-mode=" << refLabel(type)
+               << "): " << result.message;
+        return;
+    case RefStatus::RAN:
+        compareOutputs(*engineOutputs, refOutputs);
+        return;
+    default:
+        FAIL() << "Unknown RefStatus";
+        return;
+    }
+}
+
+void IntegrationBundleVerificationHarness::runAutoMode()
+{
+    auto engineOutputs = runEngineOrSkip();
+    if(!engineOutputs)
+    {
+        return;
+    }
+
+    if(_bundle->hasGoldenOutputs)
+    {
+        compareAgainstGolden(*engineOutputs);
+        return;
+    }
+
+    // GPU ref (non-final): capability miss or runtime error -> fall through.
+    bool gpuRefErrored = false;
+    {
+        OutputTensors refOutputs;
+        const RefRunResult gpu
+            = runReferenceCapturingOutputs(ReferenceExecutorType::GPU, refOutputs);
+        if(gpu.status == RefStatus::RAN)
+        {
+            compareOutputs(*engineOutputs, refOutputs);
+            return;
+        }
+        if(gpu.status == RefStatus::RUNTIME_ERROR)
+        {
+            gpuRefErrored = true;
+            recordRefError("GPU reference errored (auto mode, falling through to CPU): "
+                           + gpu.message);
+        }
+    }
+
+    // CPU ref (final): capability miss -> unverifiable; runtime error -> FAIL.
+    {
+        OutputTensors refOutputs;
+        const RefRunResult cpu
+            = runReferenceCapturingOutputs(ReferenceExecutorType::CPU, refOutputs);
+        switch(cpu.status)
+        {
+        case RefStatus::CAPABILITY_MISS:
+            skipUnverifiable(gpuRefErrored
+                                 ? "no usable reference (golden absent; GPU ref errored, CPU ref "
+                                   "cannot run this op; see reference-error report): "
+                                       + cpu.message
+                                 : "no reference available (golden absent; GPU and CPU ref "
+                                   "cannot run this op): "
+                                       + cpu.message);
+            return;
+        case RefStatus::RUNTIME_ERROR:
+            recordRefError("CPU reference errored (auto mode, last resort): " + cpu.message);
+            FAIL() << "CPU reference errored (auto mode, last resort): " << cpu.message;
+            return;
+        case RefStatus::RAN:
+            compareOutputs(*engineOutputs, refOutputs);
+            return;
+        default:
+            FAIL() << "Unknown RefStatus";
+            return;
+        }
+    }
+}
+
+// ---- inputs ----------------------------------------------------------------
+
+bool IntegrationBundleVerificationHarness::ensureInputsAvailable()
+{
+    if(_bundle->tensors.has_value())
+    {
+        return true;
+    }
+    return synthesizeInputs();
+}
+
+bool IntegrationBundleVerificationHarness::synthesizeInputs()
+{
+    const auto wrapper = _bundle->graphWrapper();
+    const auto& tensorAttrMap = wrapper.getTensorMap();
+    const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
+                                       _bundle->outputTensorUids.end());
+
+    InputTensorMap inputs;
+    std::vector<int64_t> allLeafInputUids;
+    for(const auto& [uid, attrs] : tensorAttrMap)
+    {
+        if(attrs->virtual_() || outputUids.count(uid) != 0)
+        {
+            continue;
+        }
+        inputs[uid] = hipdnn_test_sdk::detail::createTensorFromAttribute(*attrs);
+        inputs[uid]->fillTensorWithValue(0.f);
+        allLeafInputUids.push_back(uid);
+    }
+
+    std::mt19937 rng(
+        static_cast<std::mt19937::result_type>(_bundle->metadata.seed.value_or(K_DEFAULT_SEED)));
+
+    SynthesisTracker tracker(allLeafInputUids, inputs);
+    for(uint32_t i = 0; i < wrapper.nodeCount(); ++i)
+    {
+        const auto& node = wrapper.getNode(i);
+        const SynthesisResult outcome = synthesizeNodeInputs(node, tracker, rng);
+        if(!outcome.filled)
+        {
+            skipUnverifiable(outcome.reason);
+            return false;
+        }
+    }
+
+    const SynthesisResult finalResult = tracker.finish("synthesis");
+    if(!finalResult.filled)
+    {
+        skipUnverifiable(finalResult.reason);
+        return false;
+    }
+
+    _bundle->tensors = std::move(inputs);
+    return true;
+}
+
+// ---- engine + reference runs -----------------------------------------------
+
+// Output buffers are filled with a sentinel (NaN for float types, type max for
+// integer types) rather than zero. This is the standard hipdnn practice — see
+// CpuReferenceGraphExecutor and GraphTensorBundle::sentinelFillOutputTensors —
+// and it arms allClose's NaN/sentinel guard: any output element the executor
+// fails to write stays NaN and is caught as a hard failure. Zero-filling would
+// make an unwritten output indistinguishable from a legitimately-computed zero,
+// so engine and reference could silently agree on garbage (both untouched zeros)
+// and the comparison would vacuously pass.
+OutputTensors IntegrationBundleVerificationHarness::allocateSentinelOutputs() const
+{
+    const auto wrapper = _bundle->graphWrapper();
+    const auto& tensorAttrMap = wrapper.getTensorMap();
+
+    OutputTensors outputs;
+    for(const int64_t uid : _bundle->outputTensorUids)
+    {
+        outputs[uid] = hipdnn_test_sdk::detail::createTensorFromAttribute(*tensorAttrMap.at(uid));
+        outputs[uid]->fillWithSentinelValue();
+    }
+    return outputs;
+}
+
+std::unordered_map<int64_t, void*>
+    IntegrationBundleVerificationHarness::buildVariantPack(OutputTensors& outputs,
+                                                           bool useDevice) const
+{
+    std::unordered_map<int64_t, void*> variantPack;
+    const std::set<int64_t> outputUids(_bundle->outputTensorUids.begin(),
+                                       _bundle->outputTensorUids.end());
+
+    for(auto& [uid, tensor] : *_bundle->tensors)
+    {
+        if(outputUids.count(uid) != 0)
+        {
+            continue;
+        }
+        variantPack[uid] = useDevice ? tensor->rawDeviceData() : tensor->rawHostData();
+    }
+    for(auto& [uid, tensor] : outputs)
+    {
+        variantPack[uid] = useDevice ? tensor->rawDeviceData() : tensor->rawHostData();
+    }
+    return variantPack;
+}
+
+std::optional<OutputTensors>
+    IntegrationBundleVerificationHarness::runEngineCapturingOutputs(std::string& error)
+{
+    OutputTensors engineOutputs = allocateSentinelOutputs();
+    auto variantPack = buildVariantPack(engineOutputs, /*useDevice=*/_requiresDevice);
+
+    bool threw = false;
+    try
+    {
+        executeGraphThroughEngine(variantPack);
+    }
+    catch(const std::exception& e)
+    {
+        threw = true;
+        error = e.what();
+    }
+
+    if(::testing::Test::HasFatalFailure())
+    {
+        return std::nullopt;
+    }
+    if(threw)
+    {
+        return std::nullopt;
+    }
+
+    markOutputsModified(engineOutputs);
+    return engineOutputs;
+}
+
+IntegrationBundleVerificationHarness::RefRunResult
+    IntegrationBundleVerificationHarness::runReferenceCapturingOutputs(ReferenceExecutorType type,
+                                                                       OutputTensors& refOutputs)
+{
+    refOutputs = allocateSentinelOutputs();
+    const bool useDevice = _requiresDevice && (type == ReferenceExecutorType::GPU);
+    auto variantPack = buildVariantPack(refOutputs, useDevice);
+
+    try
+    {
+        runReferenceExecutor(type, variantPack);
+    }
+    catch(const ReferenceCapabilityError& e)
+    {
+        return {RefStatus::CAPABILITY_MISS, e.what()};
+    }
+    catch(const std::exception& e)
+    {
+        return {RefStatus::RUNTIME_ERROR, e.what()};
+    }
+
+    markOutputsModifiedFor(refOutputs, useDevice);
+    return {RefStatus::RAN, {}};
+}
+
+void IntegrationBundleVerificationHarness::markOutputsModified(OutputTensors& outputs) const
+{
+    markOutputsModifiedFor(outputs, _requiresDevice);
+}
+
+void IntegrationBundleVerificationHarness::markOutputsModifiedFor(OutputTensors& outputs,
+                                                                  bool device)
+{
+    for(auto& [uid, tensor] : outputs)
+    {
+        if(device)
+        {
+            tensor->markDeviceModified();
+        }
+        else
+        {
+            tensor->markHostModified();
+        }
+    }
+}
+
+// ---- comparison ------------------------------------------------------------
+
+void IntegrationBundleVerificationHarness::compareAgainstGolden(OutputTensors& engineOutputs)
+{
+    compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
+        return *_bundle->tensors->at(uid);
+    });
+}
+
+void IntegrationBundleVerificationHarness::compareOutputs(OutputTensors& engineOutputs,
+                                                          OutputTensors& expected)
+{
+    compareEach(engineOutputs, [&](int64_t uid) -> hipdnn_data_sdk::utilities::ITensor& {
+        return *expected.at(uid);
+    });
+}
+
+template <typename ExpectedLookup>
+void IntegrationBundleVerificationHarness::compareEach(OutputTensors& engineOutputs,
+                                                       ExpectedLookup expectedFor)
+{
+    auto wrapper = _bundle->graphWrapper();
+    const auto& tensorAttrMap = wrapper.getTensorMap();
+
+    tolerance::warnIfMultipleOutputs(_bundle->outputTensorUids.size(),
+                                     "IntegrationBundleVerificationHarness");
+
+    for(const int64_t uid : _bundle->outputTensorUids)
+    {
+        auto& actualTensor = *engineOutputs.at(uid);
+        auto& expectedTensor = expectedFor(uid);
+
+        auto* attrs = tensorAttrMap.at(uid);
+        const auto dataType = attrs->data_type();
+
+        float atol = 0.0f;
+        float rtol = 0.0f;
+        tolerance::resolveTolerance(wrapper, dataType, currentTestName(), atol, rtol);
+
+        compareOutputTensor(uid, *attrs, dataType, expectedTensor, actualTensor, atol, rtol);
+    }
+}
+
+// ---- reporting helpers -----------------------------------------------------
+
+void IntegrationBundleVerificationHarness::skipUnverifiable(const std::string& reason)
+{
+    UnverifiableBundleReport::get().record(
+        _bundlePath.string(), reason, UnverifiableSeverity::UNVERIFIABLE);
+    GTEST_SKIP() << "Unverifiable: " << reason << " (" << _bundlePath << ")";
+}
+
+void IntegrationBundleVerificationHarness::recordRefError(const std::string& reason)
+{
+    UnverifiableBundleReport::get().record(
+        _bundlePath.string(), reason, UnverifiableSeverity::REF_ERROR);
+}
+
+std::string IntegrationBundleVerificationHarness::refLabel(ReferenceExecutorType type)
+{
+    return type == ReferenceExecutorType::GPU ? "GPU reference" : "CPU reference";
+}
+
+// ---- comparison + tolerance machinery --------------------------------------
+
+void IntegrationBundleVerificationHarness::compareOutputTensor(
+    int64_t uid,
+    const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+    hipdnn_data_sdk::utilities::ITensor& expected,
+    hipdnn_data_sdk::utilities::ITensor& actual,
+    float atol,
+    float rtol) const
+{
+    auto validator = hipdnn_test_sdk::utilities::createAllCloseValidator(dataType, atol, rtol);
+    const bool passed = validator->allClose(expected, actual);
+
+    if(!passed)
+    {
+        std::ostringstream report;
+        report << reportHeader(uid, attrs, dataType, expected, atol, rtol);
+        writeTensorDiffReport(report, uid, attrs, dataType, expected, actual, atol, rtol);
+        EXPECT_TRUE(false) << report.str();
+    }
+}
+
+void IntegrationBundleVerificationHarness::writeTensorDiffReport(
+    std::ostream& os,
+    int64_t uid,
+    const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+    hipdnn_data_sdk::utilities::ITensor& expected,
+    hipdnn_data_sdk::utilities::ITensor& actual,
+    float atol,
+    float rtol)
+{
+    using DT = hipdnn_flatbuffers_sdk::data_objects::DataType;
+    using hipdnn_data_sdk::types::bfloat16;
+    using hipdnn_data_sdk::types::half;
+
+    switch(dataType)
+    {
+    case DT::FLOAT:
+        writeFpDiffReport<float>(os, uid, attrs, expected, actual, atol, rtol);
+        return;
+    case DT::HALF:
+        writeFpDiffReport<half>(os, uid, attrs, expected, actual, atol, rtol);
+        return;
+    case DT::BFLOAT16:
+        writeFpDiffReport<bfloat16>(os, uid, attrs, expected, actual, atol, rtol);
+        return;
+    case DT::DOUBLE:
+        writeFpDiffReport<double>(os, uid, attrs, expected, actual, atol, rtol);
+        return;
+    default:
+        os << "  (no element-wise diff available for this data type)\n";
+    }
+}
+
+template <typename T>
+void IntegrationBundleVerificationHarness::writeFpDiffReport(
+    std::ostream& os,
+    int64_t uid,
+    const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+    hipdnn_data_sdk::utilities::ITensor& expected,
+    hipdnn_data_sdk::utilities::ITensor& actual,
+    float atol,
+    float rtol)
+{
+    const auto summary
+        = hipdnn_test_sdk::utilities::computeTensorDiff<T>(expected, actual, atol, rtol);
+    hipdnn_test_sdk::utilities::printTensorDiffSummary(os, labelFor(uid, attrs), summary);
+}
+
+std::string IntegrationBundleVerificationHarness::labelFor(
+    int64_t uid, const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs)
+{
+    const auto* name = attrs.name();
+    return (name != nullptr && !name->empty()) ? name->str() : ("uid=" + std::to_string(uid));
+}
+
+std::string IntegrationBundleVerificationHarness::reportHeader(
+    int64_t uid,
+    const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+    hipdnn_data_sdk::utilities::ITensor& expected,
+    float atol,
+    float rtol) const
+{
+    std::ostringstream os;
+    os << "\nBundle output comparison FAILED\n"
+       << "  Bundle: " << _bundlePath << "\n"
+       << "  Tensor: " << labelFor(uid, attrs) << " (UID " << uid << ", output)\n"
+       << "  Shape:  " << hipdnn_test_sdk::utilities::StreamVec(expected.dims()) << "  "
+       << dataTypeName(dataType) << "\n"
+       << "  Tolerance: atol=" << atol << " rtol=" << rtol << "\n";
+    return os.str();
+}
+
+std::string IntegrationBundleVerificationHarness::dataTypeName(
+    hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
+{
+    return hipdnn_flatbuffers_sdk::data_objects::EnumNameDataType(dataType);
+}
+
+void IntegrationBundleVerificationHarness::applyMetadataGuards() const
+{
+    // VRAM is an execution-feasibility guard: the engine allocates the same
+    // buffers and runs the same graph regardless of how its output is verified,
+    // so this applies to every bundle (golden or reference-verified).
+    if(auto reason = hipdnn_test_sdk::utilities::checkVramRequirement(
+           _bundle->metadata, TestConfig::get().getCurrentDeviceVramMb()))
+    {
+        GTEST_SKIP() << *reason;
+    }
+
+    // Arch-lock only matters for golden data: golden output VALUES are
+    // numerically arch-specific (AITER / GPU-ref generated). Inputs are not
+    // arch-locked. When there is no golden data the engine output is verified
+    // against a reference executor run on THIS device, so the bundle's recorded
+    // arch is irrelevant and must not gate the test.
+    if(_bundle->hasGoldenOutputs)
+    {
+        if(auto reason = hipdnn_test_sdk::utilities::checkArchCompatibility(
+               _bundle->metadata, TestConfig::get().getCurrentArch()))
+        {
+            GTEST_SKIP() << *reason;
+        }
+    }
+}
+
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp
new file mode 100644
index 000000000000..65a28997d2b0
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/bundle/IntegrationBundleVerificationHarness.hpp
@@ -0,0 +1,257 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstdint>
+#include <filesystem>
+#include <iosfwd>
+#include <memory>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
+#include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
+#include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
+
+#include "harness/IReferenceGraphExecutor.hpp"
+#include "harness/TestConfig.hpp"
+#include "harness/TomlGuards.hpp"
+#include "harness/bundle/IntegrationTestBundle.hpp"
+
+namespace hipdnn_integration_tests::bundle
+{
+
+// Output tensors, keyed by uid. Used both for the engine's computed "actual"
+// outputs and for an expected source (golden from disk, or a reference executor's
+// output). Each set is a distinct allocation so engine and reference never write
+// the same buffers.
+using OutputTensors
+    = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
+
+// Verifies a bundle's engine output against an expected source chosen by the
+// verification mode (RFC 0010 §4.4):
+//
+//   actual   = the engine (the system under test), run once into fresh buffers.
+//   expected = golden data from disk, OR a reference executor's output.
+//
+// Auto mode fallback chain: golden → GPU ref → CPU ref → SKIP.
+// When golden outputs are present on disk, the comparison uses them directly
+// and no reference executor is run at all.
+//
+// Memory invariants for running engine + a reference off the same inputs:
+//   * INPUT tensors are read-only by both executors and are NEVER mark*Modified().
+//     The engine's rawDeviceData() uploads host->device (state becomes BOTH
+//     valid); a later CPU-ref rawHostData() therefore sees the host copy still
+//     valid and does NOT download — inputs stay intact across both runs.
+//   * OUTPUT buffers are separate ITensor objects per executor (engineOutputs vs
+//     refOutputs), so the two runs cannot stomp each other. Only output buffers
+//     are mark*Modified().
+//   * Virtual (inter-node) tensors are allocated internally by each executor; the
+//     variant packs we build carry only real (input + output) tensors.
+//
+// NOTE: Stages 1-3 of init unification are done (ALMIOPEN-1969 follow-up).
+//   Both harnesses share SynthesisTracker + SynthesizeInputs from harness/input_init/.
+//   Remaining: 3 non-golden overrides kept for fused-graph range conflicts or
+//   specialized stress tests (BN backward activ, BN fwd training activ,
+//   conv backward weights large-values).
+class IntegrationBundleVerificationHarness : public ::testing::Test
+{
+public:
+    explicit IntegrationBundleVerificationHarness(bool requiresDevice)
+        : _requiresDevice(requiresDevice)
+    {
+    }
+
+    void setBundle(std::shared_ptr<IntegrationTestBundle> bundle, std::filesystem::path path)
+    {
+        _bundle = std::move(bundle);
+        _bundlePath = std::move(path);
+    }
+
+protected:
+    // NOLINTNEXTLINE(readability-identifier-naming)
+    void SetUp() override
+    {
+        if(_requiresDevice)
+        {
+            SKIP_IF_NO_DEVICES();
+        }
+
+        if(_bundle == nullptr)
+        {
+            GTEST_SKIP() << "No bundle set";
+        }
+
+        skipIfTomlMatched(currentTestName());
+
+        applyMetadataGuards();
+    }
+
+    // NOLINTNEXTLINE(readability-identifier-naming)
+    void TestBody() override
+    {
+        runComparison();
+    }
+
+    // Builds the graph, selects an engine, and executes. Throws on unsupported graph (→ SKIP).
+    virtual void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack);
+
+    // Runs the named reference executor. Throws ReferenceCapabilityError on capability miss.
+    virtual void runReferenceExecutor(ReferenceExecutorType type,
+                                      std::unordered_map<int64_t, void*>& variantPack);
+
+    // Constructs the executor object (CpuReferenceGraphExecutorAdapter or
+    // GpuReferenceGraphExecutor) — does not allocate buffers or run anything.
+    // Skipped in auto mode when golden data is present.
+    virtual std::unique_ptr<IReferenceGraphExecutor>
+        makeReferenceExecutor(ReferenceExecutorType type);
+
+    // Returns the active verification mode. Override in tests to inject a mode
+    // without touching the TestConfig singleton.
+    virtual VerificationMode getVerificationMode() const;
+
+    // Skips the test when the bundle's metadata is incompatible with the
+    // current device (VRAM/arch). Virtual so isolated unit tests that don't
+    // exercise hardware guards can override it — production reads from the
+    // TestConfig singleton, which is only initialized by the real test main.
+    virtual void applyMetadataGuards() const;
+
+private:
+    bool _requiresDevice;
+    std::filesystem::path _bundlePath;
+    std::shared_ptr<IntegrationTestBundle> _bundle;
+
+    static constexpr int64_t K_DEFAULT_SEED = 42;
+
+    enum class RefStatus
+    {
+        RAN,
+        CAPABILITY_MISS,
+        RUNTIME_ERROR,
+    };
+    struct RefRunResult
+    {
+        RefStatus status;
+        std::string message;
+    };
+
+    // ── top-level dispatch ────────────────────────────────────────────────
+    void runComparison();
+    void runGoldenMode();
+    void runExplicitRefMode(ReferenceExecutorType type);
+    void runAutoMode();
+
+    // ── inputs ──────────────────────────────────────────────────────────
+    bool ensureInputsAvailable();
+
+    // Synthesizes leaf input tensors for the graph when no golden data exists.
+    //
+    // Phase 1 — allocate: walks the graph's tensor list, skips virtual
+    //   (inter-node) and output tensors, allocates a CPU-side buffer for
+    //   each remaining leaf input tensor (shape/dtype from TensorAttributes).
+    //
+    // Phase 2 — fill: iterates each node (internal op) and calls its
+    //   registered fill function via synthesizeNodeInputs(). Each fill
+    //   function reads its tensor UIDs from the node's attributes and
+    //   declares each one as FREE (random values), STRUCTURED (needs
+    //   specific format), or DERIVED (needs another op's output) through
+    //   a shared SynthesisTracker.
+    //
+    // Phase 3 — verify: calls tracker.finish() which checks that every
+    //   leaf input was accounted for by some fill function and none were
+    //   refused (STRUCTURED/DERIVED). Returns false and SKIPs the test
+    //   if any leaf was missed or refused.
+    //
+    // On success, moves the filled tensors into the bundle so downstream
+    // executors (engine, GPU ref, CPU ref) can upload them to the GPU.
+    bool synthesizeInputs();
+
+    // ── buffer allocation + execution ───────────────────────────────────
+    // allocateSentinelOutputs / buildVariantPack prepare the buffers;
+    // runEngine* / runReference* call the executors and capture results.
+    // Outputs are sentinel-filled (NaN) so an unwritten output element is
+    // caught by allClose rather than masquerading as a computed zero.
+    OutputTensors allocateSentinelOutputs() const;
+    std::unordered_map<int64_t, void*> buildVariantPack(OutputTensors& outputs,
+                                                        bool useDevice) const;
+    // Runs the engine into fresh output buffers. Returns nullopt if the
+    // engine threw (its message is written to `error`) or raised a fatal
+    // GTest failure (in which case `error` is left empty).
+    std::optional<OutputTensors> runEngineCapturingOutputs(std::string& error);
+
+    // Runs the engine and returns its outputs, or nullopt if it could not
+    // run. On nullopt the caller must simply return: this has already
+    // issued the appropriate verdict (a fatal failure propagates as-is,
+    // otherwise the test is SKIPped). Shared preamble for all three modes.
+    std::optional<OutputTensors> runEngineOrSkip();
+
+    RefRunResult runReferenceCapturingOutputs(ReferenceExecutorType type,
+                                              OutputTensors& refOutputs);
+    void markOutputsModified(OutputTensors& outputs) const;
+    static void markOutputsModifiedFor(OutputTensors& outputs, bool device);
+
+    // ── tolerances ──────────────────────────────────────────────────────
+    // Default tolerance derivation (max-across-nodes, per-op/per-dtype lookup)
+    // is shared with the graph harness via harness/tolerance/ToleranceResolver.hpp;
+    // the TOML per-test override is layered on top in compareEach.
+
+    // ── comparison ──────────────────────────────────────────────────────
+    void compareAgainstGolden(OutputTensors& engineOutputs);
+    void compareOutputs(OutputTensors& engineOutputs, OutputTensors& expected);
+
+    template <typename ExpectedLookup>
+    void compareEach(OutputTensors& engineOutputs, ExpectedLookup expectedFor);
+
+    void compareOutputTensor(int64_t uid,
+                             const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+                             hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+                             hipdnn_data_sdk::utilities::ITensor& expected,
+                             hipdnn_data_sdk::utilities::ITensor& actual,
+                             float atol,
+                             float rtol) const;
+
+    // ── reporting ───────────────────────────────────────────────────────
+    void skipUnverifiable(const std::string& reason);
+    void recordRefError(const std::string& reason);
+    static std::string refLabel(ReferenceExecutorType type);
+
+    static std::string
+        labelFor(int64_t uid, const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs);
+
+    std::string reportHeader(int64_t uid,
+                             const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+                             hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+                             hipdnn_data_sdk::utilities::ITensor& expected,
+                             float atol,
+                             float rtol) const;
+
+    static std::string dataTypeName(hipdnn_flatbuffers_sdk::data_objects::DataType dataType);
+
+    static void
+        writeTensorDiffReport(std::ostream& os,
+                              int64_t uid,
+                              const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+                              hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
+                              hipdnn_data_sdk::utilities::ITensor& expected,
+                              hipdnn_data_sdk::utilities::ITensor& actual,
+                              float atol,
+                              float rtol);
+
+    template <typename T>
+    static void
+        writeFpDiffReport(std::ostream& os,
+                          int64_t uid,
+                          const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
+                          hipdnn_data_sdk::utilities::ITensor& expected,
+                          hipdnn_data_sdk::utilities::ITensor& actual,
+                          float atol,
+                          float rtol);
+};
+
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/bundle/IntegrationTestBundle.hpp b/dnn-providers/integration-tests/src/harness/bundle/IntegrationTestBundle.hpp
new file mode 100644
index 000000000000..9bf691ae06e3
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/bundle/IntegrationTestBundle.hpp
@@ -0,0 +1,328 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <memory>
+#include <optional>
+#include <set>
+#include <unordered_map>
+#include <variant>
+#include <vector>
+
+#include <nlohmann/json.hpp>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+#include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
+#include <hipdnn_flatbuffers_sdk/utilities/json/Graph.hpp>
+#include <hipdnn_test_sdk/utilities/BundleMetadata.hpp>
+#include <hipdnn_test_sdk/utilities/LoadGraphAndTensors.hpp>
+
+namespace hipdnn_integration_tests::bundle
+{
+
+// Loaded tensors keyed by tensor UID. Holds every tensor declared by the graph —
+// inputs carry their data, output tensors carry their expected (golden) values as
+// loaded from the .bin blobs. The harness saves the outputs as golden and zeroes
+// them before execution.
+using TensorMap = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
+
+// One test's worth of bundle data loaded from disk.
+//
+//   graphBuffer      — the parsed graph, as a flatbuffer. Always present in a
+//                      loaded bundle; the engine deserializes it (from_binary)
+//                      and the harness walks it (GraphWrapper) for dtypes and
+//                      tolerances. A bundle that cannot even produce a graph is a
+//                      LoadError, not a bundle.
+//   metadata         — .meta.json contents (VRAM / arch guards). Mandatory ONLY
+//                      for golden bundles (those shipping output .bin blobs);
+//                      metadata validates golden data, so a bundle without it is
+//                      a LoadError. For a no-golden bundle (graph-only, or
+//                      inputs-only verified against a reference) absent metadata
+//                      is valid and this is default-constructed (all fields
+//                      empty); the optional-aware consumers fall back to defaults.
+//   outputTensorUids — UIDs of the graph's output tensors, derived from the
+//                      graph. Always available (even for a graph-only bundle),
+//                      so the harness knows which tensors to compare / allocate.
+//   tensors          — the loaded tensor data, keyed by uid. Holds the INPUT
+//                      tensors (with their data) whenever they are present on
+//                      disk, plus the OUTPUT tensors carrying their golden values
+//                      iff every output blob is present (see hasGoldenOutputs).
+//                      Absent (nullopt) only when the input blobs themselves are
+//                      not on disk — a true graph-only bundle. The harness may
+//                      still synthesize inputs for such a bundle (tier 3); if it
+//                      cannot, it SKIPs.
+//   hasGoldenOutputs — true iff every output tensor's .bin blob was present and
+//                      loaded into `tensors`. When false, `tensors` (if present)
+//                      carries inputs only — the engine output must be verified
+//                      against a reference executor, not golden data.
+struct IntegrationTestBundle
+{
+    flatbuffers::DetachedBuffer graphBuffer;
+    hipdnn_test_sdk::utilities::BundleMetadata metadata;
+    std::vector<int64_t> outputTensorUids;
+    std::optional<TensorMap> tensors;
+    bool hasGoldenOutputs = false;
+
+    // View over the graph flatbuffer, valid as long as this bundle lives.
+    hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper graphWrapper() const
+    {
+        return hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper{graphBuffer.data(),
+                                                                          graphBuffer.size()};
+    }
+};
+
+// Why a load did NOT produce a bundle. These are the FAIL outcomes — an authoring
+// error in the bundle itself. (A bundle that loads but lacks tensor data is a
+// successfully-loaded graph-only bundle, not a LoadError; the harness SKIPs it.)
+enum class LoadError
+{
+    MALFORMED_JSON, // the graph .json is not syntactically valid JSON
+    INVALID_GRAPH_SCHEMA, // valid JSON, but not a valid graph (cannot build flatbuffer)
+    MISSING_METADATA, // golden bundle's .meta.json companion is absent or invalid
+    TENSOR_LOAD_FAILED // a tensor .bin is present but failed to load (wrong size,
+    // unreadable, unsupported dtype, ...)
+};
+
+// A load either yields a bundle or explains why it could not. std::visit at the
+// call site forces both cases to be handled.
+using LoadResult = std::variant<IntegrationTestBundle, LoadError>;
+
+inline const char* toString(LoadError error)
+{
+    switch(error)
+    {
+    case LoadError::MALFORMED_JSON:
+        return "graph JSON is not parseable";
+    case LoadError::INVALID_GRAPH_SCHEMA:
+        return "graph JSON is not a valid graph";
+    case LoadError::MISSING_METADATA:
+        return "missing or invalid .meta.json companion";
+    case LoadError::TENSOR_LOAD_FAILED:
+        return "tensor .bin present but failed to load";
+    default:
+        return "unknown load error";
+    }
+}
+
+namespace detail
+{
+
+// The on-disk blob path for a tensor: "{stem}.tensor{uid}.bin", matching the
+// loader's own derivation.
+inline std::filesystem::path tensorBlobPath(const std::filesystem::path& jsonPath, int64_t uid)
+{
+    auto basePath = jsonPath;
+    basePath.replace_extension();
+    return {basePath.string() + ".tensor" + std::to_string(uid) + ".bin"};
+}
+
+// True iff every uid in `uids` has its companion .bin blob on disk. An empty
+// `uids` set returns true (vacuously) — callers handle "no such tensors"
+// separately (e.g. a graph with no inputs, or no outputs).
+inline bool blobsPresentFor(const std::vector<int64_t>& uids, const std::filesystem::path& jsonPath)
+{
+    for(const int64_t uid : uids)
+    {
+        if(!std::filesystem::exists(tensorBlobPath(jsonPath, uid)))
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+// The uids of every tensor declared in the graph's "tensors" array. Empty if the
+// array is absent/empty (a graph-only bundle). Tensors without a "uid" are
+// skipped (malformed entries are caught later when building the flatbuffer).
+inline std::vector<int64_t> allTensorUids(const nlohmann::json& graphJson)
+{
+    std::vector<int64_t> uids;
+    if(!graphJson.contains("tensors") || !graphJson.at("tensors").is_array())
+    {
+        return uids;
+    }
+    for(const auto& tensor : graphJson.at("tensors"))
+    {
+        if(tensor.contains("uid"))
+        {
+            uids.push_back(tensor.at("uid").get<int64_t>());
+        }
+    }
+    return uids;
+}
+
+} // namespace detail
+
+// Load a bundle from its graph .json path, classifying the outcome.
+//
+// This deliberately does NOT call test_sdk's loadGraphAndTensors(), whose
+// all-or-nothing contract ("graph AND at least one tensor, or throw") conflicts
+// with our design where a graph-only bundle is legitimate. Instead it composes
+// the same test_sdk primitives (json -> flatbuffer graph, per-tensor blob load)
+// under our own policy:
+//
+//   * graph .json not parseable           -> LoadError::MALFORMED_JSON      (FAIL)
+//   * parseable but not a valid graph     -> LoadError::INVALID_GRAPH_SCHEMA(FAIL)
+//   * golden bundle, no/invalid .meta.json-> LoadError::MISSING_METADATA    (FAIL)
+//   * no-golden bundle, no .meta.json      -> bundle, metadata default-constructed
+//   * valid graph, input .bin data absent -> bundle, tensors == nullopt     (tier-3:
+//                                            harness may synthesize, else SKIP)
+//   * valid graph, .bin present but broken-> LoadError::TENSOR_LOAD_FAILED  (FAIL)
+//   * valid graph, inputs present,
+//       outputs absent                    -> bundle, tensors set,
+//                                            hasGoldenOutputs == false (verify via ref)
+//   * valid graph, inputs + outputs present-> bundle, hasGoldenOutputs == true (golden)
+//
+// Inputs and outputs are loaded INDEPENDENTLY (partial loading): a bundle that
+// ships input blobs but no output (golden) blobs is legitimate — its engine
+// output is verified against a reference executor instead of golden data. Output
+// uids come from getOutputTensorUidsFromGraph; everything else declared in the
+// graph is treated as an input.
+//
+// The function is total: it never lets an exception escape. Every outcome is
+// either a loaded bundle or a classified LoadError.
+inline LoadResult loadIntegrationTestBundle(const std::filesystem::path& jsonPath)
+{
+    // 1. Read and parse the graph .json. Unreadable or unparseable -> FAIL.
+    std::ifstream stream(jsonPath);
+    if(!stream)
+    {
+        return LoadError::MALFORMED_JSON;
+    }
+
+    const auto graphJson = nlohmann::json::parse(stream, nullptr, /*allow_exceptions=*/false);
+    if(graphJson.is_discarded())
+    {
+        return LoadError::MALFORMED_JSON;
+    }
+
+    // 2. Verify the graph by building the flatbuffer. A structurally invalid
+    //    graph throws -> INVALID_GRAPH_SCHEMA.
+    flatbuffers::FlatBufferBuilder builder;
+    try
+    {
+        auto offset = hipdnn_flatbuffers_sdk::json::to<hipdnn_flatbuffers_sdk::data_objects::Graph>(
+            builder, graphJson);
+        builder.Finish(offset);
+    }
+    catch(const std::exception&)
+    {
+        return LoadError::INVALID_GRAPH_SCHEMA;
+    }
+
+    // 3. Capture the graph and derive the output UIDs (always available, even
+    //    for a graph-only bundle).
+    IntegrationTestBundle bundle;
+    bundle.graphBuffer = builder.Release();
+    bundle.outputTensorUids = hipdnn_test_sdk::utilities::getOutputTensorUidsFromGraph(graphJson);
+
+    // 4. Metadata is mandatory ONLY for golden bundles — those shipping output
+    //    .bin blobs. Metadata (arch lock, provenance, seed) exists to validate
+    //    golden data; a bundle with no golden outputs (pure graph-only, or
+    //    inputs-only verified against a reference) has nothing for it to
+    //    validate, so absent metadata is fine and we default-construct it.
+    //
+    //    loadBundleMetadata returns nullopt both when the .meta.json is absent
+    //    and when it is present but invalid (bad JSON / bad format_version). For
+    //    a golden bundle either case is an authoring error -> FAIL.
+    const bool goldenOutputsPresent = !bundle.outputTensorUids.empty()
+                                      && detail::blobsPresentFor(bundle.outputTensorUids, jsonPath);
+
+    auto metadata = hipdnn_test_sdk::utilities::loadBundleMetadata(jsonPath);
+    if(!metadata.has_value())
+    {
+        if(goldenOutputsPresent)
+        {
+            return LoadError::MISSING_METADATA;
+        }
+        metadata.emplace(); // graph-only / no-golden: empty metadata is valid.
+    }
+    bundle.metadata = std::move(*metadata);
+
+    // 5. Load tensor .bin data, inputs and outputs INDEPENDENTLY.
+    //
+    //    Output uids are the graph's outputs; every other declared tensor is an
+    //    input. We load inputs only if all input blobs are present, and outputs
+    //    (golden) only if all output blobs are present:
+    //
+    //      * all input blobs present  -> tensors gets the inputs
+    //      * all output blobs present -> tensors also gets the golden outputs and
+    //                                    hasGoldenOutputs = true
+    //      * input blobs absent       -> tensors stays nullopt (tier-3: harness
+    //                                    may synthesize inputs, else SKIP)
+    //
+    //    A blob that is present but fails to load (wrong size, unreadable,
+    //    unsupported dtype) throws inside tensorFromFileAndAttributes; we catch it
+    //    and classify it as TENSOR_LOAD_FAILED so the loader stays total.
+    {
+        const std::vector<int64_t> allUids = detail::allTensorUids(graphJson);
+
+        const std::set<int64_t> outputUidSet(bundle.outputTensorUids.begin(),
+                                             bundle.outputTensorUids.end());
+        std::vector<int64_t> inputUids;
+        for(const int64_t uid : allUids)
+        {
+            if(outputUidSet.count(uid) == 0)
+            {
+                inputUids.push_back(uid);
+            }
+        }
+
+        // A graph with no declared inputs cannot be fed; treat as graph-only.
+        const bool inputsPresent
+            = !inputUids.empty() && detail::blobsPresentFor(inputUids, jsonPath);
+        const bool outputsPresent = goldenOutputsPresent; // computed in step 4
+
+        if(inputsPresent)
+        {
+            const auto& graph
+                = *hipdnn_flatbuffers_sdk::data_objects::GetGraph(bundle.graphBuffer.data());
+
+            // uid -> attributes, so we can load a chosen subset of tensors.
+            std::unordered_map<int64_t,
+                               const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes*>
+                attrByUid;
+            for(const auto* attributes : *graph.tensors())
+            {
+                attrByUid[attributes->uid()] = attributes;
+            }
+
+            const auto loadUids = [&](const std::vector<int64_t>& uids, TensorMap& into) {
+                for(const int64_t uid : uids)
+                {
+                    const auto it = attrByUid.find(uid);
+                    if(it == attrByUid.end())
+                    {
+                        continue;
+                    }
+                    into[uid] = hipdnn_test_sdk::utilities::tensorFromFileAndAttributes(
+                        detail::tensorBlobPath(jsonPath, uid), *it->second);
+                }
+            };
+
+            try
+            {
+                TensorMap tensorMap;
+                loadUids(inputUids, tensorMap);
+                if(outputsPresent)
+                {
+                    loadUids(bundle.outputTensorUids, tensorMap);
+                    bundle.hasGoldenOutputs = true;
+                }
+                bundle.tensors = std::move(tensorMap);
+            }
+            catch(const std::exception&)
+            {
+                return LoadError::TENSOR_LOAD_FAILED;
+            }
+        }
+    }
+
+    return bundle;
+}
+
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/bundle/UnverifiableBundleReport.hpp b/dnn-providers/integration-tests/src/harness/bundle/UnverifiableBundleReport.hpp
new file mode 100644
index 000000000000..5ebe1902ba5b
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/bundle/UnverifiableBundleReport.hpp
@@ -0,0 +1,125 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <iostream>
+#include <mutex>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace hipdnn_integration_tests::bundle
+{
+
+// Why a bundle could not be verified. The two severities are printed in separate
+// sections so a genuine reference bug is never lost among expected coverage gaps.
+//
+//   Unverifiable — expected coverage gap (no golden data, no reference can run
+//                  the op, inputs could not be synthesized, ...). The engine was
+//                  not accused; we simply had no oracle. Quiet but listed.
+//   RefError     — a reference executor that CAN run the op threw at runtime
+//                  (case C) and the harness fell through to keep verifying the
+//                  engine. This is a reference bug and must be loud.
+enum class UnverifiableSeverity
+{
+    UNVERIFIABLE,
+    REF_ERROR,
+};
+
+// Process-wide collector of bundles that ended a run without a verdict. Mirrors
+// SupportMatrixCollector: a thread-safe singleton populated during test execution
+// and printed once after RUN_ALL_TESTS(). Records do not affect the GTest exit
+// code — an unverifiable bundle SKIPs; this report is the visible trail.
+class UnverifiableBundleReport
+{
+public:
+    struct Record
+    {
+        std::string bundle; // bundle path / identifier
+        std::string reason; // human-readable explanation
+        UnverifiableSeverity severity;
+    };
+
+    static UnverifiableBundleReport& get()
+    {
+        static UnverifiableBundleReport s_instance;
+        return s_instance;
+    }
+
+    UnverifiableBundleReport(const UnverifiableBundleReport&) = delete;
+    UnverifiableBundleReport& operator=(const UnverifiableBundleReport&) = delete;
+    UnverifiableBundleReport(UnverifiableBundleReport&&) = delete;
+    UnverifiableBundleReport& operator=(UnverifiableBundleReport&&) = delete;
+
+    void record(std::string bundle, std::string reason, UnverifiableSeverity severity)
+    {
+        const std::lock_guard<std::mutex> lock(_mutex);
+        _records.push_back({std::move(bundle), std::move(reason), severity});
+    }
+
+    std::vector<Record> getRecords() const
+    {
+        const std::lock_guard<std::mutex> lock(_mutex);
+        return _records;
+    }
+
+    void reset()
+    {
+        const std::lock_guard<std::mutex> lock(_mutex);
+        _records.clear();
+    }
+
+    // Print both severity sections to `os`. No-op when nothing was recorded.
+    void print(std::ostream& os = std::cout) const
+    {
+        std::vector<Record> records;
+        {
+            const std::lock_guard<std::mutex> lock(_mutex);
+            records = _records;
+        }
+        if(records.empty())
+        {
+            return;
+        }
+
+        printSection(os, records, UnverifiableSeverity::REF_ERROR, "REFERENCE EXECUTOR ERRORS");
+        printSection(os, records, UnverifiableSeverity::UNVERIFIABLE, "UNVERIFIABLE BUNDLES");
+    }
+
+private:
+    UnverifiableBundleReport() = default;
+
+    static void printSection(std::ostream& os,
+                             const std::vector<Record>& records,
+                             UnverifiableSeverity severity,
+                             const char* heading)
+    {
+        size_t count = 0;
+        for(const auto& r : records)
+        {
+            if(r.severity == severity)
+            {
+                ++count;
+            }
+        }
+        if(count == 0)
+        {
+            return;
+        }
+
+        os << "\n==== " << heading << " (" << count << ") ====\n";
+        for(const auto& r : records)
+        {
+            if(r.severity == severity)
+            {
+                os << "  - " << r.bundle << ": " << r.reason << "\n";
+            }
+        }
+    }
+
+    mutable std::mutex _mutex;
+    std::vector<Record> _records;
+};
+
+} // namespace hipdnn_integration_tests::bundle
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
deleted file mode 100644
index db50296cacc3..000000000000
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <filesystem>
-#include <memory>
-#include <ostream>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include <gtest/gtest.h>
-
-#include <hipdnn_data_sdk/utilities/Tensor.hpp>
-#include <hipdnn_data_sdk/utilities/Workspace.hpp>
-#include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
-#include <hipdnn_frontend/Graph.hpp>
-#include <hipdnn_test_sdk/utilities/BundleMetadata.hpp>
-#include <hipdnn_test_sdk/utilities/CpuFpReferenceValidation.hpp>
-#include <hipdnn_test_sdk/utilities/FlatbufferDatatypeMapping.hpp>
-#include <hipdnn_test_sdk/utilities/TensorDiff.hpp>
-#include <hipdnn_test_sdk/utilities/TestTolerances.hpp>
-#include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
-
-#include "harness/SharedHandle.hpp"
-#include "harness/TestConfig.hpp"
-#include "harness/golden/BundleDiscovery.hpp"
-#include "harness/golden/IntegrationTestBundle.hpp"
-
-namespace hipdnn_integration_tests::golden
-{
-
-// Saved expected output tensors, keyed by output tensor UID. Extracted from a
-// loaded bundle's output tensors just before execution: the harness keeps these
-// as the golden reference and zeroes the live tensors so the runner computes
-// into clean buffers.
-using GoldenOutputs
-    = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
-
-class IntegrationGraphGoldenReferenceVerificationHarness : public ::testing::Test
-{
-public:
-    explicit IntegrationGraphGoldenReferenceVerificationHarness(bool requiresDevice)
-        : _requiresDevice(requiresDevice)
-    {
-    }
-
-    // The bundle is loaded once at registration time and shared into the test's
-    // factory; the harness does not load from disk. The path is kept only for
-    // diagnostic messages.
-    void setBundle(std::shared_ptr<IntegrationTestBundle> bundle, std::filesystem::path path)
-    {
-        _bundle = std::move(bundle);
-        _bundlePath = std::move(path);
-    }
-
-protected:
-    // NOLINTNEXTLINE(readability-identifier-naming)
-    void SetUp() override
-    {
-        if(_requiresDevice)
-        {
-            SKIP_IF_NO_DEVICES();
-        }
-
-        if(_bundle == nullptr)
-        {
-            GTEST_SKIP() << "No bundle set";
-        }
-
-        // A graph-only bundle (no tensor data on disk, or .bin not pulled via
-        // DVC) cannot be executed or compared -> SKIP.
-        if(!_bundle->tensors.has_value())
-        {
-            GTEST_SKIP() << "Tensor data not available (graph-only bundle or DVC not pulled?): "
-                         << _bundlePath;
-        }
-
-        applyMetadataGuards();
-    }
-
-    // Save each output tensor's loaded data as the golden reference, then zero
-    // the live tensor so the runner computes into a clean buffer. Returns the
-    // golden map keyed by output UID.
-    GoldenOutputs extractGolden(TensorMap& tensorMap) const
-    {
-        GoldenOutputs golden;
-        const auto wrapper = _bundle->graphWrapper();
-        const auto& tensorAttrMap = wrapper.getTensorMap();
-
-        for(const int64_t uid : _bundle->outputTensorUids)
-        {
-            const auto dataType = tensorAttrMap.at(uid)->data_type();
-            auto& livePtr = tensorMap.at(uid);
-
-            auto zeroed = std::visit(
-                [&](auto nativeType) {
-                    using DataType = decltype(nativeType);
-                    auto tensorPtr = std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>(
-                        new hipdnn_data_sdk::utilities::Tensor<DataType>(livePtr->dims(),
-                                                                         livePtr->strides()));
-                    tensorPtr->fillTensorWithValue(0.f);
-                    return tensorPtr;
-                },
-                hipdnn_test_sdk::utilities::datatypeToNativeVariant(dataType));
-
-            std::swap(zeroed, livePtr); // live map now holds the zero buffer
-            golden[uid] = std::move(zeroed); // golden holds the original data
-        }
-        return golden;
-    }
-
-    // NOLINTNEXTLINE(readability-identifier-naming)
-    void TestBody() override
-    {
-        runGoldenComparison();
-    }
-
-    // Builds the graph from its serialized bytes, selects an engine (honouring
-    // an explicit --engine if given), builds plans, and executes into the
-    // variant pack. "Unsupported graph" is signalled by throwing (the harness
-    // translates that into a SKIP). Genuine build/execute errors use ASSERT_*.
-    virtual void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack)
-    {
-        auto handle = getSharedHandle();
-
-        const std::vector<uint8_t> graphBytes(
-            _bundle->graphBuffer.data(), _bundle->graphBuffer.data() + _bundle->graphBuffer.size());
-
-        hipdnn_frontend::graph::Graph graph;
-        auto err = graph.from_binary(handle, graphBytes);
-        ASSERT_TRUE(err.is_good()) << "from_binary failed: " << err.get_message();
-
-        std::vector<int64_t> engineIds;
-        auto status = graph.get_ranked_engine_ids(engineIds);
-
-        const auto graphSummary = [&] {
-            return std::to_string(_bundle->outputTensorUids.size()) + " output tensor(s), "
-                   + std::to_string(engineIds.size()) + " ranked engine(s)";
-        };
-
-        if(TestConfig::get().hasEngineName())
-        {
-            int64_t targetEngineId = TestConfig::get().getEngineId();
-            if(status.is_bad()
-               || std::find(engineIds.begin(), engineIds.end(), targetEngineId) == engineIds.end())
-            {
-                throw std::runtime_error("Engine " + std::string(TestConfig::get().getEngineName())
-                                         + " does not support this graph (" + graphSummary() + ")");
-            }
-            graph.set_preferred_engine_id_ext(targetEngineId);
-        }
-        else
-        {
-            if(status.is_bad() || engineIds.empty())
-            {
-                throw std::runtime_error("No engine supports this graph (" + graphSummary() + ")");
-            }
-        }
-
-        auto result = graph.create_execution_plans();
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-        result = graph.check_support();
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-        result = graph.build_plans();
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-
-        int64_t workspaceSize = 0;
-        result = graph.get_workspace_size(workspaceSize);
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-        ASSERT_GE(workspaceSize, 0);
-        const hipdnn_data_sdk::utilities::Workspace workspace(static_cast<size_t>(workspaceSize));
-
-        result = graph.execute(handle, variantPack, workspace.get());
-        ASSERT_TRUE(result.is_good()) << result.get_message();
-    }
-
-private:
-    bool _requiresDevice;
-    std::filesystem::path _bundlePath;
-    std::shared_ptr<IntegrationTestBundle> _bundle;
-
-    void runGoldenComparison()
-    {
-        auto& tensorMap = *_bundle->tensors;
-
-        if(_bundle->outputTensorUids.empty())
-        {
-            GTEST_SKIP() << "Bundle has no output tensors to compare: " << _bundlePath;
-        }
-
-        const auto golden = extractGolden(tensorMap);
-
-        // Build the variant pack from the tensor map. Device tests use GPU
-        // pointers (rawDeviceData); CPU-only unit tests use host pointers so
-        // they can run on CI without a GPU.
-        std::unordered_map<int64_t, void*> variantPack;
-        for(auto& [uid, tensor] : tensorMap)
-        {
-            variantPack[uid] = _requiresDevice ? tensor->rawDeviceData() : tensor->rawHostData();
-        }
-
-        // executeGraphThroughEngine signals "unsupported graph" by throwing;
-        // the harness translates that into a SKIP. ASSERT_NO_FATAL_FAILURE
-        // still wraps the call so that a genuine GTest assertion inside the
-        // executor FAILs rather than falling through to the comparison.
-        bool executorThrew = false;
-        std::string executorError;
-        try
-        {
-            ASSERT_NO_FATAL_FAILURE(executeGraphThroughEngine(variantPack));
-        }
-        catch(const std::exception& e)
-        {
-            executorThrew = true;
-            executorError = e.what();
-        }
-
-        if(executorThrew)
-        {
-            GTEST_SKIP() << "Executor could not run bundle " << _bundlePath << ": "
-                         << executorError;
-        }
-
-        for(auto uid : _bundle->outputTensorUids)
-        {
-            if(_requiresDevice)
-            {
-                tensorMap.at(uid)->markDeviceModified();
-            }
-            else
-            {
-                tensorMap.at(uid)->markHostModified();
-            }
-        }
-
-        auto wrapper = _bundle->graphWrapper();
-        const auto& tensorAttrMap = wrapper.getTensorMap();
-
-        for(auto uid : _bundle->outputTensorUids)
-        {
-            auto& actualTensor = *tensorMap.at(uid);
-            auto& expectedTensor = *golden.at(uid);
-
-            auto* attrs = tensorAttrMap.at(uid);
-            auto dataType = attrs->data_type();
-
-            float atol = 0.0f;
-            float rtol = 0.0f;
-            resolveTolerances(wrapper, dataType, atol, rtol);
-
-            compareOutputTensor(uid, *attrs, dataType, expectedTensor, actualTensor, atol, rtol);
-        }
-    }
-
-    // Compare one output tensor against its golden reference via the allClose
-    // validator (which covers both CPU and GPU validation paths). Only on failure
-    // do we compute and report the element-wise tensor diff for diagnostics.
-    void compareOutputTensor(int64_t uid,
-                             const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
-                             hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
-                             hipdnn_data_sdk::utilities::ITensor& expected,
-                             hipdnn_data_sdk::utilities::ITensor& actual,
-                             float atol,
-                             float rtol) const
-    {
-        auto validator = hipdnn_test_sdk::utilities::createAllCloseValidator(dataType, atol, rtol);
-        const bool passed = validator->allClose(expected, actual);
-
-        if(!passed)
-        {
-            std::ostringstream report;
-            report << reportHeader(uid, attrs, dataType, expected, atol, rtol);
-            appendTensorDiff(report, uid, attrs, dataType, expected, actual, atol, rtol);
-            EXPECT_TRUE(false) << report.str();
-        }
-    }
-
-    // Appends an element-wise diff summary for FP types; non-FP types get a
-    // generic note (computeTensorDiff has no integer specialization).
-    static void
-        appendTensorDiff(std::ostream& os,
-                         int64_t uid,
-                         const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
-                         hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
-                         hipdnn_data_sdk::utilities::ITensor& expected,
-                         hipdnn_data_sdk::utilities::ITensor& actual,
-                         float atol,
-                         float rtol)
-    {
-        using DT = hipdnn_flatbuffers_sdk::data_objects::DataType;
-        using hipdnn_data_sdk::types::bfloat16;
-        using hipdnn_data_sdk::types::half;
-
-        switch(dataType)
-        {
-        case DT::FLOAT:
-            appendFpDiff<float>(os, uid, attrs, expected, actual, atol, rtol);
-            return;
-        case DT::HALF:
-            appendFpDiff<half>(os, uid, attrs, expected, actual, atol, rtol);
-            return;
-        case DT::BFLOAT16:
-            appendFpDiff<bfloat16>(os, uid, attrs, expected, actual, atol, rtol);
-            return;
-        case DT::DOUBLE:
-            appendFpDiff<double>(os, uid, attrs, expected, actual, atol, rtol);
-            return;
-        default:
-            os << "  (no element-wise diff available for this data type)\n";
-        }
-    }
-
-    template <typename T>
-    static void appendFpDiff(std::ostream& os,
-                             int64_t uid,
-                             const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
-                             hipdnn_data_sdk::utilities::ITensor& expected,
-                             hipdnn_data_sdk::utilities::ITensor& actual,
-                             float atol,
-                             float rtol)
-    {
-        const auto summary
-            = hipdnn_test_sdk::utilities::computeTensorDiff<T>(expected, actual, atol, rtol);
-        hipdnn_test_sdk::utilities::printTensorDiffSummary(os, labelFor(uid, attrs), summary);
-    }
-
-    // The human-readable label for an output tensor: its name if it has one,
-    // otherwise "uid=N".
-    static std::string labelFor(int64_t uid,
-                                const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs)
-    {
-        const auto* name = attrs.name();
-        return (name != nullptr && !name->empty()) ? name->str() : ("uid=" + std::to_string(uid));
-    }
-
-    // Common header for a failed comparison (RFC 0011 §4.3 "What a failure looks
-    // like"): bundle path, tensor UID/name, shape + dtype, and tolerance. The
-    // per-element diff (worst index, expected/actual/abs-diff, mismatch count) is
-    // appended by the caller from the TensorDiffSummary it already computed.
-    std::string reportHeader(int64_t uid,
-                             const hipdnn_flatbuffers_sdk::data_objects::TensorAttributes& attrs,
-                             hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
-                             hipdnn_data_sdk::utilities::ITensor& expected,
-                             float atol,
-                             float rtol) const
-    {
-        std::ostringstream os;
-        os << "\nGolden comparison FAILED\n"
-           << "  Bundle: " << _bundlePath << "\n"
-           << "  Tensor: " << labelFor(uid, attrs) << " (UID " << uid << ", output)\n"
-           << "  Shape:  " << hipdnn_test_sdk::utilities::StreamVec(expected.dims()) << "  "
-           << dataTypeName(dataType) << "\n"
-           << "  Tolerance: atol=" << atol << " rtol=" << rtol << "\n";
-        return os.str();
-    }
-
-    static std::string dataTypeName(hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
-    {
-        return hipdnn_flatbuffers_sdk::data_objects::EnumNameDataType(dataType);
-    }
-
-    static void
-        resolveTolerances(const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-                          hipdnn_flatbuffers_sdk::data_objects::DataType dataType,
-                          float& atol,
-                          float& rtol)
-    {
-        const float defaultTolerance = deriveDefaultTolerance(wrapper, dataType);
-        atol = defaultTolerance;
-        rtol = defaultTolerance;
-    }
-
-    template <typename T>
-    static float
-        toleranceForNodeAttributes(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType)
-    {
-        using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
-        namespace tol = hipdnn_test_sdk::utilities;
-
-        switch(attrType)
-        {
-        case NA::ConvolutionFwdAttributes:
-            return tol::conv::getToleranceFwd<T>();
-        case NA::ConvolutionBwdAttributes:
-            return tol::conv::getToleranceBwd<T>();
-        case NA::ConvolutionWrwAttributes:
-            return tol::conv::getToleranceWrw<T>();
-        case NA::BatchnormInferenceAttributes:
-            return tol::batchnorm::getToleranceInference<T>();
-        case NA::BatchnormInferenceAttributesVarianceExt:
-            return tol::batchnorm::getToleranceInferenceWithVariance<T>();
-        case NA::BatchnormAttributes:
-            return tol::batchnorm::getToleranceTraining<T>();
-        case NA::BatchnormBackwardAttributes:
-            return tol::batchnorm::getToleranceBackward<T>();
-        case NA::MatmulAttributes:
-            return tol::matmul::getTolerance<T>();
-        case NA::ReductionAttributes:
-            return tol::reduction::getTolerance<T>();
-        case NA::RMSNormAttributes:
-            return tol::rmsnorm::getTolerance<T>();
-        case NA::PointwiseAttributes:
-            return tol::pointwise::getTolerance<T>();
-        case NA::LayernormAttributes:
-            return tol::layernorm::getTolerance<T>();
-        default:
-            return 1e-3f;
-        }
-    }
-
-    // A bundle graph may fuse several ops (e.g. Convolution + Pointwise
-    // activation). Each op type has its own numerical tolerance, so the only
-    // tolerance that holds for the fused output is the loosest one across all
-    // nodes: a tolerance tight enough for Conv (e.g. 1e-3) would wrongly fail an
-    // activation output that legitimately needs 1e-2. We therefore take the max
-    // tolerance over every node rather than picking a single "root" node.
-    static float deriveDefaultTolerance(
-        const hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper& wrapper,
-        hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
-    {
-        const auto nodeCount = wrapper.nodeCount();
-
-        bool found = false;
-        float maxTolerance = 0.0f;
-        for(uint32_t i = 0; i < nodeCount; ++i)
-        {
-            const auto attrType = wrapper.getNode(i).attributes_type();
-            const float nodeTolerance = toleranceForDataType(attrType, dataType);
-            maxTolerance = found ? std::max(maxTolerance, nodeTolerance) : nodeTolerance;
-            found = true;
-        }
-
-        return found ? maxTolerance : 1e-3f;
-    }
-
-    // Dispatch a single node's tolerance lookup on the bundle's data type.
-    static float toleranceForDataType(hipdnn_flatbuffers_sdk::data_objects::NodeAttributes attrType,
-                                      hipdnn_flatbuffers_sdk::data_objects::DataType dataType)
-    {
-        using DT = hipdnn_flatbuffers_sdk::data_objects::DataType;
-        using hipdnn_data_sdk::types::bfloat16;
-        using hipdnn_data_sdk::types::half;
-
-        switch(dataType)
-        {
-        case DT::FLOAT:
-            return toleranceForNodeAttributes<float>(attrType);
-        case DT::HALF:
-            return toleranceForNodeAttributes<half>(attrType);
-        case DT::BFLOAT16:
-            return toleranceForNodeAttributes<bfloat16>(attrType);
-        default:
-            return 1e-3f;
-        }
-    }
-
-    void applyMetadataGuards() const
-    {
-        // metadata is mandatory, so a loaded bundle always has it (a bundle with
-        // no .meta.json fails to load and never reaches here). Individual fields
-        // (VRAM, arch) are still optional within BundleMetadata; the guards below
-        // no-op when their field is absent, so they can be called unconditionally.
-        if(auto reason = hipdnn_test_sdk::utilities::checkVramRequirement(
-               _bundle->metadata, TestConfig::get().getCurrentDeviceVramMb()))
-        {
-            GTEST_SKIP() << *reason;
-        }
-
-        if(auto reason = hipdnn_test_sdk::utilities::checkArchCompatibility(
-               _bundle->metadata, TestConfig::get().getCurrentArch()))
-        {
-            GTEST_SKIP() << *reason;
-        }
-    }
-};
-
-} // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp b/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
deleted file mode 100644
index a05f33612437..000000000000
--- a/dnn-providers/integration-tests/src/harness/golden/IntegrationTestBundle.hpp
+++ /dev/null
@@ -1,231 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#pragma once
-
-#include <cstdint>
-#include <filesystem>
-#include <fstream>
-#include <memory>
-#include <optional>
-#include <unordered_map>
-#include <variant>
-#include <vector>
-
-#include <nlohmann/json.hpp>
-
-#include <hipdnn_data_sdk/utilities/Tensor.hpp>
-#include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
-#include <hipdnn_flatbuffers_sdk/utilities/json/Graph.hpp>
-#include <hipdnn_test_sdk/utilities/BundleMetadata.hpp>
-#include <hipdnn_test_sdk/utilities/LoadGraphAndTensors.hpp>
-
-namespace hipdnn_integration_tests::golden
-{
-
-// Loaded tensors keyed by tensor UID. Holds every tensor declared by the graph —
-// inputs carry their data, output tensors carry their expected (golden) values as
-// loaded from the .bin blobs. The harness saves the outputs as golden and zeroes
-// them before execution.
-using TensorMap = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
-
-// One test's worth of bundle data loaded from disk.
-//
-//   graphBuffer      — the parsed graph, as a flatbuffer. Always present in a
-//                      loaded bundle; the engine deserializes it (from_binary)
-//                      and the harness walks it (GraphWrapper) for dtypes and
-//                      tolerances. A bundle that cannot even produce a graph is a
-//                      LoadError, not a bundle.
-//   metadata         — .meta.json contents (VRAM / arch guards). MANDATORY: a
-//                      bundle without a valid .meta.json is a LoadError, so a
-//                      loaded bundle always carries real metadata.
-//   outputTensorUids — UIDs of the graph's output tensors, derived from the
-//                      graph. Always available (even for a graph-only bundle),
-//                      so the harness knows which tensors to compare / allocate.
-//   tensors          — the loaded tensor data. Absent (nullopt) for a graph-only
-//                      bundle (no "tensors" in the graph, or .bin data not pulled
-//                      via DVC); such a bundle cannot be executed/compared and
-//                      the harness SKIPs it.
-struct IntegrationTestBundle
-{
-    flatbuffers::DetachedBuffer graphBuffer;
-    hipdnn_test_sdk::utilities::BundleMetadata metadata;
-    std::vector<int64_t> outputTensorUids;
-    std::optional<TensorMap> tensors;
-
-    // View over the graph flatbuffer, valid as long as this bundle lives.
-    hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper graphWrapper() const
-    {
-        return hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper{graphBuffer.data(),
-                                                                          graphBuffer.size()};
-    }
-};
-
-// Why a load did NOT produce a bundle. These are the FAIL outcomes — an authoring
-// error in the bundle itself. (A bundle that loads but lacks tensor data is a
-// successfully-loaded graph-only bundle, not a LoadError; the harness SKIPs it.)
-enum class LoadError
-{
-    MALFORMED_JSON, // the graph .json is not syntactically valid JSON
-    INVALID_GRAPH_SCHEMA, // valid JSON, but not a valid graph (cannot build flatbuffer)
-    MISSING_METADATA, // required .meta.json companion is absent or invalid
-    TENSOR_LOAD_FAILED // a tensor .bin is present but failed to load (wrong size,
-    // unreadable, unsupported dtype, ...)
-};
-
-// A load either yields a bundle or explains why it could not. std::visit at the
-// call site forces both cases to be handled.
-using LoadResult = std::variant<IntegrationTestBundle, LoadError>;
-
-inline const char* toString(LoadError error)
-{
-    switch(error)
-    {
-    case LoadError::MALFORMED_JSON:
-        return "graph JSON is not parseable";
-    case LoadError::INVALID_GRAPH_SCHEMA:
-        return "graph JSON is not a valid graph";
-    case LoadError::MISSING_METADATA:
-        return "missing or invalid .meta.json companion";
-    case LoadError::TENSOR_LOAD_FAILED:
-        return "tensor .bin present but failed to load";
-    default:
-        return "unknown load error";
-    }
-}
-
-namespace detail
-{
-
-// True iff every tensor declared in the graph has its companion .bin blob on
-// disk. The blob path is "{stem}.tensor{uid}.bin", matching the loader's own
-// derivation. A graph with no "tensors" array is graph-only -> returns false.
-inline bool allTensorBlobsPresent(const nlohmann::json& graphJson,
-                                  const std::filesystem::path& jsonPath)
-{
-    if(!graphJson.contains("tensors") || !graphJson.at("tensors").is_array()
-       || graphJson.at("tensors").empty())
-    {
-        return false;
-    }
-
-    auto basePath = jsonPath;
-    basePath.replace_extension();
-    for(const auto& tensor : graphJson.at("tensors"))
-    {
-        if(!tensor.contains("uid"))
-        {
-            return false;
-        }
-        const auto uid = tensor.at("uid").get<int64_t>();
-        const auto binPath
-            = std::filesystem::path(basePath.string() + ".tensor" + std::to_string(uid) + ".bin");
-        if(!std::filesystem::exists(binPath))
-        {
-            return false;
-        }
-    }
-    return true;
-}
-
-} // namespace detail
-
-// Load a bundle from its graph .json path, classifying the outcome.
-//
-// This deliberately does NOT call test_sdk's loadGraphAndTensors(), whose
-// all-or-nothing contract ("graph AND at least one tensor, or throw") conflicts
-// with our design where a graph-only bundle is legitimate. Instead it composes
-// the same test_sdk primitives (json -> flatbuffer graph, per-tensor blob load)
-// under our own policy:
-//
-//   * graph .json not parseable           -> LoadError::MALFORMED_JSON      (FAIL)
-//   * parseable but not a valid graph     -> LoadError::INVALID_GRAPH_SCHEMA(FAIL)
-//   * valid graph, no .meta.json companion-> LoadError::MISSING_METADATA    (FAIL)
-//   * valid graph, tensor .bin data absent-> bundle with tensors == nullopt (SKIP)
-//   * valid graph, .bin present but broken-> LoadError::TENSOR_LOAD_FAILED  (FAIL)
-//   * valid graph, all .bin present       -> fully loaded bundle            (RUN)
-//
-// The function is total: it never lets an exception escape. Every outcome is
-// either a loaded bundle or a classified LoadError.
-inline LoadResult loadIntegrationTestBundle(const std::filesystem::path& jsonPath)
-{
-    // 1. Read and parse the graph .json. Unreadable or unparseable -> FAIL.
-    std::ifstream stream(jsonPath);
-    if(!stream)
-    {
-        return LoadError::MALFORMED_JSON;
-    }
-
-    const auto graphJson = nlohmann::json::parse(stream, nullptr, /*allow_exceptions=*/false);
-    if(graphJson.is_discarded())
-    {
-        return LoadError::MALFORMED_JSON;
-    }
-
-    // 2. Verify the graph by building the flatbuffer. A structurally invalid
-    //    graph throws -> INVALID_GRAPH_SCHEMA.
-    flatbuffers::FlatBufferBuilder builder;
-    try
-    {
-        auto offset = hipdnn_flatbuffers_sdk::json::to<hipdnn_flatbuffers_sdk::data_objects::Graph>(
-            builder, graphJson);
-        builder.Finish(offset);
-    }
-    catch(const std::exception&)
-    {
-        return LoadError::INVALID_GRAPH_SCHEMA;
-    }
-
-    // 3. Metadata is MANDATORY: every valid-graph bundle must ship a valid
-    //    .meta.json companion. loadBundleMetadata returns nullopt both when the
-    //    file is absent and when it is present but invalid (bad JSON / bad
-    //    format_version) — either way it is an authoring error -> FAIL.
-    auto metadata = hipdnn_test_sdk::utilities::loadBundleMetadata(jsonPath);
-    if(!metadata.has_value())
-    {
-        return LoadError::MISSING_METADATA;
-    }
-
-    // 4. Graph + metadata verified: capture them and the output UIDs (always
-    //    available, even for a graph-only bundle).
-    IntegrationTestBundle bundle;
-    bundle.graphBuffer = builder.Release();
-    bundle.metadata = std::move(*metadata);
-    bundle.outputTensorUids = hipdnn_test_sdk::utilities::getOutputTensorUidsFromGraph(graphJson);
-
-    // 5. Load tensor .bin data if every blob is present; otherwise leave
-    //    tensors == nullopt (graph-only bundle -> harness SKIPs). A blob that is
-    //    present but fails to load (wrong size, unreadable, unsupported dtype)
-    //    throws inside tensorFromFileAndAttributes; we catch it here and classify
-    //    it as TENSOR_LOAD_FAILED so the loader is total (every outcome is either
-    //    a bundle or a named LoadError, never a raw escaping exception).
-    if(detail::allTensorBlobsPresent(graphJson, jsonPath))
-    {
-        const auto& graph
-            = *hipdnn_flatbuffers_sdk::data_objects::GetGraph(bundle.graphBuffer.data());
-        auto basePath = jsonPath;
-        basePath.replace_extension();
-
-        try
-        {
-            TensorMap tensorMap;
-            for(const auto* attributes : *graph.tensors())
-            {
-                const auto tensorPath
-                    = basePath.string() + ".tensor" + std::to_string(attributes->uid()) + ".bin";
-                tensorMap[attributes->uid()]
-                    = hipdnn_test_sdk::utilities::tensorFromFileAndAttributes(tensorPath,
-                                                                              *attributes);
-            }
-            bundle.tensors = std::move(tensorMap);
-        }
-        catch(const std::exception&)
-        {
-            return LoadError::TENSOR_LOAD_FAILED;
-        }
-    }
-
-    return bundle;
-}
-
-} // namespace hipdnn_integration_tests::golden
diff --git a/dnn-providers/integration-tests/src/harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp b/dnn-providers/integration-tests/src/harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp
index 671dcb248d19..ab5aea01933f 100644
--- a/dnn-providers/integration-tests/src/harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp
+++ b/dnn-providers/integration-tests/src/harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp
@@ -8,6 +8,7 @@
 
 #include "detail/GpuPlanBuilderRegistry.hpp"
 #include "harness/IReferenceGraphExecutor.hpp"
+#include "harness/ReferenceCapabilityError.hpp"
 
 namespace hipdnn_integration_tests::gpu_graph_executor
 {
@@ -83,8 +84,8 @@ class GpuReferenceGraphExecutor : public IReferenceGraphExecutor
         {
             const std::string nodeName
                 = node.name() == nullptr ? " unknown" : " " + node.name()->str();
-            throw std::runtime_error("GPU plan builder is not applicable for the given node:"
-                                     + nodeName);
+            throw ReferenceCapabilityError("GPU plan builder is not applicable for the given node:"
+                                           + nodeName);
         }
 
         return planBuilder.buildNodePlan(graph, node);
@@ -123,15 +124,17 @@ class GpuReferenceGraphExecutor : public IReferenceGraphExecutor
         case NodeAttrs::BlockScaleQuantizeAttributes:
         {
             const std::string nodeName = node.name() == nullptr ? "unknown" : node.name()->str();
-            throw std::runtime_error("GPU plan not yet implemented for node '" + nodeName
-                                     + "'. Register a GPU plan for this operation type.");
+            throw ReferenceCapabilityError("GPU plan not yet implemented for node '" + nodeName
+                                           + "'. Register a GPU plan for this operation type.");
         }
 
         case NodeAttrs::CustomOpAttributes:
-            throw std::runtime_error("GPU reference executor does not support custom operations");
+            throw ReferenceCapabilityError(
+                "GPU reference executor does not support custom operations");
 
         default:
-            throw std::runtime_error("Unsupported node type for GPU signature key generation");
+            throw ReferenceCapabilityError(
+                "Unsupported node type for GPU signature key generation");
         }
     }
 
diff --git a/dnn-providers/integration-tests/src/harness/input_init/SynthesisTracker.hpp b/dnn-providers/integration-tests/src/harness/input_init/SynthesisTracker.hpp
new file mode 100644
index 000000000000..3a5ca814438e
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/input_init/SynthesisTracker.hpp
@@ -0,0 +1,230 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <set>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
+
+namespace hipdnn_integration_tests
+{
+
+// Pre-allocated input tensors keyed by uid, handed to a fill function to populate.
+using InputTensorMap
+    = std::unordered_map<int64_t, std::unique_ptr<hipdnn_data_sdk::utilities::ITensor>>;
+
+// Result of a synthesis step — returned by per-node fill functions and by
+// tracker.finish(). filled==true means synthesis can proceed; filled==false
+// means at least one input could not be synthesized — reason says which and why.
+struct SynthesisResult
+{
+    bool filled = false;
+    std::string reason;
+
+    static SynthesisResult ok()
+    {
+        return {true, {}};
+    }
+    static SynthesisResult unsupported(std::string why)
+    {
+        return {false, std::move(why)};
+    }
+};
+
+// Tracks which leaf inputs of a bundle's graph have been accounted for by the
+// per-node fill functions. A bundle contains a graph of one or more nodes — a
+// single conv, or a fused chain like conv → bias_add → relu. One tracker is
+// created for the entire graph's leaf inputs (non-virtual, non-output tensors),
+// shared across all fill functions, and finish() is called once at the end.
+//
+// Graph structure (conv + bias + relu fused graph):
+//
+//   Data flows top-down. Roots are the leaf input tensors that the tracker
+//   owns; the sink is the graph output tensor.
+//
+//        x (root/leaf)  w (root/leaf)  bias (root/leaf)
+//         uid=1          uid=2           uid=4
+//           \             /                |
+//            \           /                 |
+//         ┌──────────────┐                 |
+//         │   ConvFwd    │  (internal)     |
+//         └──────┬───────┘                 |
+//                |                         |
+//          conv_y (virtual, uid=10)        |
+//                |                         |
+//                \                        /
+//              ┌──────────────────────┐
+//              │   Pointwise ADD      │  (internal)
+//              └──────────┬───────────┘
+//                         |
+//                   bias_out (virtual, uid=11)
+//                         |
+//              ┌──────────┴───────────┐
+//              │   Pointwise RELU     │  (internal)
+//              └──────────┬───────────┘
+//                         |
+//                    out (sink/leaf, uid=6)
+//
+//   Roots  = leaf input tensors, owned by tracker: {1, 2, 4}
+//   Virtual = inter-node edges, not owned → fillFree/markDerived skip them
+//   Sink   = graph output tensor, not owned
+//
+// Each leaf input must be declared as one of three mutually exclusive roles:
+//
+//   FREE       — random values in a range work. The range can be tight (e.g.
+//                variance in [0.5, 1.5] to stay positive) or wide (e.g. x in
+//                [-1, 1]). What matters is that any value in the range is valid.
+//
+//   STRUCTURED — random values in any range won't work. The data needs to be
+//                consistent with other state or follow a specific format.
+//
+//                Example 1: dropout seeds — forward and backward must use the
+//                same seed so they generate the same drop pattern. A randomly
+//                synthesized seed for a standalone backward won't match any
+//                forward pass, producing wrong gradients.
+//
+//                Example 2: page table indices (paged attention) — when serving
+//                multiple users, each user's K and V data grows at different
+//                rates. Instead of pre-allocating a large contiguous block per
+//                user, GPU memory is pooled into equal-size chunks handed out
+//                on demand. A user's data ends up scattered across
+//                non-contiguous chunks. The page table tensor holds chunk
+//                indices telling the kernel where each user's data lives.
+//                Randomly generated indices would not correspond to valid
+//                allocated chunks, producing incorrect reads or crashes.
+//
+//                Example 3: peer_stats (multi-GPU batchnorm) — when a batch
+//                is split across multiple GPUs, each GPU computes local
+//                statistics (mean, variance) for its chunk. To produce
+//                correct global statistics, each GPU must read the others'
+//                partial results. The peer_stats tensor holds references to
+//                other GPUs' memory regions. Randomly generated values would
+//                point to invalid cross-device memory.
+//
+//   DERIVED    — the value must come from another op's output, not from random
+//                generation. In a fused fwd+bwd graph the forward output flows
+//                to the backward input as a virtual tensor (not owned, silently
+//                skipped). In a standalone backward, the same tensor is a leaf
+//                input. markDerived records it and lets finish() refuse (SKIP),
+//                used when no recipe exists to produce a consistent value.
+//
+// finish() succeeds only when every owned leaf input was declared as some role
+// AND none were STRUCTURED or DERIVED. Undeclared inputs and refused inputs both
+// produce a diagnostic message so the caller knows what went wrong.
+//
+// PRECONDITION — a validated, well-formed graph. The tracker trusts the leaf
+// set it is handed and the virtual_ flag on every tensor:
+//
+//   * A required input referenced by a node is assumed to be a real leaf tensor
+//     (not mislabeled virtual or aliased to an output). If it were, fillFree
+//     would silently no-op on a non-owned uid and finish() would never see it.
+//   * A virtual tensor is assumed to genuinely have a producer node. A standalone
+//     backward whose `o`/`stats` were erroneously flagged virtual would skip the
+//     markDerived refusal and "succeed" with garbage.
+//
+// Both of those malformed-graph states are rejected upstream — at bundle load
+// (the flatbuffer build in loadIntegrationTestBundle) and again by the engine's
+// own graph validation (from_binary / check_support / build_plans), which
+// requires every virtual tensor to have a producer. By the time synthesis runs,
+// the graph is well-formed, so the tracker does not re-validate topology.
+class SynthesisTracker
+{
+public:
+    SynthesisTracker(const std::vector<int64_t>& ownedLeafInputUids, InputTensorMap& inputs)
+        : _inputs(inputs)
+        , _owned(ownedLeafInputUids.begin(), ownedLeafInputUids.end())
+    {
+    }
+
+    // Declares `uid` as FREE — fills it with random values in [lo, hi] and accounts for it.
+    void fillFree(int64_t uid, float lo, float hi, std::mt19937& rng)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        const auto seed = static_cast<unsigned int>(rng());
+        _inputs.at(uid)->fillTensorWithRandomValues(lo, hi, seed);
+        _accounted.insert(uid);
+    }
+
+    // Declares `uid` as STRUCTURED — accounts for it but records a refusal.
+    void markStructured(int64_t uid, const char* role)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        _accounted.insert(uid);
+        _refusals.push_back(std::string(role) + " (structured input)");
+    }
+
+    // Declares `uid` as DERIVED — accounts for it but records a refusal.
+    void markDerived(int64_t uid, const char* role)
+    {
+        if(!isOwned(uid))
+        {
+            return;
+        }
+        _accounted.insert(uid);
+        _refusals.push_back(std::string(role) + " (derived from another computation)");
+    }
+
+    // Returns ok() when all owned leaf inputs were filled with random data.
+    // Returns unsupported() when synthesis cannot produce valid data for
+    // this graph — either because a leaf input is STRUCTURED/DERIVED
+    // (we know about it but can't fill it), or because a leaf input was
+    // never declared by any node's fill function.
+    // Note: virtual inter-node tensors are not owned, so STRUCTURED/DERIVED
+    // calls on them are silently ignored. Absent optional tensors (uid 0 by
+    // hipdnn convention) are the caller's responsibility — fill functions
+    // should guard against calling fillFree/markStructured on uid 0 when the
+    // attribute means "not present."
+    SynthesisResult finish(const char* opName) const
+    {
+        std::vector<std::string> reasons = _refusals;
+        for(const int64_t uid : _owned)
+        {
+            if(_accounted.count(uid) == 0)
+            {
+                reasons.push_back("tensor uid=" + std::to_string(uid)
+                                  + " (no role declared by initializer)");
+            }
+        }
+
+        if(reasons.empty())
+        {
+            return SynthesisResult::ok();
+        }
+
+        std::ostringstream os;
+        os << opName << " inputs cannot be synthesized: ";
+        for(size_t i = 0; i < reasons.size(); ++i)
+        {
+            os << (i == 0 ? "" : ", ") << reasons[i];
+        }
+        return SynthesisResult::unsupported(os.str());
+    }
+
+private:
+    bool isOwned(int64_t uid) const
+    {
+        return _owned.count(uid) != 0;
+    }
+
+    InputTensorMap& _inputs; // leaf inputs only (non-virtual, non-output tensors)
+    std::set<int64_t> _owned;
+    std::set<int64_t> _accounted;
+    std::vector<std::string> _refusals;
+};
+
+} // namespace hipdnn_integration_tests
diff --git a/dnn-providers/integration-tests/src/harness/input_init/SynthesizeInputs.hpp b/dnn-providers/integration-tests/src/harness/input_init/SynthesizeInputs.hpp
new file mode 100644
index 000000000000..1094cb5c1806
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/input_init/SynthesizeInputs.hpp
@@ -0,0 +1,495 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "harness/input_init/SynthesisTracker.hpp"
+
+namespace hipdnn_integration_tests
+{
+
+// ── Per-op fill functions ─────────────────────────────────────────────────────
+// Each function declares inputs for one node in the graph. A single
+// SynthesisTracker is shared across all nodes in the graph — the caller
+// (synthesizeInputs in the harness .cpp) creates it with the whole-graph leaf
+// input UIDs, passes it through each fill function, then calls finish() once
+// after all nodes have been processed. This graph-level tracking is essential
+// for fused/multi-node graphs: each node only accounts for its own UIDs, and
+// the final finish() verifies that every leaf input was covered by some node.
+//
+// Every function follows the same pattern:
+//   1. Cast the node to its concrete attribute type.
+//   2. Declare each input as FREE (fill with random values), STRUCTURED (can't
+//      synthesize — needs specific format), or DERIVED (must come from another
+//      op's output). See SynthesisTracker.hpp for role definitions.
+//   3. Return ok() if the attribute cast succeeded, or unsupported() if not.
+//
+// Fills must be deterministic given `rng` so re-running the same graph produces
+// identical inputs for reproducible comparisons.
+//
+// To add a new op: copy fillConvFwdInputs (simplest example), adapt for your
+// op's attributes, and add one case to the switch in synthesizeNodeInputs().
+// Function names follow the pattern fill<AttributeName>Inputs.
+
+// ── Convolution ───────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillConvFwdInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                         SynthesisTracker& tracker,
+                                         std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ConvolutionFwdAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not ConvolutionFwdAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->w_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+inline SynthesisResult fillConvBwdDataInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                             SynthesisTracker& tracker,
+                                             std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ConvolutionBwdAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not ConvolutionBwdAttributes");
+    }
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->w_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+inline SynthesisResult
+    fillConvBwdWeightsInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                             SynthesisTracker& tracker,
+                             std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ConvolutionWrwAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not ConvolutionWrwAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// ── Batchnorm ─────────────────────────────────────────────────────────────────
+
+inline SynthesisResult
+    fillBatchnormInferenceInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                 SynthesisTracker& tracker,
+                                 std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BatchnormInferenceAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not BatchnormInferenceAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->mean_tensor_uid(), -0.1f, 0.1f, rng);
+    tracker.fillFree(a->inv_variance_tensor_uid(), 0.5f, 1.5f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+inline SynthesisResult
+    fillBatchnormInferenceVarianceInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                         SynthesisTracker& tracker,
+                                         std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BatchnormInferenceAttributesVarianceExt();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not BatchnormInferenceAttributesVarianceExt");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->mean_tensor_uid(), -1.0f, 1.0f, rng);
+    // Variance must be non-negative
+    tracker.fillFree(a->variance_tensor_uid(), 0.1f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// peer_stats holds references to other GPUs' memory for multi-GPU batchnorm —
+// randomly generated values would point to invalid cross-device memory.
+inline SynthesisResult
+    fillBatchnormTrainingInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                SynthesisTracker& tracker,
+                                std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BatchnormAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not BatchnormAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -2.0f, 2.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -2.0f, 2.0f, rng);
+    tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    tracker.fillFree(a->prev_running_mean_tensor_uid().value_or(0), -2.0f, 2.0f, rng);
+    tracker.fillFree(a->prev_running_variance_tensor_uid().value_or(0), -2.0f, 2.0f, rng);
+    tracker.fillFree(a->momentum_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
+
+    if(a->peer_stats_tensor_uid() != nullptr)
+    {
+        for(const int64_t uid : *a->peer_stats_tensor_uid())
+        {
+            tracker.markStructured(uid, "peer_stats");
+        }
+    }
+
+    return SynthesisResult::ok();
+}
+
+// mean/inv_variance are optional (may come from forward). peer_stats: see above.
+inline SynthesisResult
+    fillBatchnormBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                SynthesisTracker& tracker,
+                                std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BatchnormBackwardAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not BatchnormBackwardAttributes");
+    }
+    tracker.fillFree(a->dy_tensor_uid(), -0.1f, 0.1f, rng);
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->mean_tensor_uid().value_or(0), -0.1f, 0.1f, rng);
+    tracker.fillFree(a->inv_variance_tensor_uid().value_or(0), 1.9f, 2.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -0.1f, 0.1f, rng);
+
+    if(a->peer_stats_tensor_uid() != nullptr)
+    {
+        for(const int64_t uid : *a->peer_stats_tensor_uid())
+        {
+            tracker.markStructured(uid, "peer_stats");
+        }
+    }
+
+    return SynthesisResult::ok();
+}
+
+// ── Matmul ────────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillMatmulInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                        SynthesisTracker& tracker,
+                                        std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_MatmulAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not MatmulAttributes");
+    }
+    tracker.fillFree(a->a_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->b_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// ── Pointwise ─────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillPointwiseInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                           SynthesisTracker& tracker,
+                                           std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_PointwiseAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not PointwiseAttributes");
+    }
+    tracker.fillFree(a->in_0_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->in_1_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->in_2_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->axis_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// ── Reduction ─────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillReductionInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                           SynthesisTracker& tracker,
+                                           std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ReductionAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not ReductionAttributes");
+    }
+    tracker.fillFree(a->in_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// ── LayerNorm ─────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillLayernormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                           SynthesisTracker& tracker,
+                                           std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_LayernormAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not LayernormAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// mean and inv_variance are computed by the forward pass — a standalone backward
+// can't produce correct gradients without them.
+inline SynthesisResult
+    fillLayernormBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                SynthesisTracker& tracker,
+                                std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_LayernormBackwardAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not LayernormBackwardAttributes");
+    }
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.markDerived(a->mean_tensor_uid().value_or(0), "mean (forward output)");
+    tracker.markDerived(a->inv_variance_tensor_uid().value_or(0), "inv_variance (forward output)");
+    tracker.fillFree(a->epsilon_tensor_uid().value_or(0), 0.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// ── RMSNorm ───────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillRmsnormInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                         SynthesisTracker& tracker,
+                                         std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_RMSNormAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not RMSNormAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->epsilon_tensor_uid(), 0.0f, 1.0f, rng);
+    tracker.fillFree(a->bias_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// inv_rms is computed by the forward pass.
+inline SynthesisResult
+    fillRmsnormBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                              SynthesisTracker& tracker,
+                              std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_RMSNormBackwardAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not RMSNormBackwardAttributes");
+    }
+    tracker.fillFree(a->dy_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.markDerived(a->inv_rms_tensor_uid(), "inv_rms (forward output)");
+    return SynthesisResult::ok();
+}
+
+// ── Resample ──────────────────────────────────────────────────────────────────
+
+inline SynthesisResult fillResampleFwdInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                             SynthesisTracker& tracker,
+                                             std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_ResampleFwdAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not ResampleFwdAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// ── Block-scale quantization ──────────────────────────────────────────────────
+
+// Scale tensor holds per-block quantization factors that must match the
+// quantized data — random scales would produce garbage dequantized values.
+inline SynthesisResult
+    fillBlockScaleDequantizeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                   SynthesisTracker& tracker,
+                                   std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BlockScaleDequantizeAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not BlockScaleDequantizeAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.markStructured(a->scale_tensor_uid(), "scale (block quantization scales)");
+    return SynthesisResult::ok();
+}
+
+inline SynthesisResult
+    fillBlockScaleQuantizeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                 SynthesisTracker& tracker,
+                                 std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_BlockScaleQuantizeAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not BlockScaleQuantizeAttributes");
+    }
+    tracker.fillFree(a->x_tensor_uid(), -1.0f, 1.0f, rng);
+    return SynthesisResult::ok();
+}
+
+// ── SDPA ──────────────────────────────────────────────────────────────────────
+
+// Q/K/V/mask accept random values, as does scale (the softmax multiplier, e.g.
+// 1/sqrt(head_dim) — any positive value is mathematically valid). The FP8/MX
+// descale/scale factors are STRUCTURED, NOT free: each must equal the actual
+// quantization factor used to produce its tensor's data. A random descale does
+// not break the engine-vs-reference comparison (both read the same shared value)
+// but it lets values drift out of FP8 range and saturate identically on both
+// sides — a vacuous pass that verifies nothing. We therefore refuse to fabricate
+// them, mirroring fillBlockScaleDequantizeInputs. Real FP8 coverage comes from
+// authored bundles that ship the matching scales as data. The remaining inputs
+// are STRUCTURED for their own reasons: seq lengths encode actual sequence
+// boundaries, page tables map to allocated GPU memory chunks, block masks define
+// sparse attention patterns, and dropout seed/offset must match between fwd and
+// bwd. Most of these are optional — absent ones (uid 0) are silently ignored.
+inline SynthesisResult fillSdpaForwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                             SynthesisTracker& tracker,
+                                             std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_SdpaAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not SdpaAttributes");
+    }
+
+    tracker.fillFree(a->q_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+
+    // FP8/MX quantization scale factors must match the data's true scale — see
+    // the header comment. Refuse rather than fabricate a meaningless value.
+    tracker.markStructured(a->descale_q_tensor_uid().value_or(0), "descale_q");
+    tracker.markStructured(a->descale_k_tensor_uid().value_or(0), "descale_k");
+    tracker.markStructured(a->descale_v_tensor_uid().value_or(0), "descale_v");
+    tracker.markStructured(a->descale_s_tensor_uid().value_or(0), "descale_s");
+    tracker.markStructured(a->scale_s_tensor_uid().value_or(0), "scale_s");
+    tracker.markStructured(a->scale_o_tensor_uid().value_or(0), "scale_o");
+
+    tracker.markStructured(a->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
+    tracker.markStructured(a->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
+    tracker.markStructured(a->page_table_k_tensor_uid().value_or(0), "page_table_k");
+    tracker.markStructured(a->page_table_v_tensor_uid().value_or(0), "page_table_v");
+    tracker.markStructured(a->block_mask_tensor_uid().value_or(0), "block_mask");
+    tracker.markStructured(a->seed_tensor_uid().value_or(0), "dropout_seed");
+    tracker.markStructured(a->offset_tensor_uid().value_or(0), "dropout_offset");
+
+    return SynthesisResult::ok();
+}
+
+// Q/K/V/dO accept random values. O (the forward output) and stats (softmax
+// statistics) are DERIVED — they must come from a forward pass to produce
+// correct gradients. In a fused forward+backward graph these are virtual
+// inter-node tensors (not owned, so silently skipped). A standalone backward
+// without a forward is refused.
+inline SynthesisResult
+    fillSdpaBackwardInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                           SynthesisTracker& tracker,
+                           std::mt19937& rng)
+{
+    const auto* a = node.attributes_as_SdpaBackwardAttributes();
+    if(a == nullptr)
+    {
+        return SynthesisResult::unsupported("not SdpaBackwardAttributes");
+    }
+
+    tracker.fillFree(a->q_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->k_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->v_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->do_tensor_uid(), -1.0f, 1.0f, rng);
+    tracker.fillFree(a->scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+    tracker.fillFree(a->dropout_scale_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+    tracker.fillFree(a->dropout_scale_inv_tensor_uid().value_or(0), 0.1f, 1.0f, rng);
+    tracker.fillFree(a->attn_mask_tensor_uid().value_or(0), -1.0f, 1.0f, rng);
+
+    tracker.markDerived(a->o_tensor_uid(), "o (forward output)");
+    tracker.markDerived(a->stats_tensor_uid(), "stats (forward softmax stats)");
+
+    tracker.markStructured(a->seq_len_q_tensor_uid().value_or(0), "seq_len_q");
+    tracker.markStructured(a->seq_len_kv_tensor_uid().value_or(0), "seq_len_kv");
+    tracker.markStructured(a->seed_tensor_uid().value_or(0), "dropout_seed");
+    tracker.markStructured(a->offset_tensor_uid().value_or(0), "dropout_offset");
+
+    return SynthesisResult::ok();
+}
+
+// ── Dispatch ──────────────────────────────────────────────────────────────────
+// Routes a node to its fill function based on the flatbuffer attribute type.
+// The harness calls this once per node in the graph — for a fused graph like
+// conv+bias+relu, each node is dispatched separately with only its own inputs.
+// Returns ok() when all of the node's inputs were filled, or unsupported() with
+// a diagnostic when the op is unrecognized or an input can't be synthesized.
+
+inline SynthesisResult synthesizeNodeInputs(const hipdnn_flatbuffers_sdk::data_objects::Node& node,
+                                            SynthesisTracker& tracker,
+                                            std::mt19937& rng)
+{
+    using NA = hipdnn_flatbuffers_sdk::data_objects::NodeAttributes;
+
+    switch(node.attributes_type())
+    {
+    case NA::ConvolutionFwdAttributes:
+        return fillConvFwdInputs(node, tracker, rng);
+    case NA::ConvolutionBwdAttributes:
+        return fillConvBwdDataInputs(node, tracker, rng);
+    case NA::ConvolutionWrwAttributes:
+        return fillConvBwdWeightsInputs(node, tracker, rng);
+    case NA::BatchnormInferenceAttributes:
+        return fillBatchnormInferenceInputs(node, tracker, rng);
+    case NA::BatchnormInferenceAttributesVarianceExt:
+        return fillBatchnormInferenceVarianceInputs(node, tracker, rng);
+    case NA::BatchnormAttributes:
+        return fillBatchnormTrainingInputs(node, tracker, rng);
+    case NA::BatchnormBackwardAttributes:
+        return fillBatchnormBackwardInputs(node, tracker, rng);
+    case NA::MatmulAttributes:
+        return fillMatmulInputs(node, tracker, rng);
+    case NA::PointwiseAttributes:
+        return fillPointwiseInputs(node, tracker, rng);
+    case NA::ReductionAttributes:
+        return fillReductionInputs(node, tracker, rng);
+    case NA::LayernormAttributes:
+        return fillLayernormInputs(node, tracker, rng);
+    case NA::LayernormBackwardAttributes:
+        return fillLayernormBackwardInputs(node, tracker, rng);
+    case NA::RMSNormAttributes:
+        return fillRmsnormInputs(node, tracker, rng);
+    case NA::RMSNormBackwardAttributes:
+        return fillRmsnormBackwardInputs(node, tracker, rng);
+    case NA::ResampleFwdAttributes:
+        return fillResampleFwdInputs(node, tracker, rng);
+    case NA::BlockScaleDequantizeAttributes:
+        return fillBlockScaleDequantizeInputs(node, tracker, rng);
+    case NA::BlockScaleQuantizeAttributes:
+        return fillBlockScaleQuantizeInputs(node, tracker, rng);
+    case NA::SdpaAttributes:
+        return fillSdpaForwardInputs(node, tracker, rng);
+    case NA::SdpaBackwardAttributes:
+        return fillSdpaBackwardInputs(node, tracker, rng);
+    default:
+        return SynthesisResult::unsupported("no input synthesis registered for this op");
+    }
+}
+
+} // namespace hipdnn_integration_tests
diff --git a/dnn-providers/integration-tests/src/harness/tolerance/ToleranceResolver.hpp b/dnn-providers/integration-tests/src/harness/tolerance/ToleranceResolver.hpp
new file mode 100644
index 000000000000..2d28d835c5f5
--- /dev/null
+++ b/dnn-providers/integration-tests/src/harness/tolerance/ToleranceResolver.hpp
@@ -0,0 +1,250 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include <hipdnn_data_sdk/types.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
+#include <hipdnn_flatbuffers_sdk/flatbuffer_utilities/GraphWrapper.hpp>
+#include <hipdnn_plugin_sdk/PluginLogging.hpp>
+#include <hipdnn_test_sdk/utilities/TestTolerances.hpp>
+
+#include "harness/TomlGuards.hpp"
+
+// Shared default-tolerance resolution for both verification harnesses
+// (ALMIOPEN-2216). Both the programmatic graph harness and the bundle harness
+// reduce to the same question — "given a serialized graph and an output dtype,
+// what default atol/rtol should the comparison use?" — so the policy lives here,
+// keyed on the flatbuffer GraphWrapper, which is the common representation: the
+// bundle harness already holds one, and the graph harness obtains it via
+// Graph::to_binary().
+//
+// This header owns POLICY only; the per-operation / per-dtype tolerance NUMBERS
+// stay in hipdnn_test_sdk TestTolerances.hpp and are read, never modified.
+//
+// TODO(dynamic tolerance): the per-op tolerance source here is the FIXED table
+// (TestTolerances.hpp). The codebase already ships a dynamic, shape/dtype-aware
+// model — hipdnn_test_sdk DynamicTolerances.hpp + per-op DynamicTolerances{Matmul,
+// Conv,BatchNorm,LayerNorm,RMSNorm,Sdpa,Pointwise}.hpp and
+// pointwise/PointwiseErrorClassification.hpp — already wired into other test
+// fixtures (conv gpu-ref, sdpa backward, cpu-executor plan tests). RFC 0011
+// §"Tolerance Framework" / "Future Work #1" defines the upgrade: replace the
+// fixed level-3 default with DynamicTolerances, keyed on graph properties
+// (op, dtype, tensor dims), without changing the three-level chain or this
+// aggregation policy. When promoting, add a sibling aggregation function that
+// routes through the existing DynamicTolerances functions instead of
+// TestTolerances.hpp, and pass it to resolveTolerance; also add
+// sub-bf16 dtypes (FP4) which the current DataType switch lacks (falls through to
+// 1e-3). See ALMIOPEN-2216.
+//
+// Two policy decisions are encoded here, each kept independently evolvable:
+//
+//   * Aggregation = max-across-nodes. The output tolerance is the loosest
+//     per-node tolerance in the graph. This is the conservative envelope: it can
+//     be too loose on a long fused chain but is never too tight, so it never
+//     manufactures a false failure. Root-op-only selection (the graph harness's
+//     prior heuristic) is unsafe — an upstream high-K / low-precision node
+//     dominates the error, so picking the "root" can under-tolerance and fail a
+//     correct kernel. A principled alternative (analytic error propagation along
+//     the producer chain) is the documented future upgrade; it needs per-op
+//     condition-number models and is deferred.
+//
+//   * dtype key = the OUTPUT tensor's dtype (passed in by the caller). Truly
+//     per-node dtype keying — each node keyed on its own output-edge dtype — only
+//     differs from this in mixed-I/O fused graphs, and recovering a node's
+//     output dtype needs a per-op tensor-UID extractor (the flatbuffer Node
+//     carries only compute_data_type, not its I/O tensors). That extractor is
+//     the same machinery the per-output subgraph walk needs, so per-node dtype is
+//     deferred together with multi-output support (ALMIOPEN-2216).
+//
+// resolveTolerance() is the single entry point for both harnesses: it derives
+// the max-across-nodes default and then applies the TOML per-test override (the
+// highest-priority layer) in one place, so neither harness applies the override
+// separately and the layering order lives here alone.
+
+namespace hipdnn_integration_tests::tolerance
+{
+
+namespace fb = hipdnn_flatbuffers_sdk::flatbuffer_utilities;
+namespace data = hipdnn_flatbuffers_sdk::data_objects;
+
+// Per-op tolerance for one node attribute type, at a fixed element type T.
+// Maps a flatbuffer NodeAttributes tag onto the corresponding TestTolerances.hpp
+// entry. Unknown ops fall back to a conservative 1e-3.
+template <typename T>
+inline float toleranceForNodeAttributes(data::NodeAttributes attrType)
+{
+    using NA = data::NodeAttributes;
+    namespace tol = hipdnn_test_sdk::utilities;
+
+    switch(attrType)
+    {
+    case NA::ConvolutionFwdAttributes:
+        return tol::conv::getToleranceFwd<T>();
+    case NA::ConvolutionBwdAttributes:
+        return tol::conv::getToleranceBwd<T>();
+    case NA::ConvolutionWrwAttributes:
+        return tol::conv::getToleranceWrw<T>();
+    case NA::BatchnormInferenceAttributes:
+        return tol::batchnorm::getToleranceInference<T>();
+    case NA::BatchnormInferenceAttributesVarianceExt:
+        return tol::batchnorm::getToleranceInferenceWithVariance<T>();
+    case NA::BatchnormAttributes:
+        return tol::batchnorm::getToleranceTraining<T>();
+    case NA::BatchnormBackwardAttributes:
+        return tol::batchnorm::getToleranceBackward<T>();
+    case NA::MatmulAttributes:
+        return tol::matmul::getTolerance<T>();
+    case NA::ReductionAttributes:
+        return tol::reduction::getTolerance<T>();
+    case NA::RMSNormAttributes:
+        return tol::rmsnorm::getTolerance<T>();
+    case NA::PointwiseAttributes:
+        return tol::pointwise::getTolerance<T>();
+    case NA::LayernormAttributes:
+        return tol::layernorm::getTolerance<T>();
+    case NA::SdpaAttributes:
+    case NA::SdpaBackwardAttributes:
+        return tol::sdpa::getToleranceFwd<T>();
+    default:
+        return 1e-3f;
+    }
+}
+
+// Dispatch the element-type template on a runtime DataType.
+inline float toleranceForNode(data::NodeAttributes attrType, data::DataType dataType)
+{
+    using DT = data::DataType;
+    using hipdnn_data_sdk::types::bfloat16;
+    using hipdnn_data_sdk::types::half;
+
+    switch(dataType)
+    {
+    case DT::FLOAT:
+        return toleranceForNodeAttributes<float>(attrType);
+    case DT::HALF:
+        return toleranceForNodeAttributes<half>(attrType);
+    case DT::BFLOAT16:
+        return toleranceForNodeAttributes<bfloat16>(attrType);
+    default:
+        return 1e-3f;
+    }
+}
+
+// An aggregation policy reduces the per-node tolerances of a graph to one
+// default tolerance for an output. It is just a function (graph, dtype) -> float;
+// new policies are added as new functions, and resolveTolerance() takes the
+// chosen one as a parameter. No enum/switch — the policy IS the function.
+using AggregationPolicy = float (*)(const fb::GraphWrapper&, data::DataType);
+
+// Conservative policy (the default): max-across-nodes — the loosest per-node
+// tolerance in the graph. Never tighter than any single node, so it cannot
+// manufacture a false failure; for a fused output (which genuinely accumulates
+// error from every op on its chain) the loosest contributing op is the correct
+// floor. Returns 1e-3 for a graph with no nodes.
+inline float maxAcrossNodes(const fb::GraphWrapper& wrapper, data::DataType dataType)
+{
+    const auto nodeCount = wrapper.nodeCount();
+
+    bool found = false;
+    float maxTolerance = 0.0f;
+    for(uint32_t i = 0; i < nodeCount; ++i)
+    {
+        const auto attrType = wrapper.getNode(i).attributes_type();
+        const float nodeTolerance = toleranceForNode(attrType, dataType);
+        maxTolerance = found ? std::max(maxTolerance, nodeTolerance) : nodeTolerance;
+        found = true;
+    }
+
+    return found ? maxTolerance : 1e-3f;
+}
+
+// Output-op policy: the tolerance of the last non-Pointwise node in topological
+// order — i.e. the op that produces the graph's output. This reproduces the
+// graph harness's historical getTolerance() behavior so the C++ graph tests keep
+// their exact tolerances as they migrate. It is tighter than maxAcrossNodes only
+// on fused chains whose loosest op is NOT the output op; for the common case
+// (one real op + activation) the two policies are identical, since the activation
+// is Pointwise (skipped) and the single real op is both loosest and last.
+//
+// NOTE: this is a heuristic, not a principled tight bound — it attributes the
+// whole output's tolerance to one op and ignores upstream error accumulation.
+// Kept only for migration parity; max remains the default everywhere else, and
+// the principled tighten path is the future DynamicTolerances upgrade. Falls back
+// to maxAcrossNodes if every node is Pointwise (no clear producing op).
+inline float outputOpTolerance(const fb::GraphWrapper& wrapper, data::DataType dataType)
+{
+    const auto nodeCount = wrapper.nodeCount();
+
+    bool foundRoot = false;
+    data::NodeAttributes rootAttr = data::NodeAttributes::NONE;
+    for(uint32_t i = 0; i < nodeCount; ++i)
+    {
+        const auto attrType = wrapper.getNode(i).attributes_type();
+        if(attrType != data::NodeAttributes::PointwiseAttributes)
+        {
+            rootAttr = attrType; // last non-Pointwise wins (topological order)
+            foundRoot = true;
+        }
+    }
+
+    if(!foundRoot)
+    {
+        return maxAcrossNodes(wrapper, dataType);
+    }
+    return toleranceForNode(rootAttr, dataType);
+}
+
+// Future policies live here as sibling functions, e.g.:
+//   float propagatedBound(wrapper, dtype);    // analytic error propagation
+//   float dynamic(wrapper, dtype);            // wired to DynamicTolerances.hpp
+// Each is added without touching resolveTolerance or any caller — pass it in.
+
+// Warn (once per call site) when a graph has more than one output tensor.
+//
+// Every current aggregation policy reduces over the WHOLE graph, not the subgraph
+// that produces a given output: maxAcrossNodes takes the loosest of all nodes,
+// outputOpTolerance takes the single last non-Pointwise node. For a multi-output
+// graph neither is scoped to the output being toleranced, so a tolerance may be
+// attributed from an unrelated branch. The precise fix (per-output subgraph
+// scoping) is deferred together with per-node dtype keying (ALMIOPEN-2216),
+// because both need a per-op tensor-UID extractor. Until then we surface the
+// imprecision loudly rather than letting it pass silently.
+inline void warnIfMultipleOutputs(std::size_t outputCount, const char* context)
+{
+    if(outputCount > 1)
+    {
+        HIPDNN_PLUGIN_LOG_WARN(context
+                               << ": graph has " << outputCount
+                               << " output tensors; tolerance is reduced over the whole graph, not "
+                                  "the per-output subgraph (deferred, ALMIOPEN-2216)");
+    }
+}
+
+// Resolve the FINAL absolute/relative tolerance for an output tensor of the
+// given dtype: the chosen aggregation policy's default (max-across-nodes unless
+// overridden), then the TOML per-test override (highest priority) applied on top.
+// This is the single tolerance entry point for both harnesses — neither applies
+// the override separately, so the layering order (default -> override) lives in
+// exactly one place. The aggregation policy is a parameter (default
+// maxAcrossNodes) so a caller can select a different policy without any change
+// here.
+inline void resolveTolerance(const fb::GraphWrapper& wrapper,
+                             data::DataType dataType,
+                             const std::string& testName,
+                             float& atol,
+                             float& rtol,
+                             AggregationPolicy aggregate = maxAcrossNodes)
+{
+    const float defaultTolerance = aggregate(wrapper, dataType);
+    atol = defaultTolerance;
+    rtol = defaultTolerance;
+    applyTomlToleranceOverride(testName, atol, rtol);
+}
+
+} // namespace hipdnn_integration_tests::tolerance
diff --git a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormBackward.cpp b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormBackward.cpp
index fca914087f87..22289c0c4a02 100644
--- a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormBackward.cpp
+++ b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormBackward.cpp
@@ -127,29 +127,6 @@ class BatchnormBackward : public IntegrationGraphVerificationHarness<DataType, B
     }
 
 protected:
-    void initializeBundle([[maybe_unused]] const graph::Graph& graph,
-                          GraphTensorBundle& bundle,
-                          unsigned int seed) override
-    {
-        bundle.sentinelFillOutputTensors();
-
-        bundle.tensors.at(BatchnormBwdTensorIds::X_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BatchnormBwdTensorIds::DY_UID)
-            ->fillTensorWithRandomValues(-0.1f, 0.1f, seed);
-        bundle.tensors.at(BatchnormBwdTensorIds::SCALE_UID)
-            ->fillTensorWithRandomValues(-0.1f, 0.1f, seed);
-
-        if(!CalcStats)
-        {
-            bundle.tensors.at(BatchnormBwdTensorIds::MEAN_UID)
-                ->fillTensorWithRandomValues(-0.1f, 0.1f, seed);
-
-            bundle.tensors.at(BatchnormBwdTensorIds::INV_VARIANCE_UID)
-                ->fillTensorWithRandomValues(1.9f, 2.0f, seed);
-        }
-    }
-
     void runGraphTest() override
     {
         const auto& testCase = this->GetParam();
diff --git a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardInferenceWithVariance.cpp b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardInferenceWithVariance.cpp
index 46738378aadf..353948aabf8d 100644
--- a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardInferenceWithVariance.cpp
+++ b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardInferenceWithVariance.cpp
@@ -117,24 +117,6 @@ class BatchnormForwardInferenceWithVariance
     }
 
 protected:
-    void initializeBundle([[maybe_unused]] const graph::Graph& graph,
-                          GraphTensorBundle& bundle,
-                          unsigned int seed) override
-    {
-        bundle.sentinelFillOutputTensors();
-
-        bundle.tensors.at(BnInfVarTensorIds::X_UID)->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarTensorIds::MEAN_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        // Variance must be non-negative; use positive range
-        bundle.tensors.at(BnInfVarTensorIds::VARIANCE_UID)
-            ->fillTensorWithRandomValues(0.1f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarTensorIds::SCALE_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarTensorIds::BIAS_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-    }
-
     void runGraphTest() override
     {
         const auto& testCase = this->GetParam();
diff --git a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardTraining.cpp b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardTraining.cpp
index 979ad80feb5a..f50cc84e4d88 100644
--- a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardTraining.cpp
+++ b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormForwardTraining.cpp
@@ -208,41 +208,6 @@ class BatchnormForwardTraining
     }
 
 protected:
-    void initializeBundle([[maybe_unused]] const graph::Graph& graph,
-                          GraphTensorBundle& bundle,
-                          unsigned int seed) override
-    {
-        bundle.sentinelFillOutputTensors();
-
-        // Note: Epsilon and momentum are pass-by-value (set via set_value()), not buffers
-
-        // X input: default range
-        bundle.tensors.at(BatchnormFwdTrainingTensorIds::X_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-
-        // Scale and bias: -2.0 to 2.0 to match MIOpen
-        bundle.tensors.at(BatchnormFwdTrainingTensorIds::SCALE_UID)
-            ->fillTensorWithRandomValues(-2.0f, 2.0f, seed + 1);
-        bundle.tensors.at(BatchnormFwdTrainingTensorIds::BIAS_UID)
-            ->fillTensorWithRandomValues(-2.0f, 2.0f, seed + 2);
-
-        // Running mean: only initialize PREV (input), leave NEXT (output) with sentinel
-        if(bundle.tensors.find(BatchnormFwdTrainingTensorIds::PREV_RUNNING_MEAN_UID)
-           != bundle.tensors.end())
-        {
-            bundle.tensors.at(BatchnormFwdTrainingTensorIds::PREV_RUNNING_MEAN_UID)
-                ->fillTensorWithRandomValues(-2.0f, 2.0f, seed + 1000);
-        }
-
-        // Running variance: only initialize PREV (input), leave NEXT (output) with sentinel
-        if(bundle.tensors.find(BatchnormFwdTrainingTensorIds::PREV_RUNNING_VARIANCE_UID)
-           != bundle.tensors.end())
-        {
-            bundle.tensors.at(BatchnormFwdTrainingTensorIds::PREV_RUNNING_VARIANCE_UID)
-                ->fillTensorWithRandomValues(-2.0f, 2.0f, seed + 2000);
-        }
-    }
-
     void runGraphTest() override
     {
         const auto& testCase = this->GetParam();
diff --git a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormFwdInferenceVarianceActiv.cpp b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormFwdInferenceVarianceActiv.cpp
index 3273a926df23..d136d07e90ce 100644
--- a/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormFwdInferenceVarianceActiv.cpp
+++ b/dnn-providers/integration-tests/src/integration_tests/batchnorm/IntegrationGpuBatchnormFwdInferenceVarianceActiv.cpp
@@ -149,25 +149,6 @@ class BatchnormFwdInferenceVarianceActiv
     }
 
 protected:
-    void initializeBundle([[maybe_unused]] const graph::Graph& graph,
-                          GraphTensorBundle& bundle,
-                          unsigned int seed) override
-    {
-        bundle.sentinelFillOutputTensors();
-
-        bundle.tensors.at(BnInfVarActivTensorIds::X_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarActivTensorIds::MEAN_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        // Variance must be non-negative; use positive range
-        bundle.tensors.at(BnInfVarActivTensorIds::VARIANCE_UID)
-            ->fillTensorWithRandomValues(0.1f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarActivTensorIds::SCALE_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-        bundle.tensors.at(BnInfVarActivTensorIds::BIAS_UID)
-            ->fillTensorWithRandomValues(-1.0f, 1.0f, seed);
-    }
-
     void runGraphTest() override
     {
         const auto& testCase = this->GetParam();
diff --git a/dnn-providers/integration-tests/src/main.cpp b/dnn-providers/integration-tests/src/main.cpp
index 47645cdbd9eb..a2b787ca18f7 100644
--- a/dnn-providers/integration-tests/src/main.cpp
+++ b/dnn-providers/integration-tests/src/main.cpp
@@ -21,7 +21,8 @@
 #include "harness/SharedHandle.hpp"
 #include "harness/SupportMatrixCollector.hpp"
 #include "harness/TestConfig.hpp"
-#include "harness/golden/BundleRegistration.hpp"
+#include "harness/bundle/BundleRegistration.hpp"
+#include "harness/bundle/UnverifiableBundleReport.hpp"
 
 namespace
 {
@@ -96,10 +97,17 @@ int main(int argc, char** argv) noexcept
             .implicit_value(true)
             .help("Enable golden reference bundle test registration. "
                   "Can also be set via HIPDNN_TEST_ALLOW_BUNDLES=1 env var.");
-        parser.add_argument("--golden-data-dir")
+        parser.add_argument("--gd", "--golden-data-dir")
             .help("Path to the integration test bundle data directory. "
                   "Defaults to <exe>/../lib/integration_test_bundles/. "
                   "Can also be set via HIPDNN_TEST_GOLDEN_DATA_DIR env var.");
+        // --verification-mode governs BUNDLE tests (how the engine's output is
+        // verified). It is independent of --reference-executor, which governs the
+        // parameterized tests (which ref executor is exercised as the SUT).
+        parser.add_argument("--vm", "--verification-mode")
+            .help("How bundle engine output is verified: 'auto' (default; golden -> "
+                  "GPU ref -> CPU ref -> skip), 'golden', 'gpu', or 'cpu'. "
+                  "Can also be set via HIPDNN_TEST_VERIFICATION_MODE env var.");
 
         std::vector<std::string> remainingArgs;
         try
@@ -169,6 +177,22 @@ int main(int argc, char** argv) noexcept
             goldenDataDir = parser.get<std::string>("--golden-data-dir");
         }
 
+        // Parse --verification-mode (case-insensitive); invalid value -> exit 1.
+        std::optional<hipdnn_integration_tests::VerificationMode> verificationMode;
+        if(parser.is_used("--verification-mode"))
+        {
+            try
+            {
+                verificationMode = hipdnn_integration_tests::parseVerificationMode(
+                    parser.get<std::string>("--verification-mode"));
+            }
+            catch(const std::exception& e)
+            {
+                std::cerr << "Error: " << e.what() << '\n';
+                return 1;
+            }
+        }
+
         // Parse --test-article argument and load explicit plugin if provided
         std::optional<std::filesystem::path> articlePath;
         if(parser.is_used("--test-article"))
@@ -211,7 +235,8 @@ int main(int argc, char** argv) noexcept
                                                          std::move(configPath),
                                                          refExecType,
                                                          allowBundles,
-                                                         std::move(goldenDataDir));
+                                                         std::move(goldenDataDir),
+                                                         verificationMode);
 
         // Reconstruct argc/argv for GTest from remaining (unknown) args.
         // argv[0] (program name) must be first — GTest requires it.
@@ -267,10 +292,14 @@ int main(int argc, char** argv) noexcept
             return 1;
         }
 
-        hipdnn_integration_tests::golden::registerBundleTests();
+        hipdnn_integration_tests::bundle::registerBundleTests();
 
         const int result = RUN_ALL_TESTS();
 
+        // Print bundles that ended without a verdict (no oracle / reference bug).
+        // Informational only — these SKIP, so they do not affect `result`.
+        hipdnn_integration_tests::bundle::UnverifiableBundleReport::get().print();
+
         // Generate support matrix if requested
         if(hipdnn_integration_tests::SupportMatrixCollector::get().isEnabled())
         {
diff --git a/dnn-providers/integration-tests/tests/CMakeLists.txt b/dnn-providers/integration-tests/tests/CMakeLists.txt
index eeb4e99d84bc..9bd2c2970847 100644
--- a/dnn-providers/integration-tests/tests/CMakeLists.txt
+++ b/dnn-providers/integration-tests/tests/CMakeLists.txt
@@ -3,6 +3,7 @@
 
 add_executable(hipdnn_integration_tests_unit_tests
     main.cpp
+    ../src/harness/bundle/IntegrationBundleVerificationHarness.cpp
     TestArchMatch.cpp
     TestBundleMetadata.cpp
     TestGraphDescription.cpp
@@ -17,7 +18,10 @@ add_executable(hipdnn_integration_tests_unit_tests
     TestReferenceGraphExecutorFactory.cpp
     TestBundleDiscovery.cpp
     TestVerificationPaths.cpp
-    TestGoldenVerificationHarness.cpp
+    TestBundleVerificationHarness.cpp
+    TestSynthesisTracker.cpp
+    TestSynthesizeInputs.cpp
+    TestVerificationModePaths.cpp
 )
 
 target_include_directories(hipdnn_integration_tests_unit_tests
diff --git a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
index f3b8b8179c8d..513dfd4be142 100644
--- a/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
+++ b/dnn-providers/integration-tests/tests/TestBundleDiscovery.cpp
@@ -12,10 +12,10 @@
 #include <hipdnn_test_sdk/utilities/FileUtilities.hpp>
 #include <hipdnn_test_sdk/utilities/LoadGraphAndTensors.hpp>
 
-#include "harness/golden/BundleDiscovery.hpp"
-#include "harness/golden/IntegrationTestBundle.hpp"
+#include "harness/bundle/BundleDiscovery.hpp"
+#include "harness/bundle/IntegrationTestBundle.hpp"
 
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests::bundle;
 
 // NOLINTBEGIN(readability-identifier-naming)
 
@@ -63,9 +63,10 @@ class TestBundleDiscoveryFixture : public ::testing::Test
                R"("compute_data_type": "float", "intermediate_data_type": "float", "name": ""})";
     }
 
-    // Writes a valid {name}.meta.json companion. Metadata is mandatory for any
-    // bundle expected to load successfully (loadIntegrationTestBundle returns
-    // LoadError::MISSING_METADATA without it).
+    // Writes a valid {name}.meta.json companion. Metadata is mandatory for a
+    // golden bundle (one shipping output .bin blobs) — loadIntegrationTestBundle
+    // returns LoadError::MISSING_METADATA for those without it — and optional for
+    // a no-golden / graph-only bundle.
     static void writeMetadata(const std::filesystem::path& dir, const std::string& name)
     {
         std::ofstream(dir / (name + ".meta.json"))
@@ -143,11 +144,14 @@ TEST_F(TestBundleDiscoveryFixture, TieredGoldenDataLayoutIsDiscovered)
     EXPECT_EQ(result.front().testName, "Small");
 }
 
-TEST_F(TestBundleDiscoveryFixture, JsonAtRootThrows)
+TEST_F(TestBundleDiscoveryFixture, JsonAtRootUsesFolderNameAsSuite)
 {
-    // A .json directly at the data root has no folder to form a suite -> throw.
+    // A .json directly at the data root uses the root folder name as suite.
     std::ofstream(_tempDir / "graph.json") << R"({"tensors": []})";
-    EXPECT_THROW(discoverBundles(_tempDir), std::runtime_error);
+    auto result = discoverBundles(_tempDir);
+    ASSERT_EQ(result.size(), 1u);
+    EXPECT_EQ(result[0].suiteName, sanitizeForGtest(_tempDir.filename().string()));
+    EXPECT_EQ(result[0].testName, "graph");
 }
 
 TEST_F(TestBundleDiscoveryFixture, EmptyLeafFolderWarnsAndSkips)
@@ -339,14 +343,33 @@ TEST_F(TestBundleDiscoveryFixture, LoadBundlePopulatesMetadataWhenPresent)
     EXPECT_EQ(*bundle.metadata.seed, 42);
 }
 
-// A valid-graph bundle WITHOUT a .meta.json companion is a load error: metadata
-// is mandatory.
-TEST_F(TestBundleDiscoveryFixture, LoadBundleMissingMetadataIsError)
+// A graph-only bundle (no .bin blobs, hence no golden data) without a .meta.json
+// companion loads successfully: metadata validates golden data, and there is
+// none here, so absent metadata is valid and default-constructed.
+TEST_F(TestBundleDiscoveryFixture, LoadGraphOnlyBundleMissingMetadataLoads)
 {
     auto dir = _tempDir / "op" / "nometa";
-    createMinimalBundle(dir, "nometa"); // graph only, no .meta.json
+    createMinimalBundle(dir, "nometa"); // graph only, no .meta.json, no .bin
     const auto jsonPath = dir / "nometa.json";
 
+    auto result = loadIntegrationTestBundle(jsonPath);
+    ASSERT_TRUE(std::holds_alternative<IntegrationTestBundle>(result));
+    const auto& bundle = std::get<IntegrationTestBundle>(result);
+
+    EXPECT_FALSE(bundle.tensors.has_value()); // graph-only: no tensor data
+    EXPECT_FALSE(bundle.hasGoldenOutputs);
+    EXPECT_FALSE(bundle.metadata.operation.has_value()); // default-constructed
+}
+
+// A GOLDEN bundle (output .bin blobs present) WITHOUT a .meta.json companion is
+// a load error: metadata is mandatory whenever there is golden data to validate.
+TEST_F(TestBundleDiscoveryFixture, LoadGoldenBundleMissingMetadataIsError)
+{
+    auto dir = _tempDir / "op" / "goldennometa";
+    createLoadableBundle(dir, "goldennometa"); // writes .bin (inputs+outputs) + meta
+    std::filesystem::remove(dir / "goldennometa.meta.json"); // drop the metadata
+    const auto jsonPath = dir / "goldennometa.json";
+
     auto result = loadIntegrationTestBundle(jsonPath);
     ASSERT_TRUE(std::holds_alternative<LoadError>(result));
     EXPECT_EQ(std::get<LoadError>(result), LoadError::MISSING_METADATA);
@@ -359,7 +382,7 @@ TEST_F(TestBundleDiscoveryFixture, LoadBundleMissingBinIsGraphOnly)
 {
     auto dir = _tempDir / "op" / "nobin";
     createMinimalBundle(dir, "nobin");
-    writeMetadata(dir, "nobin"); // metadata is mandatory even for graph-only
+    writeMetadata(dir, "nobin"); // metadata present (optional here, but exercised)
     const auto jsonPath = dir / "nobin.json";
 
     auto result = loadIntegrationTestBundle(jsonPath);
diff --git a/dnn-providers/integration-tests/tests/TestGoldenVerificationHarness.cpp b/dnn-providers/integration-tests/tests/TestBundleVerificationHarness.cpp
similarity index 94%
rename from dnn-providers/integration-tests/tests/TestGoldenVerificationHarness.cpp
rename to dnn-providers/integration-tests/tests/TestBundleVerificationHarness.cpp
index 95dcfb6ea887..cffe75ed9383 100644
--- a/dnn-providers/integration-tests/tests/TestGoldenVerificationHarness.cpp
+++ b/dnn-providers/integration-tests/tests/TestBundleVerificationHarness.cpp
@@ -1,7 +1,7 @@
 // Copyright © Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 
-// Unit tests for IntegrationGraphGoldenReferenceVerificationHarness's core
+// Unit tests for IntegrationBundleVerificationHarness's core
 // contract: how it translates an executor's behaviour into a GTest outcome.
 //
 //   executor throws (unsupported graph) -> SKIP
@@ -27,12 +27,12 @@
 
 #include <hipdnn_test_sdk/utilities/FileUtilities.hpp>
 
-#include "harness/golden/IntegrationGraphGoldenReferenceVerificationHarness.hpp"
-#include "harness/golden/IntegrationTestBundle.hpp"
+#include "harness/bundle/IntegrationBundleVerificationHarness.hpp"
+#include "harness/bundle/IntegrationTestBundle.hpp"
 
 // NOLINTBEGIN(readability-identifier-naming)
 
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests::bundle;
 
 namespace
 {
@@ -40,19 +40,19 @@ namespace
 // Exposes the harness's protected SetUp/TestBody so a test can drive the full
 // lifecycle directly, and overrides executeGraphThroughEngine with a stub so the
 // tests run on CPU-only CI without a real GPU engine.
-class TestableHarness : public IntegrationGraphGoldenReferenceVerificationHarness
+class TestableHarness : public IntegrationBundleVerificationHarness
 {
 public:
     using StubFunc = std::function<void(std::unordered_map<int64_t, void*>&)>;
 
     explicit TestableHarness(StubFunc stub)
-        : IntegrationGraphGoldenReferenceVerificationHarness(/*requiresDevice=*/false)
+        : IntegrationBundleVerificationHarness(/*requiresDevice=*/false)
         , _stub(std::move(stub))
     {
     }
 
-    using IntegrationGraphGoldenReferenceVerificationHarness::SetUp;
-    using IntegrationGraphGoldenReferenceVerificationHarness::TestBody;
+    using IntegrationBundleVerificationHarness::SetUp;
+    using IntegrationBundleVerificationHarness::TestBody;
 
 protected:
     void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack) override
diff --git a/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
new file mode 100644
index 000000000000..ac6e3e930d7c
--- /dev/null
+++ b/dnn-providers/integration-tests/tests/TestSynthesisTracker.cpp
@@ -0,0 +1,175 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+
+#include "harness/input_init/SynthesisTracker.hpp"
+
+// NOLINTBEGIN(readability-identifier-naming)
+
+using namespace hipdnn_integration_tests;
+
+namespace
+{
+
+InputTensorMap makeTensors(const std::vector<int64_t>& uids)
+{
+    InputTensorMap map;
+    for(const int64_t uid : uids)
+    {
+        map[uid] = std::make_unique<hipdnn_data_sdk::utilities::Tensor<float>>(
+            std::vector<int64_t>{2, 3}, std::vector<int64_t>{3, 1});
+        map[uid]->fillTensorWithValue(0.f);
+    }
+    return map;
+}
+
+} // namespace
+
+// All owned inputs declared FREE -> ok().
+TEST(TestSynthesisTracker, AllFreeSucceeds)
+{
+    auto inputs = makeTensors({1, 2, 3});
+    const std::vector<int64_t> owned = {1, 2, 3};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.fillFree(2, -1.f, 1.f, rng);
+    tracker.fillFree(3, -1.f, 1.f, rng);
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+}
+
+// An owned input left undeclared -> unsupported().
+TEST(TestSynthesisTracker, UndeclaredInputFails)
+{
+    auto inputs = makeTensors({1, 2, 3});
+    const std::vector<int64_t> owned = {1, 2, 3};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    // uid 2 and 3 never declared
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("uid=2"), std::string::npos);
+    EXPECT_NE(result.reason.find("uid=3"), std::string::npos);
+}
+
+// A STRUCTURED input -> unsupported() with diagnostic.
+TEST(TestSynthesisTracker, StructuredInputFails)
+{
+    auto inputs = makeTensors({1, 2});
+    const std::vector<int64_t> owned = {1, 2};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.markStructured(2, "page_table");
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("page_table"), std::string::npos);
+    EXPECT_NE(result.reason.find("structured"), std::string::npos);
+}
+
+// A DERIVED input -> unsupported() with diagnostic.
+TEST(TestSynthesisTracker, DerivedInputFails)
+{
+    auto inputs = makeTensors({1, 2});
+    const std::vector<int64_t> owned = {1, 2};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.markDerived(2, "forward_output");
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("forward_output"), std::string::npos);
+    EXPECT_NE(result.reason.find("derived"), std::string::npos);
+}
+
+// uid 0 (absent optional tensor) is silently ignored, not treated as owned.
+TEST(TestSynthesisTracker, ZeroUidIgnored)
+{
+    auto inputs = makeTensors({1});
+    const std::vector<int64_t> owned = {1};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.markStructured(0, "absent_optional");
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+}
+
+// A uid not in the owned set is silently ignored.
+TEST(TestSynthesisTracker, NonOwnedUidIgnored)
+{
+    auto inputs = makeTensors({1, 99});
+    const std::vector<int64_t> owned = {1};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.fillFree(99, -1.f, 1.f, rng); // not owned, ignored
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+}
+
+// Empty owned set -> ok() trivially (no inputs to account for).
+TEST(TestSynthesisTracker, EmptyOwnedSucceeds)
+{
+    InputTensorMap inputs;
+    const std::vector<int64_t> owned;
+
+    const SynthesisTracker tracker(owned, inputs);
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_TRUE(result.filled);
+}
+
+// Mixed: some FREE, one STRUCTURED, one undeclared -> both problems reported.
+TEST(TestSynthesisTracker, MixedFailuresReportAll)
+{
+    auto inputs = makeTensors({1, 2, 3});
+    const std::vector<int64_t> owned = {1, 2, 3};
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(owned, inputs);
+    tracker.fillFree(1, -1.f, 1.f, rng);
+    tracker.markStructured(2, "seed");
+    // uid 3 undeclared
+
+    const auto result = tracker.finish("TestOp");
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("seed"), std::string::npos);
+    EXPECT_NE(result.reason.find("uid=3"), std::string::npos);
+}
+
+// SynthesisResult::ok() and ::unsupported() factory methods.
+TEST(TestSynthesisResult, FactoryMethods)
+{
+    const auto ok = SynthesisResult::ok();
+    EXPECT_TRUE(ok.filled);
+    EXPECT_TRUE(ok.reason.empty());
+
+    const auto bad = SynthesisResult::unsupported("cannot synthesize X");
+    EXPECT_FALSE(bad.filled);
+    EXPECT_EQ(bad.reason, "cannot synthesize X");
+}
+
+// NOLINTEND(readability-identifier-naming)
diff --git a/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp b/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
new file mode 100644
index 000000000000..c9de5c4c1405
--- /dev/null
+++ b/dnn-providers/integration-tests/tests/TestSynthesizeInputs.cpp
@@ -0,0 +1,434 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <set>
+#include <vector>
+
+#include <hipdnn_data_sdk/utilities/Tensor.hpp>
+#include <hipdnn_flatbuffers_sdk/data_objects/graph_generated.h>
+
+#include "harness/input_init/SynthesizeInputs.hpp"
+
+// NOLINTBEGIN(readability-identifier-naming)
+
+using namespace hipdnn_flatbuffers_sdk::data_objects;
+using namespace hipdnn_integration_tests;
+
+namespace
+{
+
+const std::vector<int64_t> kDims = {2, 3};
+const std::vector<int64_t> kStrides = {3, 1};
+
+InputTensorMap makeTensors(const std::vector<int64_t>& uids)
+{
+    InputTensorMap map;
+    for(const int64_t uid : uids)
+    {
+        map[uid] = std::make_unique<hipdnn_data_sdk::utilities::Tensor<float>>(kDims, kStrides);
+        map[uid]->fillTensorWithValue(0.f);
+    }
+    return map;
+}
+
+struct GraphResult
+{
+    flatbuffers::FlatBufferBuilder builder;
+    const Graph* graph = nullptr;
+
+    const Node& node(uint32_t i) const
+    {
+        return *graph->nodes()->Get(i);
+    }
+
+    std::vector<int64_t> leafInputUids(const std::set<int64_t>& outputUids) const
+    {
+        std::vector<int64_t> uids;
+        for(const auto* t : *graph->tensors())
+        {
+            if(!t->virtual_() && outputUids.count(t->uid()) == 0)
+            {
+                uids.push_back(t->uid());
+            }
+        }
+        return uids;
+    }
+};
+
+// ── Conv fwd (single node) ──────────────────────────────────────────────────
+
+GraphResult buildConvFwdGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "y", DataType::FLOAT, &kStrides, &kDims));
+
+    auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 3);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(
+        b, "conv", DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── Conv + bias (2-node fused) ──────────────────────────────────────────────
+// conv.y (uid 10) is virtual; bias (uid 4) is leaf
+
+GraphResult buildConvBiasGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 10, "conv_y", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 4, "bias", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 5, "out", DataType::FLOAT, &kStrides, &kDims));
+
+    auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 10);
+    auto add = CreatePointwiseAttributes(b,
+                                         PointwiseMode::ADD,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         10,
+                                         4,
+                                         flatbuffers::nullopt,
+                                         5);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(
+        b, "conv", DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "bias_add", DataType::FLOAT, NodeAttributes::PointwiseAttributes, add.Union()));
+
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── Conv + bias + relu (3-node fused) ───────────────────────────────────────
+// conv.y (uid 10) virtual, bias_add.out (uid 11) virtual, relu.in_0=uid 11, relu.out_0=uid 6
+
+GraphResult buildConvBiasReluGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "x", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "w", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 10, "conv_y", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 4, "bias", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 11, "bias_out", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 6, "out", DataType::FLOAT, &kStrides, &kDims));
+
+    auto conv = CreateConvolutionFwdAttributesDirect(b, 1, 2, 10);
+    auto add = CreatePointwiseAttributes(b,
+                                         PointwiseMode::ADD,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         flatbuffers::nullopt,
+                                         10,
+                                         4,
+                                         flatbuffers::nullopt,
+                                         11);
+    auto relu = CreatePointwiseAttributes(b,
+                                          PointwiseMode::RELU_FWD,
+                                          flatbuffers::nullopt,
+                                          flatbuffers::nullopt,
+                                          flatbuffers::nullopt,
+                                          flatbuffers::nullopt,
+                                          11,
+                                          flatbuffers::nullopt,
+                                          flatbuffers::nullopt,
+                                          6);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(
+        b, "conv", DataType::FLOAT, NodeAttributes::ConvolutionFwdAttributes, conv.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "bias_add", DataType::FLOAT, NodeAttributes::PointwiseAttributes, add.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "relu", DataType::FLOAT, NodeAttributes::PointwiseAttributes, relu.Union()));
+
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── SDPA forward (no structured optionals) ──────────────────────────────────
+
+GraphResult buildSdpaFwdGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o", DataType::FLOAT, &kStrides, &kDims));
+
+    auto sdpa = CreateSdpaAttributes(b, 1, 2, 3, 4);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, sdpa.Union()));
+
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── SDPA forward with structured seq_len_q ──────────────────────────────────
+
+GraphResult buildSdpaFwdWithStructuredGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 5, "seq_len_q", DataType::FLOAT, &kStrides, &kDims));
+
+    auto sdpa = CreateSdpaAttributes(b,
+                                     1,
+                                     2,
+                                     3,
+                                     4,
+                                     flatbuffers::nullopt, // attn_mask
+                                     flatbuffers::nullopt, // scale
+                                     5); // seq_len_q
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, sdpa.Union()));
+
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── SDPA backward standalone ────────────────────────────────────────────────
+// O and stats are leaf inputs (not virtual) → DERIVED → refuses
+
+GraphResult buildSdpaBwdStandaloneGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 4, "o", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 5, "do", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 6, "stats", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 7, "dq", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 8, "dk", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 9, "dv", DataType::FLOAT, &kStrides, &kDims));
+
+    auto bwd = CreateSdpaBackwardAttributes(b, 1, 2, 3, 4, 5, 6, 7, 8, 9);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_bwd", DataType::FLOAT, NodeAttributes::SdpaBackwardAttributes, bwd.Union()));
+
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+// ── SDPA fwd+bwd fused ─────────────────────────────────────────────────────
+// O (uid 10) and stats (uid 11) are virtual inter-node tensors.
+// Leaf inputs: Q(1), K(2), V(3) from fwd + dO(5) from bwd.
+// Outputs: dQ(7), dK(8), dV(9).
+
+GraphResult buildSdpaFwdBwdFusedGraph()
+{
+    GraphResult r;
+    auto& b = r.builder;
+
+    std::vector<flatbuffers::Offset<TensorAttributes>> tensors;
+    tensors.push_back(CreateTensorAttributesDirect(b, 1, "q", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 2, "k", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 3, "v", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 10, "o", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(
+        CreateTensorAttributesDirect(b, 11, "stats", DataType::FLOAT, &kStrides, &kDims, true));
+    tensors.push_back(CreateTensorAttributesDirect(b, 5, "do", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 7, "dq", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 8, "dk", DataType::FLOAT, &kStrides, &kDims));
+    tensors.push_back(CreateTensorAttributesDirect(b, 9, "dv", DataType::FLOAT, &kStrides, &kDims));
+
+    auto fwd = CreateSdpaAttributes(b,
+                                    1,
+                                    2,
+                                    3,
+                                    10,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    flatbuffers::nullopt,
+                                    11); // stats_tensor_uid
+
+    auto bwd = CreateSdpaBackwardAttributes(b, 1, 2, 3, 10, 5, 11, 7, 8, 9);
+
+    std::vector<flatbuffers::Offset<Node>> nodes;
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_fwd", DataType::FLOAT, NodeAttributes::SdpaAttributes, fwd.Union()));
+    nodes.push_back(CreateNodeDirect(
+        b, "sdpa_bwd", DataType::FLOAT, NodeAttributes::SdpaBackwardAttributes, bwd.Union()));
+
+    auto graph = CreateGraphDirect(
+        b, "test", DataType::FLOAT, DataType::FLOAT, DataType::FLOAT, &tensors, &nodes);
+    b.Finish(graph);
+
+    r.graph = GetGraph(b.GetBufferPointer());
+    return r;
+}
+
+SynthesisResult runSynthesis(const GraphResult& gr, const std::set<int64_t>& outputUids)
+{
+    const auto leafUids = gr.leafInputUids(outputUids);
+    auto inputs = makeTensors(leafUids);
+    std::mt19937 rng(42);
+
+    SynthesisTracker tracker(leafUids, inputs);
+    for(uint32_t i = 0; i < gr.graph->nodes()->size(); ++i)
+    {
+        const SynthesisResult nodeResult
+            = synthesizeNodeInputs(*gr.graph->nodes()->Get(i), tracker, rng);
+        if(!nodeResult.filled)
+        {
+            return nodeResult;
+        }
+    }
+    return tracker.finish("test");
+}
+
+} // namespace
+
+// ── Test cases ──────────────────────────────────────────────────────────────
+
+TEST(TestSynthesizeInputs, SingleConvFwd)
+{
+    const auto gr = buildConvFwdGraph();
+    const auto result = runSynthesis(gr, {3});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+TEST(TestSynthesizeInputs, ConvPlusBiasFused)
+{
+    const auto gr = buildConvBiasGraph();
+    const auto result = runSynthesis(gr, {5});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+TEST(TestSynthesizeInputs, ConvPlusBiasPlusReluFused)
+{
+    const auto gr = buildConvBiasReluGraph();
+    const auto result = runSynthesis(gr, {6});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+TEST(TestSynthesizeInputs, SdpaFwdNoStructuredOptionals)
+{
+    const auto gr = buildSdpaFwdGraph();
+    const auto result = runSynthesis(gr, {4});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+TEST(TestSynthesizeInputs, SdpaFwdWithStructuredInputRefuses)
+{
+    const auto gr = buildSdpaFwdWithStructuredGraph();
+    const auto result = runSynthesis(gr, {4});
+
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("seq_len_q"), std::string::npos);
+    EXPECT_NE(result.reason.find("structured"), std::string::npos);
+}
+
+TEST(TestSynthesizeInputs, SdpaBwdStandaloneRefusesDerived)
+{
+    const auto gr = buildSdpaBwdStandaloneGraph();
+    const auto result = runSynthesis(gr, {7, 8, 9});
+
+    EXPECT_FALSE(result.filled);
+    EXPECT_NE(result.reason.find("derived"), std::string::npos);
+}
+
+TEST(TestSynthesizeInputs, SdpaFwdBwdFusedSucceeds)
+{
+    const auto gr = buildSdpaFwdBwdFusedGraph();
+    const auto result = runSynthesis(gr, {7, 8, 9});
+
+    EXPECT_TRUE(result.filled) << result.reason;
+}
+
+// NOLINTEND(readability-identifier-naming)
diff --git a/dnn-providers/integration-tests/tests/TestTestConfig.cpp b/dnn-providers/integration-tests/tests/TestTestConfig.cpp
index 4f45e39c103a..f6da2da519f8 100644
--- a/dnn-providers/integration-tests/tests/TestTestConfig.cpp
+++ b/dnn-providers/integration-tests/tests/TestTestConfig.cpp
@@ -67,6 +67,72 @@ TEST(TestConfigUninitialized, GetReferenceExecutorTypeThrowsWhenUninitialized)
     EXPECT_THROW(TestConfig::get().getReferenceExecutorType(), std::runtime_error);
 }
 
+// parseVerificationMode is a free function (no singleton state), so it can be
+// exercised regardless of initialization.
+TEST(TestParseVerificationMode, AcceptsAllValidValuesCaseInsensitive)
+{
+    using hipdnn_integration_tests::parseVerificationMode;
+    using hipdnn_integration_tests::VerificationMode;
+
+    EXPECT_EQ(parseVerificationMode("auto"), VerificationMode::AUTO);
+    EXPECT_EQ(parseVerificationMode("golden"), VerificationMode::GOLDEN);
+    EXPECT_EQ(parseVerificationMode("gpu"), VerificationMode::GPU);
+    EXPECT_EQ(parseVerificationMode("cpu"), VerificationMode::CPU);
+
+    EXPECT_EQ(parseVerificationMode("AUTO"), VerificationMode::AUTO);
+    EXPECT_EQ(parseVerificationMode("Golden"), VerificationMode::GOLDEN);
+    EXPECT_EQ(parseVerificationMode("GPU"), VerificationMode::GPU);
+}
+
+TEST(TestParseVerificationMode, ThrowsOnInvalidValue)
+{
+    EXPECT_THROW(hipdnn_integration_tests::parseVerificationMode("bogus"), std::runtime_error);
+    EXPECT_THROW(hipdnn_integration_tests::parseVerificationMode(""), std::runtime_error);
+}
+
+// resolveVerificationMode / resolveGoldenDataDir are free functions that
+// implement the "CLI wins, then env, then nullopt" precedence chain.
+// They don't touch the singleton so they can be tested freely.
+
+TEST(TestResolveVerificationMode, CliValueWinsOverEnv)
+{
+    using hipdnn_integration_tests::resolveVerificationMode;
+    using hipdnn_integration_tests::VerificationMode;
+
+    // Even if the env var were set, the CLI value takes precedence.
+    const auto result = resolveVerificationMode(VerificationMode::GPU);
+    ASSERT_TRUE(result.has_value());
+    EXPECT_EQ(*result, VerificationMode::GPU);
+}
+
+TEST(TestResolveVerificationMode, NulloptCliWithoutEnvReturnsNullopt)
+{
+    using hipdnn_integration_tests::resolveVerificationMode;
+
+    // Assuming HIPDNN_TEST_VERIFICATION_MODE is not set in the test env.
+    const auto result = resolveVerificationMode(std::nullopt);
+    EXPECT_FALSE(result.has_value());
+}
+
+TEST(TestResolveGoldenDataDir, CliValueWinsOverEnv)
+{
+    using hipdnn_integration_tests::resolveGoldenDataDir;
+
+    const std::filesystem::path cliPath = "/explicit/golden/dir";
+    const auto result = resolveGoldenDataDir(cliPath);
+    ASSERT_TRUE(result.has_value());
+    EXPECT_EQ(*result, cliPath);
+}
+
+TEST(TestResolveGoldenDataDir, NulloptCliWithoutEnvReturnsNullopt)
+{
+    using hipdnn_integration_tests::resolveGoldenDataDir;
+
+    // Assuming HIPDNN_TEST_GOLDEN_DATA_DIR is not set in the test env.
+    const auto result = resolveGoldenDataDir(std::nullopt);
+    EXPECT_FALSE(result.has_value());
+}
+
 // ---------------------------------------------------------------------------
 // Suite 2 – initialized singleton (all args provided)
 // ---------------------------------------------------------------------------
@@ -129,6 +195,23 @@ TEST_F(TestConfigInitialized, GetReferenceExecutorTypeDefaultsToCpu)
               hipdnn_integration_tests::ReferenceExecutorType::CPU);
 }
 
+TEST_F(TestConfigInitialized, GetVerificationModeDefaultsToAuto)
+{
+    // No CLI flag and (assuming) no env var -> AUTO.
+    EXPECT_EQ(TestConfig::get().getVerificationMode(),
+              hipdnn_integration_tests::VerificationMode::AUTO);
+}
+
+TEST_F(TestConfigInitialized, HasGoldenDataDirReturnsFalseWhenNotProvided)
+{
+    EXPECT_FALSE(TestConfig::get().hasGoldenDataDir());
+}
+
+TEST_F(TestConfigInitialized, GetGoldenDataDirThrowsWhenNotProvided)
+{
+    EXPECT_THROW(TestConfig::get().getGoldenDataDir(), std::runtime_error);
+}
+
 TEST_F(TestConfigInitialized, DoubleInitializeThrows)
 {
     EXPECT_THROW(TestConfig::initialize(std::nullopt, std::nullopt), std::runtime_error);
diff --git a/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp b/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
new file mode 100644
index 000000000000..be39945b972d
--- /dev/null
+++ b/dnn-providers/integration-tests/tests/TestVerificationModePaths.cpp
@@ -0,0 +1,411 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+// Tests the verification mode dispatch logic in the harness:
+//
+//   AUTO mode:    golden → GPU ref → CPU ref → SKIP
+//   GOLDEN mode:  golden or SKIP
+//   GPU/CPU mode: explicit ref or SKIP/FAIL
+//
+// Each test overrides getVerificationMode() and the executor stubs to exercise
+// one branch without touching the TestConfig singleton.
+
+#include <gtest/gtest-spi.h>
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <vector>
+
+#include <hipdnn_test_sdk/utilities/FileUtilities.hpp>
+
+#include "harness/ReferenceCapabilityError.hpp"
+#include "harness/TestConfig.hpp"
+#include "harness/bundle/IntegrationBundleVerificationHarness.hpp"
+#include "harness/bundle/IntegrationTestBundle.hpp"
+
+// NOLINTBEGIN(readability-identifier-naming)
+
+using namespace hipdnn_integration_tests;
+using namespace hipdnn_integration_tests::bundle;
+
+namespace
+{
+
+using EngineStub = std::function<void(std::unordered_map<int64_t, void*>&)>;
+using RefStub = std::function<void(ReferenceExecutorType, std::unordered_map<int64_t, void*>&)>;
+
+class ModeTestableHarness : public IntegrationBundleVerificationHarness
+{
+public:
+    ModeTestableHarness(VerificationMode mode, EngineStub engineStub, RefStub refStub)
+        : IntegrationBundleVerificationHarness(/*requiresDevice=*/false)
+        , _mode(mode)
+        , _engineStub(std::move(engineStub))
+        , _refStub(std::move(refStub))
+    {
+    }
+
+    using IntegrationBundleVerificationHarness::SetUp;
+    using IntegrationBundleVerificationHarness::TestBody;
+
+protected:
+    VerificationMode getVerificationMode() const override
+    {
+        return _mode;
+    }
+
+    void executeGraphThroughEngine(std::unordered_map<int64_t, void*>& variantPack) override
+    {
+        _engineStub(variantPack);
+    }
+
+    void runReferenceExecutor(ReferenceExecutorType type,
+                              std::unordered_map<int64_t, void*>& variantPack) override
+    {
+        _refStub(type, variantPack);
+    }
+
+    std::unique_ptr<IReferenceGraphExecutor>
+        makeReferenceExecutor(ReferenceExecutorType /*type*/) override
+    {
+        return nullptr;
+    }
+
+    // These tests exercise verification-mode dispatch, not the VRAM/arch
+    // hardware guards. Override to a no-op so they don't reach into the
+    // (uninitialized-in-this-binary) TestConfig singleton.
+    void applyMetadataGuards() const override {}
+
+private:
+    VerificationMode _mode;
+    EngineStub _engineStub;
+    RefStub _refStub;
+};
+
+class TestVerificationModePathsFixture : public ::testing::Test
+{
+protected:
+    std::optional<hipdnn_test_sdk::utilities::ScopedDirectory> _scopedDir;
+    std::filesystem::path _tempDir;
+
+    void SetUp() override
+    {
+        auto path
+            = std::filesystem::temp_directory_path()
+              / ("vmode_test_"
+                 + std::to_string(::testing::UnitTest::GetInstance()->current_test_info()->line()));
+        std::filesystem::remove_all(path);
+        _scopedDir.emplace(path);
+        _tempDir = _scopedDir->path();
+    }
+
+    static constexpr float K_OUTPUT_VALUE = 3.5f;
+    static constexpr int64_t K_OUTPUT_UID = 5;
+    static constexpr size_t K_OUTPUT_ELEMS = 120;
+
+    static void writeBundleFiles(const std::filesystem::path& dir,
+                                 const std::string& name,
+                                 bool includeGoldenOutput)
+    {
+        std::filesystem::create_directories(dir);
+        std::ofstream(dir / (name + ".json"))
+            << R"({"nodes": [{"inputs": {"x_tensor_uid": 0, "mean_tensor_uid": 1, )"
+               R"("inv_variance_tensor_uid": 2, "scale_tensor_uid": 3, "bias_tensor_uid": 4}, )"
+               R"("outputs": {"y_tensor_uid": 5}, "type": "BatchnormInferenceAttributes", )"
+               R"("compute_data_type": "float", "name": ""}], "tensors": [)"
+               R"({"name": "", "uid": 0, "strides": [60, 20, 5, 1], "dims": [2, 3, 4, 5], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 1, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 2, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 3, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 4, "strides": [3, 1, 1, 1], "dims": [1, 3, 1, 1], )"
+               R"("data_type": "float", "virtual": false}, )"
+               R"({"name": "", "uid": 5, "strides": [60, 20, 5, 1], "dims": [2, 3, 4, 5], )"
+               R"("data_type": "float", "virtual": false}], "io_data_type": "float", )"
+               R"("compute_data_type": "float", "intermediate_data_type": "float", "name": ""})";
+
+        std::ofstream(dir / (name + ".meta.json"))
+            << R"({"format_version": 1, "operation": "BatchnormInference"})";
+
+        const auto basePath = (dir / name).string();
+        const auto writeFloatBin = [&](int64_t uid, size_t elems, float value) {
+            const std::vector<float> data(elems, value);
+            std::ofstream out(basePath + ".tensor" + std::to_string(uid) + ".bin",
+                              std::ios::binary);
+            out.write(reinterpret_cast<const char*>(data.data()),
+                      static_cast<std::streamsize>(data.size() * sizeof(float)));
+        };
+
+        writeFloatBin(0, 120, 0.0f);
+        writeFloatBin(1, 3, 0.0f);
+        writeFloatBin(2, 3, 0.0f);
+        writeFloatBin(3, 3, 0.0f);
+        writeFloatBin(4, 3, 0.0f);
+
+        if(includeGoldenOutput)
+        {
+            writeFloatBin(K_OUTPUT_UID, K_OUTPUT_ELEMS, K_OUTPUT_VALUE);
+        }
+    }
+
+    std::shared_ptr<IntegrationTestBundle> loadBundle(const std::string& name,
+                                                      bool includeGoldenOutput) const
+    {
+        const auto dir = _tempDir / name;
+        writeBundleFiles(dir, name, includeGoldenOutput);
+        auto result = loadIntegrationTestBundle(dir / (name + ".json"));
+        EXPECT_TRUE(std::holds_alternative<IntegrationTestBundle>(result));
+        return std::make_shared<IntegrationTestBundle>(
+            std::move(std::get<IntegrationTestBundle>(result)));
+    }
+
+    static void writeOutput(std::unordered_map<int64_t, void*>& variantPack, float value)
+    {
+        auto* ptr = static_cast<float*>(variantPack.at(K_OUTPUT_UID));
+        std::fill(ptr, ptr + K_OUTPUT_ELEMS, value);
+    }
+
+    static void runCapturing(std::shared_ptr<IntegrationTestBundle> bundle,
+                             VerificationMode mode,
+                             EngineStub engineStub,
+                             RefStub refStub,
+                             ::testing::TestPartResultArray* results)
+    {
+        ModeTestableHarness harness(mode, std::move(engineStub), std::move(refStub));
+        harness.setBundle(std::move(bundle), "vmode-test-bundle");
+
+        const ::testing::ScopedFakeTestPartResultReporter reporter(
+            ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, results);
+        harness.SetUp();
+        harness.TestBody();
+    }
+
+    static bool anySkipped(const ::testing::TestPartResultArray& results)
+    {
+        for(int i = 0; i < results.size(); ++i)
+        {
+            if(results.GetTestPartResult(i).skipped())
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static bool anyFailed(const ::testing::TestPartResultArray& results)
+    {
+        for(int i = 0; i < results.size(); ++i)
+        {
+            if(results.GetTestPartResult(i).failed())
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static EngineStub matchingEngine()
+    {
+        return [](std::unordered_map<int64_t, void*>& vp) { writeOutput(vp, K_OUTPUT_VALUE); };
+    }
+
+    static EngineStub mismatchingEngine()
+    {
+        return [](std::unordered_map<int64_t, void*>& vp) {
+            writeOutput(vp, K_OUTPUT_VALUE + 100.0f);
+        };
+    }
+
+    static RefStub matchingRef()
+    {
+        return [](ReferenceExecutorType, std::unordered_map<int64_t, void*>& vp) {
+            writeOutput(vp, K_OUTPUT_VALUE);
+        };
+    }
+
+    static RefStub capabilityMissRef()
+    {
+        return [](ReferenceExecutorType, std::unordered_map<int64_t, void*>&) {
+            throw ReferenceCapabilityError("stub: unsupported op");
+        };
+    }
+
+    static RefStub gpuMissCpuMatchRef()
+    {
+        return [](ReferenceExecutorType type, std::unordered_map<int64_t, void*>& vp) {
+            if(type == ReferenceExecutorType::GPU)
+            {
+                throw ReferenceCapabilityError("stub: no GPU ref plan");
+            }
+            writeOutput(vp, K_OUTPUT_VALUE);
+        };
+    }
+};
+
+} // namespace
+
+// ── AUTO mode ───────────────────────────────────────────────────────────────
+
+TEST_F(TestVerificationModePathsFixture, AutoWithGoldenUsesGoldenAndPasses)
+{
+    ::testing::TestPartResultArray results;
+    bool refCalled = false;
+    runCapturing(
+        loadBundle("auto_golden", /*includeGoldenOutput=*/true),
+        VerificationMode::AUTO,
+        matchingEngine(),
+        [&](ReferenceExecutorType, std::unordered_map<int64_t, void*>&) { refCalled = true; },
+        &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+    EXPECT_FALSE(refCalled) << "Reference executor should NOT run when golden data is present";
+}
+
+TEST_F(TestVerificationModePathsFixture, AutoWithGoldenMismatchFails)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("auto_golden_mm", /*includeGoldenOutput=*/true),
+                 VerificationMode::AUTO,
+                 mismatchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_TRUE(anyFailed(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, AutoNoGoldenRefSucceedsPasses)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("auto_gpu", /*includeGoldenOutput=*/false),
+                 VerificationMode::AUTO,
+                 matchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, AutoNoGoldenRefMissFallsThroughToCpu)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("auto_fallthrough", /*includeGoldenOutput=*/false),
+                 VerificationMode::AUTO,
+                 matchingEngine(),
+                 gpuMissCpuMatchRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, AutoNoGoldenBothRefsMissSkips)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("auto_both_miss", /*includeGoldenOutput=*/false),
+                 VerificationMode::AUTO,
+                 matchingEngine(),
+                 capabilityMissRef(),
+                 &results);
+
+    EXPECT_TRUE(anySkipped(results));
+    EXPECT_FALSE(anyFailed(results));
+}
+
+// ── GOLDEN mode ─────────────────────────────────────────────────────────────
+
+TEST_F(TestVerificationModePathsFixture, GoldenModeWithDataPasses)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("golden_ok", /*includeGoldenOutput=*/true),
+                 VerificationMode::GOLDEN,
+                 matchingEngine(),
+                 capabilityMissRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, GoldenModeWithoutDataSkips)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("golden_absent", /*includeGoldenOutput=*/false),
+                 VerificationMode::GOLDEN,
+                 matchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_TRUE(anySkipped(results));
+    EXPECT_FALSE(anyFailed(results));
+}
+
+// ── Explicit GPU mode ───────────────────────────────────────────────────────
+// "Device" in these case names denotes VerificationMode::GPU (the device-side
+// reference executor). The literal "Gpu" keyword is reserved by the test-name
+// linter for the suite name and so cannot appear in the case name.
+
+TEST_F(TestVerificationModePathsFixture, DeviceModeRefSucceedsPasses)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("gpu_ok", /*includeGoldenOutput=*/true),
+                 VerificationMode::GPU,
+                 matchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, DeviceModeCapabilityMissSkips)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("gpu_miss", /*includeGoldenOutput=*/true),
+                 VerificationMode::GPU,
+                 matchingEngine(),
+                 capabilityMissRef(),
+                 &results);
+
+    EXPECT_TRUE(anySkipped(results));
+    EXPECT_FALSE(anyFailed(results));
+}
+
+// ── Explicit CPU mode ───────────────────────────────────────────────────────
+
+TEST_F(TestVerificationModePathsFixture, CpuModeRefSucceedsPasses)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("cpu_ok", /*includeGoldenOutput=*/true),
+                 VerificationMode::CPU,
+                 matchingEngine(),
+                 matchingRef(),
+                 &results);
+
+    EXPECT_FALSE(anyFailed(results));
+    EXPECT_FALSE(anySkipped(results));
+}
+
+TEST_F(TestVerificationModePathsFixture, CpuModeCapabilityMissSkips)
+{
+    ::testing::TestPartResultArray results;
+    runCapturing(loadBundle("cpu_miss", /*includeGoldenOutput=*/true),
+                 VerificationMode::CPU,
+                 matchingEngine(),
+                 capabilityMissRef(),
+                 &results);
+
+    EXPECT_TRUE(anySkipped(results));
+    EXPECT_FALSE(anyFailed(results));
+}
+
+// NOLINTEND(readability-identifier-naming)
diff --git a/dnn-providers/integration-tests/tests/TestVerificationPaths.cpp b/dnn-providers/integration-tests/tests/TestVerificationPaths.cpp
index 36a54fcbb5ec..b2d7ecb5473d 100644
--- a/dnn-providers/integration-tests/tests/TestVerificationPaths.cpp
+++ b/dnn-providers/integration-tests/tests/TestVerificationPaths.cpp
@@ -12,12 +12,12 @@
 #include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
 #include <hipdnn_test_sdk/utilities/cpu_graph_executor/CpuReferenceGraphExecutor.hpp>
 
-#include "harness/golden/BundleDiscovery.hpp"
+#include "harness/bundle/BundleDiscovery.hpp"
 #include "harness/gpu_graph_executor/GpuReferenceGraphExecutor.hpp"
 
 // NOLINTBEGIN(readability-identifier-naming)
 
-using namespace hipdnn_integration_tests::golden;
+using namespace hipdnn_integration_tests::bundle;
 
 namespace
 {