Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
906fe49
Add golden test infrastructure: bundle discovery, registration, and l…
bghimireamd Jun 22, 2026
2ec7676
Add --verification-mode flag with harness refactor and A/B/C outcome …
bghimireamd Jun 22, 2026
392e628
Add tier-3 input synthesis with deny-by-default RoleAccounting
bghimireamd Jun 22, 2026
6552b41
Rename RoleAccounting → SynthesisTracker, expand synthesis to all 19 ops
bghimireamd Jun 22, 2026
967e995
Split harness into .hpp + .cpp, add SynthesisTracker unit tests
bghimireamd Jun 22, 2026
ae7b665
Add verification mode dispatch tests with virtual getVerificationMode()
bghimireamd Jun 22, 2026
dec13b1
Fix clang-tidy misc-const-correctness in TestSynthesisTracker
bghimireamd Jun 22, 2026
4438fe0
Address review: fix misleading skip msg, add env fallback tests, fix …
bghimireamd Jun 22, 2026
133700f
Lift SynthesisTracker to graph level for fused/multi-node synthesis
bghimireamd Jun 22, 2026
6d3a7a9
Harden synthesis pipeline: sentinel outputs, FP8 STRUCTURED, metadata…
bghimireamd Jun 23, 2026
d05a68a
Fix uid=0 rejection in SynthesisTracker and no-GPU std::bad_alloc in …
bghimireamd Jun 23, 2026
b067310
Apply clang-format to CI-flagged files
bghimireamd Jun 23, 2026
d2860b4
Merge branch 'develop' of github.com:ROCm/rocm-libraries into users/b…
bghimireamd Jun 23, 2026
476d4a0
Allow --gd to point directly at a bundle folder
bghimireamd Jun 23, 2026
da94630
Wire golden runner into TOML tolerance override + test_skips chain
bghimireamd Jun 23, 2026
75bf232
Remove fillComputed/tensorAt from SynthesisTracker (deferred to input…
bghimireamd Jun 23, 2026
50a8bd7
Replace golden harness virtual tolerance/skip seam with direct TomlGu…
bghimireamd Jun 23, 2026
fcdc9ef
Unify graph/bundle harness input-init and rename golden->bundle
bghimireamd Jun 24, 2026
d16d91f
Unify tolerance resolution into a shared policy-based resolver
bghimireamd Jun 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dnn-providers/integration-tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ set(INTEGRATION_TESTS_EXE hipdnn_integration_tests)

add_executable(${INTEGRATION_TESTS_EXE}
src/main.cpp
src/harness/bundle/IntegrationBundleVerificationHarness.cpp
)

add_subdirectory(src/integration_tests)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@

#pragma once

#include <stdexcept>
#include <string>

#include <hipdnn_test_sdk/utilities/cpu_graph_executor/CpuReferenceGraphExecutor.hpp>

#include "IReferenceGraphExecutor.hpp"
#include "ReferenceCapabilityError.hpp"

namespace hipdnn_integration_tests
{
Expand All @@ -17,7 +21,25 @@ class CpuReferenceGraphExecutorAdapter : public IReferenceGraphExecutor
size_t size,
const std::unordered_map<int64_t, void*>& variantPack) override
{
_executor.execute(graphBuffer, size, variantPack);
// The shared test_sdk CPU executor throws a plain std::runtime_error for
// BOTH "no plan for this op" (capability miss, case A) and a genuine
// runtime failure (case C) — it does not distinguish them by type. We
// cannot tell them apart here, so we conservatively translate every throw
// into a ReferenceCapabilityError (case A), carrying the original message
// so a real failure still surfaces in the unverifiable report. Net effect:
// a CPU-ref crash routes as "couldn't run" rather than a hard FAIL. The
// GPU executor (our code) keeps full A-vs-C fidelity by throwing the right
// type at the source.
try
{
_executor.execute(graphBuffer, size, variantPack);
}
catch(const std::exception& e)
{
throw ReferenceCapabilityError(std::string("CPU reference executor could not run "
"this graph: ")
+ e.what());
}
}

bool requiresDeviceMemory() const override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <hipdnn_test_sdk/utilities/TestUtilities.hpp>
#include <hipdnn_test_sdk/utilities/cpu_graph_executor/CpuReferenceGraphExecutor.hpp>

#include "harness/TomlGuards.hpp"

namespace hipdnn_integration_tests
{

Expand Down Expand Up @@ -47,12 +49,16 @@ class TestGoldenReferenceCpu : public ::testing::TestWithParam<std::filesystem::

_graphAndTensors = hipdnn_test_sdk::utilities::loadGraphAndTensors(path);
_referenceOutputTensors = _graphAndTensors.extractAndClearOutputTensorData();

skipIfTomlMatched(currentTestName());
}

void goldenReferenceTestSuite(float absoluteTolerance, float relativeTolerance)
{
SKIP_IF_WINDOWS();

applyTomlToleranceOverride(currentTestName(), absoluteTolerance, relativeTolerance);

auto tensorMap = _graphAndTensors.hostBufferMap();

hipdnn_test_sdk::utilities::CpuReferenceGraphExecutor().execute(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
#include <hipdnn_frontend/Graph.hpp>
#include <hipdnn_frontend/Utilities.hpp>
#include <hipdnn_frontend/attributes/TensorAttributes.hpp>
#include <hipdnn_frontend/node/RMSNormNode.hpp>
#include <hipdnn_frontend/node/ReductionNode.hpp>
#include <hipdnn_plugin_sdk/PluginLogging.hpp>
#include <hipdnn_test_sdk/utilities/CpuFpReferenceMiopenRmsValidation.hpp>
#include <hipdnn_test_sdk/utilities/CpuFpReferenceValidation.hpp>
Expand All @@ -29,6 +27,9 @@
#include "harness/SharedHandle.hpp"
#include "harness/SupportMatrixCollector.hpp"
#include "harness/TestConfig.hpp"
#include "harness/TomlGuards.hpp"
#include "harness/input_init/SynthesizeInputs.hpp"
#include "harness/tolerance/ToleranceResolver.hpp"

namespace hipdnn_integration_tests
{
Expand All @@ -41,7 +42,6 @@ template <typename DataType, typename TestCaseType>
class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<TestCaseType>
{
protected:
int _deviceId = 0;
std::string _testCaseNote;
std::string _testCaseLayout;
std::unordered_map<int64_t, std::string> _tensorIdToNameMap;
Expand All @@ -53,20 +53,10 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
{
SKIP_IF_NO_DEVICES();

// Initialize HIP
ASSERT_EQ(hipInit(0), hipSuccess);
ASSERT_EQ(hipGetDevice(&_deviceId), hipSuccess);

// Check for any engine specific test skips
if(auto* info = ::testing::UnitTest::GetInstance()->current_test_info(); info != nullptr)
{
const std::string testName = std::string(info->test_suite_name()) + "." + info->name();
if(auto skipReason = TestConfig::get().findSkipForTest(testName))
{
GTEST_SKIP() << "[arch " << TestConfig::get().getCurrentArch() << "] "
<< *skipReason;
}
}
// HIP initializes lazily on first runtime use; the shared hipdnn handle
// (getSharedHandle -> hipdnnCreate) does this before any graph executes,
// so no explicit hipInit is needed here.
skipIfTomlMatched(currentTestName());
}

void setTestCaseNote(std::string note)
Expand All @@ -81,41 +71,64 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test

virtual void runGraphTest() = 0;

// Determine tolerance for an output tensor based on the graph and
// configured tolerance mode for the engine.
// Determine the FINAL tolerance for an output tensor: an aggregation-policy
// default plus the TOML per-test override, both via
// harness/tolerance/ToleranceResolver.hpp. The resolver is keyed on the
// serialized flatbuffer graph: we serialize with to_binary() — the same
// pattern initializeBundle() already uses — and read the output tensor's dtype
// from the flatbuffer.
//
// Policy = outputOpTolerance (the last non-Pointwise op), which reproduces
// this harness's historical getTolerance() behavior so the C++ graph tests
// keep their exact tolerances. (The bundle harness uses the maxAcrossNodes
// default; the two agree for the common one-real-op + activation case.) The
// returned value is already overridden, so registerValidator stores it as-is.
float getTolerance(const hipdnn_frontend::graph::Graph& graph,
const std::shared_ptr<hipdnn_frontend::graph::TensorAttributes>& output)
{
ToleranceMode mode = TestConfig::get().getToleranceMode();
if(mode != ToleranceMode::DEFAULT)
{
ADD_FAILURE() << "getTolerance: unhandled tolerance mode";
return 0.0f;
}

if(mode == ToleranceMode::DEFAULT)
auto [serialized, serErr] = graph.to_binary();
if(serErr.code != hipdnn_frontend::ErrorCode::OK || serialized.empty())
{
// We determine the tolerance based on the last non-PointwiseNode
// (the root op). This will be gradually updated to use dynamic
// calculation as possible; eventually, the tolerance will be
// entirely dynamically determined in the default case.
//
// NOTE: after validate(), the graph's sub-nodes are in topological order.
const hipdnn_frontend::graph::INode* rootOp = nullptr;
graph.visit([&](const hipdnn_frontend::graph::INode& node) {
if(dynamic_cast<const hipdnn_frontend::graph::PointwiseNode*>(&node) == nullptr
&& dynamic_cast<const hipdnn_frontend::graph::Graph*>(&node) == nullptr)
{
rootOp = &node;
}
});
ADD_FAILURE() << "getTolerance: graph serialization failed";
return 0.0f;
}

if(rootOp == nullptr)
{
ADD_FAILURE() << "getTolerance: no root op found in graph";
return 0.0f;
}
const auto wrapper
= hipdnn_flatbuffers_sdk::flatbuffer_utilities::GraphWrapper::fromSerializedBlob(
serialized.data(), serialized.size());
if(!wrapper.isValid())
{
ADD_FAILURE() << "getTolerance: serialized graph failed verification";
return 0.0f;
}

return toleranceForNode(*rootOp, output->get_data_type());
const auto& tensorMap = wrapper.getTensorMap();
const auto it = tensorMap.find(output->get_uid());
if(it == tensorMap.end())
{
ADD_FAILURE() << "getTolerance: output tensor uid " << output->get_uid()
<< " not found in serialized graph";
return 0.0f;
}

ADD_FAILURE() << "getTolerance: unhandled tolerance mode";
return 0.0f;
float atol = 0.0f;
float rtol = 0.0f;
tolerance::resolveTolerance(wrapper,
it->second->data_type(),
currentTestName(),
atol,
rtol,
tolerance::outputOpTolerance);
// getTolerance's single-float contract predates split atol/rtol; under the
// current resolver the two are equal (same default, same override).
return atol;
}

void verifyGraph(hipdnn_frontend::graph::Graph& graph, unsigned int seed)
Expand Down Expand Up @@ -201,6 +214,9 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
<< "At least one output tensor id must be specified for "
"validation.";

tolerance::warnIfMultipleOutputs(gpuBundle.outputTensorIds.size(),
"IntegrationGraphVerificationHarness");

HIPDNN_PLUGIN_LOG_INFO("Validating " << gpuBundle.outputTensorIds.size()
<< " output tensors");

Expand Down Expand Up @@ -253,25 +269,10 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
float absoluteTolerance,
float relativeTolerance)
{
// Check for per-test tolerance override from TOML config
float finalAtol = absoluteTolerance;
float finalRtol = relativeTolerance;

auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
if(testInfo != nullptr)
{
std::string testName
= std::string(testInfo->test_suite_name()) + "." + testInfo->name();
auto override = TestConfig::get().findToleranceOverride(testName);
if(override.has_value())
{
finalAtol = override->atol;
finalRtol = override->rtol;
HIPDNN_PLUGIN_LOG_INFO("Tolerance override applied for " << testName
<< ": atol=" << finalAtol
<< " rtol=" << finalRtol);
}
}
// Tolerances arrive already resolved (default + TOML override) from
// getTolerance via ToleranceResolver; no override is applied here.
const float finalAtol = absoluteTolerance;
const float finalRtol = relativeTolerance;

// Since the graph can infer properties + Ids, we defer validator registration until right
// before validation in verifyGraph
Expand Down Expand Up @@ -332,67 +333,82 @@ class IntegrationGraphVerificationHarness : public ::testing::TestWithParam<Test
});
}

virtual void initializeBundle([[maybe_unused]] const hipdnn_frontend::graph::Graph& graph,
virtual void initializeBundle(const hipdnn_frontend::graph::Graph& graph,
hipdnn_test_sdk::utilities::GraphTensorBundle& bundle,
unsigned int seed)
{
bundle.sentinelFillOutputTensors();

for(auto& tensorPair : bundle.tensors)
auto [serialized, serErr] = graph.to_binary();
if(serErr.code != hipdnn_frontend::ErrorCode::OK || serialized.empty())
{
if(!bundle.isOutput(tensorPair.first))
initializeBundleFallback(bundle, seed);
return;
}

const auto* fb = hipdnn_flatbuffers_sdk::data_objects::GetGraph(serialized.data());
if(fb == nullptr || fb->nodes() == nullptr)
{
initializeBundleFallback(bundle, seed);
return;
}

std::vector<int64_t> leafInputUids;
InputTensorMap inputs;
for(auto& [uid, tensor] : bundle.tensors)
{
if(!bundle.isOutput(uid))
{
bundle.randomizeTensor(tensorPair.first, -1.0f, 1.0f, seed);
leafInputUids.push_back(uid);
inputs[uid] = std::move(tensor);
}
}
}

static float toleranceForNode(const hipdnn_frontend::graph::INode& node,
hipdnn_frontend::DataType dataType)
{
switch(dataType)
std::mt19937 rng(seed);
SynthesisTracker tracker(leafInputUids, inputs);

bool synthesisOk = true;
for(const auto* node : *fb->nodes())
{
case hipdnn_frontend::DataType::FLOAT:
return toleranceForNodeTyped<float>(node);
case hipdnn_frontend::DataType::HALF:
return toleranceForNodeTyped<half>(node);
case hipdnn_frontend::DataType::BFLOAT16:
return toleranceForNodeTyped<bfloat16>(node);
default:
ADD_FAILURE() << "toleranceForNode: unsupported data type";
return 0.0f;
if(node == nullptr)
{
continue;
}
auto result = synthesizeNodeInputs(*node, tracker, rng);
if(!result.filled)
{
synthesisOk = false;
break;
}
}

if(synthesisOk)
{
auto finalResult = tracker.finish("synthesis");
synthesisOk = finalResult.filled;
}

for(auto& [uid, tensor] : inputs)
{
bundle.tensors[uid] = std::move(tensor);
}

if(!synthesisOk)
{
initializeBundleFallback(bundle, seed);
}
}

template <typename T>
static float toleranceForNodeTyped(const hipdnn_frontend::graph::INode& node)
void initializeBundleFallback(hipdnn_test_sdk::utilities::GraphTensorBundle& bundle,
unsigned int seed)
{
namespace fe = hipdnn_frontend::graph;
using namespace hipdnn_test_sdk::utilities;

if(dynamic_cast<const fe::ConvolutionFpropNode*>(&node) != nullptr)
return static_cast<float>(conv::getToleranceFwd<T>());
if(dynamic_cast<const fe::ConvolutionDgradNode*>(&node) != nullptr)
return static_cast<float>(conv::getToleranceBwd<T>());
if(dynamic_cast<const fe::ConvolutionWgradNode*>(&node) != nullptr)
return static_cast<float>(conv::getToleranceWrw<T>());
if(dynamic_cast<const fe::BatchnormInferenceNodeVarianceExt*>(&node) != nullptr)
return static_cast<float>(batchnorm::getToleranceInferenceWithVariance<T>());
if(dynamic_cast<const fe::BatchnormInferenceNode*>(&node) != nullptr)
return static_cast<float>(batchnorm::getToleranceInference<T>());
if(dynamic_cast<const fe::BatchnormNode*>(&node) != nullptr)
return static_cast<float>(batchnorm::getToleranceTraining<T>());
if(dynamic_cast<const fe::BatchnormBackwardNode*>(&node) != nullptr)
return static_cast<float>(batchnorm::getToleranceBackward<T>());
if(dynamic_cast<const fe::MatmulNode*>(&node) != nullptr)
return static_cast<float>(matmul::getTolerance<T>());
if(dynamic_cast<const fe::ReductionNode*>(&node) != nullptr)
return static_cast<float>(reduction::getTolerance<T>());
if(dynamic_cast<const fe::RMSNormNode*>(&node) != nullptr)
return static_cast<float>(rmsnorm::getTolerance<T>());

ADD_FAILURE() << "toleranceForNodeTyped: unsupported node type";
return 0.0f;
for(auto& [uid, tensor] : bundle.tensors)
{
if(!bundle.isOutput(uid))
{
bundle.randomizeTensor(uid, -1.0f, 1.0f, seed);
}
}
}

void executeGpuGraph(hipdnnHandle_t handle,
Expand Down
Loading
Loading