diff --git a/catch/unit/deviceLib/CMakeLists.txt b/catch/unit/deviceLib/CMakeLists.txt index 37774d496..a8ff8e0bb 100644 --- a/catch/unit/deviceLib/CMakeLists.txt +++ b/catch/unit/deviceLib/CMakeLists.txt @@ -90,6 +90,8 @@ set(AMD_TEST_SRC AtomicsWithRandomActiveLanesInWavefront.cc fp16_ops.cc fp8_host.cc + fp4_ocp.cc + fp6_ocp.cc ) set(AMD_ARCH_SPEC_TEST_SRC diff --git a/catch/unit/deviceLib/fp4_ocp.cc b/catch/unit/deviceLib/fp4_ocp.cc new file mode 100644 index 000000000..0070c22a6 --- /dev/null +++ b/catch/unit/deviceLib/fp4_ocp.cc @@ -0,0 +1,647 @@ +/* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +#include + +#include +#include + + +TEST_CASE("Unit_ocp_fp4_sanity_host") { + SECTION("sanityx1") { + std::vector inputs{-1.0f, 0.0f, 1.0f}; + for (const auto input : inputs) { + __hip_fp4_e2m1 fp4(input); + float ret = fp4; + INFO("Original: " << input << " Return: " << ret); + REQUIRE(ret == input); + } + } + + SECTION("sanityx2") { + std::vector inputs{ + {-1.0f, 0.0f}, {0.0f, 1.0f}, {1.0f, -1.0f}, {1.0f, 0.0f}, {0.0f, -1.0f}}; + for (const auto input : inputs) { + __hip_fp4x2_e2m1 fp4x2(input); + float2 ret = fp4x2; + INFO("Original: " << input.x << ", " << input.y << " Return: " << ret.x << ", " << ret.y); + REQUIRE(ret.x == input.x); + REQUIRE(ret.y == input.y); + } + } + + SECTION("sanityx4") { + std::vector inputs{ + {-1.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 1.0f, -0.5f, -1.0f}, {1.0f, 0.0f, 1.0f, -1.0f}}; + for (const auto& input : inputs) { + __hip_fp4x4_e2m1 fp4x4(input); + float4 ret = fp4x4; + INFO("Original: " << input.x << ", " << input.y << ", " << input.z << ", " << input.w + << " Return: " << ret.x << ", " << ret.y << ret.z << ", " << ret.w); + REQUIRE(ret.x == input.x); + REQUIRE(ret.y == input.y); + REQUIRE(ret.z == input.z); + REQUIRE(ret.w == input.w); + } + } +} + +template +static __global__ void lambda_kernel_launch(Lambda l, Type... args) { + l(args...); +} + +TEST_CASE("Unit_ocp_fp4_sanity_device") { + SECTION("sanityx1") { + auto fp4x1_l = [] __device__(float* inputs, float* outputs, size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs{1.0f, 0.0f, -1.0f}; + float *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(float) * inputs.size(), hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x1_l, d_in, d_out, inputs.size()); + std::vector outputs(inputs.size(), 0.0f); + HIP_CHECK( + hipMemcpy(outputs.data(), d_out, sizeof(float) * inputs.size(), hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Original: " << inputs[i] << " Output: " << outputs[i]); + REQUIRE(inputs[i] == outputs[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("sanityx2") { + auto fp4x2_l = [] __device__(float2 * inputs, float2 * outputs, size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4x2_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs{ + {-1.0f, 0.0f}, {0.0f, 1.0f}, {1.0f, -1.0f}, {1.0f, 0.0f}, {0.0f, -1.0f}}; + float2 *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(float2) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float2) * inputs.size())); + + HIP_CHECK( + hipMemcpy(d_in, inputs.data(), sizeof(float2) * inputs.size(), hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x2_l, d_in, d_out, inputs.size()); + std::vector outputs(inputs.size()); + HIP_CHECK( + hipMemcpy(outputs.data(), d_out, sizeof(float2) * inputs.size(), hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Original: " << inputs[i].x << ", " << inputs[i].y << " Output: " << outputs[i].x << ", " + << outputs[i].y); + REQUIRE(inputs[i].x == outputs[i].x); + REQUIRE(inputs[i].y == outputs[i].y); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("sanityx4") { + auto fp4x4_l = [] __device__(float4 * inputs, float4 * outputs, size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4x4_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs{ + {-1.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 1.0f, -0.5f, -1.0f}, {1.0f, 0.0f, 1.0f, -1.0f}}; + float4 *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(float4) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float4) * inputs.size())); + + HIP_CHECK( + hipMemcpy(d_in, inputs.data(), sizeof(float4) * inputs.size(), hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x4_l, d_in, d_out, inputs.size()); + std::vector outputs(inputs.size()); + HIP_CHECK( + hipMemcpy(outputs.data(), d_out, sizeof(float4) * inputs.size(), hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Original: " << inputs[i].x << ", " << inputs[i].y << ", " << inputs[i].z << ", " + << inputs[i].w << " Output: " << outputs[i].x << ", " << outputs[i].y + << ", " << outputs[i].z << ", " << outputs[i].w); + REQUIRE(inputs[i].x == outputs[i].x); + REQUIRE(inputs[i].y == outputs[i].y); + REQUIRE(inputs[i].z == outputs[i].z); + REQUIRE(inputs[i].w == outputs[i].w); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } +} + +TEST_CASE("Unit_ocp_fp4_full_range_host") { + // FP4 is -6 to +6 + std::vector all_fp4{-6.0f, -4.0f, -3.0f, -2.0f, -1.5f, -1.0f, -0.5f, 0.0f, + 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f}; + std::vector in; + in.reserve(30); + for (float i = -6.0f; i <= 6.0f; i += 0.5f) { + in.push_back(i); + } + + std::vector expected{-6.0f, -6.0f, -4.0f, -4.0f, -4.0f, -4.0f, -3.0f, -2.0f, -2.0f, + -1.5f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 2.0f, + 3.0f, 4.0f, 4.0f, 4.0f, 4.0f, 6.0f, 6.0f}; + + for (size_t i = 0; i < in.size(); i++) { + __hip_fp4_e2m1 fp4(in[i]); + float fp32 = fp4; + INFO("Original: " << in[i] << " Output: " << fp32 << " Expected: " << expected[i]); + REQUIRE(expected[i] == fp32); + } +} + +TEST_CASE("Unit_ocp_fp4_full_range_device") { + std::vector all_fp4{-6.0f, -4.0f, -3.0f, -2.0f, -1.5f, -1.0f, -0.5f, 0.0f, + 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f}; + auto fp4x1_l = [] __device__(float* inputs, float* outputs, size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs; + inputs.reserve(30); + for (float i = -6.0f; i <= 6.0f; i += 0.5f) { + inputs.push_back(i); + } + + std::vector expected{-6.0f, -6.0f, -4.0f, -4.0f, -4.0f, -4.0f, -3.0f, -2.0f, -2.0f, + -1.5f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 2.0f, + 3.0f, 4.0f, 4.0f, 4.0f, 4.0f, 6.0f, 6.0f}; + + float *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(float) * inputs.size(), hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 16>>>(fp4x1_l, d_in, d_out, inputs.size()); + lambda_kernel_launch<<<1, 16>>>(fp4x1_l, d_in + 16, d_out + 16, inputs.size() - 16); + + std::vector outputs(inputs.size(), 0.0f); + HIP_CHECK(hipMemcpy(outputs.data(), d_out, sizeof(float) * inputs.size(), hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Index: " << i << " Original: " << inputs[i] << " Output: " << outputs[i] + << " Expected: " << expected[i]); + REQUIRE(expected[i] == outputs[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); +} + +/* +Test bfloat and half type convertions on host +*/ +TEST_CASE("Unit_fp4_cvt_bfloat_half_host") { + float f1 = 0.5f; + float2 f2 = {-1.0f, 2.0f}; + + SECTION("e2m1_ocp_bfloat") { + auto bf16_val = __float2bfloat16(f1); + __hip_fp4_e2m1 tmp(bf16_val); + __hip_fp4_e2m1 tmp1; + tmp1.__x = __hip_cvt_bfloat16raw_to_fp4(bf16_val, __HIP_E2M1, hipRoundZero); + float bf2_1 = tmp1; + float f1_1 = tmp; + REQUIRE(f1 == bf2_1); + REQUIRE(f1 == f1_1); + } + SECTION("e2m1_ocp_bfloat2") { + auto bf162_val = __float22bfloat162_rn(f2); + __hip_fp4x2_e2m1 tmp(bf162_val); + __hip_fp4x2_e2m1 tmp1; + tmp1.__x = __hip_cvt_bfloat16raw2_to_fp4x2(bf162_val, __HIP_E2M1, hipRoundZero); + float2 bf2_2 = tmp1; + float2 f2_2 = tmp; + REQUIRE(f2 == bf2_2); + REQUIRE(f2 == f2_2); + } + SECTION("e2m1_ocp_half") { + auto half_val = __float2half(f1); + __hip_fp4_e2m1 tmp(half_val); + __hip_fp4_e2m1 tmp1; + tmp1.__x = __hip_cvt_halfraw_to_fp4(half_val, __HIP_E2M1, hipRoundZero); + float half_1 = tmp1; + float f1_1 = tmp; + REQUIRE(f1 == half_1); + REQUIRE(f1 == f1_1); + } + SECTION("e2m1_ocp_half2") { + auto half2_val = __float22half2_rn(f2); + __hip_fp4x2_e2m1 tmp(half2_val); + __hip_fp4x2_e2m1 tmp1; + tmp1.__x = __hip_cvt_halfraw2_to_fp4x2(half2_val, __HIP_E2M1, hipRoundZero); + float2 h2_2 = tmp1; + float2 f2_2 = tmp; + REQUIRE(f2 == h2_2); + REQUIRE(f2 == f2_2); + } +} + +template __global__ void Type_to_bfloat_half(T* in, float* cvt1, float* cvt2) { + T val = in[0]; + __hip_fp4_e2m1 tmp(val); + __hip_fp4_e2m1 tmp1; + if constexpr (std::is_same::value) + tmp1.__x = __hip_cvt_bfloat16raw_to_fp4(val, __HIP_E2M1, hipRoundZero); + else + tmp1.__x = __hip_cvt_halfraw_to_fp4(val, __HIP_E2M1, hipRoundZero); + *cvt1 = tmp1; + *cvt2 = tmp; +} + +template __global__ void Type_to_bfloat2_half2(T* in, float2* cvt1, float2* cvt2) { + T val = in[0]; + __hip_fp4x2_e2m1 tmp(val); + __hip_fp4x2_e2m1 tmp1; + if constexpr (std::is_same::value) + tmp1.__x = __hip_cvt_bfloat16raw2_to_fp4x2(val, __HIP_E2M1, hipRoundZero); + else + tmp1.__x = __hip_cvt_halfraw2_to_fp4x2(val, __HIP_E2M1, hipRoundZero); + *cvt1 = tmp1; + *cvt2 = tmp; +} + +/* +Test bfloat and half type convertions on device +*/ + +TEST_CASE("Unit_fp4_cvt_bfloat_half_device") { + float f1 = 0.5f; + float2 f2 = {-1.0f, 2.0f}; + + SECTION("fp4_ocp_bfloat") { + auto bf16_val = __float2bfloat16(f1); + float bf1_1, f1_1; + __hip_bfloat16* d_val; + HIP_CHECK(hipMalloc((void**)&d_val, sizeof(__hip_bfloat16))); + float* d_f1; + HIP_CHECK(hipMalloc((void**)&d_f1, sizeof(float))); + float* d_f2; + HIP_CHECK(hipMalloc((void**)&d_f2, sizeof(float))); + HIP_CHECK(hipMemcpy(d_val, &bf16_val, sizeof(__hip_bfloat16), hipMemcpyHostToDevice)); + auto fp4_kernel = Type_to_bfloat_half<__hip_bfloat16>; + fp4_kernel<<<1, 1>>>(d_val, d_f1, d_f2); + + HIP_CHECK(hipMemcpy(&bf1_1, d_f1, sizeof(float), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&f1_1, d_f2, sizeof(float), hipMemcpyDeviceToHost)); + + REQUIRE(f1 == bf1_1); + REQUIRE(f1 == f1_1); + + HIP_CHECK(hipFree(d_val)); + HIP_CHECK(hipFree(d_f1)); + HIP_CHECK(hipFree(d_f2)); + } + + SECTION("fp4_ocp_bfloat2") { + auto bf162_val = __float22bfloat162_rn(f2); + float2 bf2_2, f2_2; + + __hip_bfloat162* d_val; + HIP_CHECK(hipMalloc((void**)&d_val, sizeof(__hip_bfloat162))); + float2* d_f1; + HIP_CHECK(hipMalloc((void**)&d_f1, sizeof(float2))); + float2* d_f2; + HIP_CHECK(hipMalloc((void**)&d_f2, sizeof(float2))); + + HIP_CHECK(hipMemcpy(d_val, &bf162_val, sizeof(__hip_bfloat162), hipMemcpyHostToDevice)); + auto fp4_kernel = Type_to_bfloat2_half2<__hip_bfloat162>; + fp4_kernel<<<1, 1>>>(d_val, d_f1, d_f2); + + HIP_CHECK(hipMemcpy(&bf2_2, d_f1, sizeof(float2), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&f2_2, d_f2, sizeof(float2), hipMemcpyDeviceToHost)); + + REQUIRE(f2 == bf2_2); + REQUIRE(f2 == f2_2); + HIP_CHECK(hipFree(d_val)); + HIP_CHECK(hipFree(d_f1)); + HIP_CHECK(hipFree(d_f2)); + } + SECTION("fp4_ocp_half") { + auto half_val = __float2half(f1); + float h1_1, f1_1; + __half* d_val; + HIP_CHECK(hipMalloc((void**)&d_val, sizeof(__half))); + float* d_f1; + HIP_CHECK(hipMalloc((void**)&d_f1, sizeof(float))); + float* d_f2; + HIP_CHECK(hipMalloc((void**)&d_f2, sizeof(float))); + + HIP_CHECK(hipMemcpy(d_val, &half_val, sizeof(__half), hipMemcpyHostToDevice)); + auto fp4_kernel = Type_to_bfloat_half<__half>; + fp4_kernel<<<1, 1>>>(d_val, d_f1, d_f2); + + HIP_CHECK(hipMemcpy(&h1_1, d_f1, sizeof(float), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&f1_1, d_f2, sizeof(float), hipMemcpyDeviceToHost)); + + REQUIRE(f1 == h1_1); + REQUIRE(f1 == f1_1); + + HIP_CHECK(hipFree(d_val)); + HIP_CHECK(hipFree(d_f1)); + HIP_CHECK(hipFree(d_f2)); + } + SECTION("fp4_ocp_half2") { + auto half2_val = __float22half2_rn(f2); + float2 h2_2, f2_2; + + __half2* d_val; + HIP_CHECK(hipMalloc((void**)&d_val, sizeof(__half2))); + float2* d_f1; + HIP_CHECK(hipMalloc((void**)&d_f1, sizeof(float2))); + float2* d_f2; + HIP_CHECK(hipMalloc((void**)&d_f2, sizeof(float2))); + + HIP_CHECK(hipMemcpy(d_val, &half2_val, sizeof(__half2), hipMemcpyHostToDevice)); + auto fp4_kernel = Type_to_bfloat2_half2<__half2>; + fp4_kernel<<<1, 1>>>(d_val, d_f1, d_f2); + + HIP_CHECK(hipMemcpy(&h2_2, d_f1, sizeof(float2), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&f2_2, d_f2, sizeof(float2), hipMemcpyDeviceToHost)); + + REQUIRE(f2 == h2_2); + REQUIRE(f2 == f2_2); + HIP_CHECK(hipFree(d_val)); + HIP_CHECK(hipFree(d_f1)); + HIP_CHECK(hipFree(d_f2)); + } +} + +TEST_CASE("Unit_fp4_scale_tests") { + std::vector inputs; + std::vector inputsx2, expectedx2; + inputs.reserve(30); + for (float i = -6.0f; i <= 6.0f; i += 0.5f) { + inputs.push_back(i); + } + + + std::vector expected{-6.0f, -6.0f, -4.0f, -4.0f, -4.0f, -4.0f, -3.0f, -2.0f, -2.0f, + -1.5f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 2.0f, + 3.0f, 4.0f, 4.0f, 4.0f, 4.0f, 6.0f, 6.0f}; + + REQUIRE(inputs.size() == expected.size()); + for (size_t i = 0; i < inputs.size(); i++) { + inputsx2.push_back(float2{inputs[i], inputs[inputs.size() - 1 - i]}); + expectedx2.push_back(float2{expected[i], expected[inputs.size() - 1 - i]}); + } + + SECTION("fp4 to half_raw") { + auto l_kernel = [] __device__(float* inputs, float* outputs, size_t size, + __hip_fp4_storage_t* mid) { + int i = threadIdx.x; + if (i < size) { + { + __hip_fp4_e2m1 fp4(inputs[i]); + auto hr = __hip_cvt_fp4_to_halfraw(fp4.__x, __HIP_E2M1); + mid[i] = fp4.__x; + __half h(hr); + outputs[i] = h; + } + { + __hip_fp4_e2m1 fp4(inputs[i + 16]); + auto hr = __hip_cvt_fp4_to_halfraw(fp4.__x, __HIP_E2M1); + mid[i + 16] = fp4.__x; + __half h(hr); + outputs[i + 16] = h; + } + } + }; + + + float *d_in, *d_out; + __hip_fp4_storage_t* d_mid; + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * inputs.size())); + HIP_CHECK(hipMalloc(&d_mid, sizeof(__hip_fp4_storage_t) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(float) * inputs.size(), hipMemcpyHostToDevice)); + // lambda_kernel_launch<<<1, 32>>>(l_kernel, d_in, d_out, inputs.size(), d_mid); + lambda_kernel_launch<<<1, 16>>>(l_kernel, d_in, d_out, inputs.size(), d_mid); + // lambda_kernel_launch<<<1, 16>>>(l_kernel, d_in + 16, d_out + 16, inputs.size(), d_mid + 16); + + std::vector outputs(inputs.size(), 0.0f); + std::vector<__hip_fp4_storage_t> mid(inputs.size(), 99u); + HIP_CHECK( + hipMemcpy(outputs.data(), d_out, sizeof(float) * inputs.size(), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(mid.data(), d_mid, sizeof(__hip_fp4_storage_t) * inputs.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Index: " << i << " Original: " << inputs[i] << " Output: " << outputs[i] + << " Expected: " << expected[i]); + INFO("mid: " << unsigned(mid[i])); + REQUIRE(expected[i] == outputs[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_mid)); + } + + SECTION("fp4x2 to half_raw2") { + auto l_kernel = [] __device__(float2 * inputs, float2 * outputs, size_t size, + __hip_fp4x2_storage_t* mid) { + int i = threadIdx.x; + if (i < size) { + { + __hip_fp4x2_e2m1 fp4(inputs[i]); + auto hr = __hip_cvt_fp4x2_to_halfraw2(fp4.__x, __HIP_E2M1); + mid[i] = fp4.__x; + __half2 h(hr); + outputs[i] = __half22float2(h); + } + { + __hip_fp4x2_e2m1 fp4(inputs[i + 16]); + auto hr = __hip_cvt_fp4x2_to_halfraw2(fp4.__x, __HIP_E2M1); + mid[i + 16] = fp4.__x; + __half2 h(hr); + outputs[i + 16] = __half22float2(h); + } + } + }; + + + float2 *d_in, *d_out; + __hip_fp4x2_storage_t* d_mid; + HIP_CHECK(hipMalloc(&d_in, sizeof(float2) * inputsx2.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float2) * inputsx2.size())); + HIP_CHECK(hipMalloc(&d_mid, sizeof(__hip_fp4x2_storage_t) * inputsx2.size())); + + HIP_CHECK( + hipMemcpy(d_in, inputsx2.data(), sizeof(float2) * inputsx2.size(), hipMemcpyHostToDevice)); + // lambda_kernel_launch<<<1, 32>>>(l_kernel, d_in, d_out, inputsx2.size(), d_mid); + lambda_kernel_launch<<<1, 16>>>(l_kernel, d_in, d_out, inputsx2.size(), d_mid); + lambda_kernel_launch<<<1, 16>>>(l_kernel, d_in + 16, d_out + 16, inputsx2.size(), d_mid + 16); + + std::vector outputs(inputsx2.size(), float2{0.0f, 0.0f}); + std::vector<__hip_fp4x2_storage_t> mid(inputsx2.size(), 99u); + HIP_CHECK( + hipMemcpy(outputs.data(), d_out, sizeof(float2) * inputsx2.size(), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(mid.data(), d_mid, sizeof(__hip_fp4x2_storage_t) * inputsx2.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputsx2.size(); i++) { + INFO("Index: " << i << " Original: " << inputsx2[i].x << ", " << inputsx2[i].y + << " Output: " << outputs[i].x << ", " << outputs[i].y + << " Expected: " << expectedx2[i].x << ", " << expectedx2[i].y); + INFO("mid: " << unsigned(mid[i])); + REQUIRE(expectedx2[i].x == outputs[i].x); + REQUIRE(expectedx2[i].y == outputs[i].y); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_mid)); + } + + + SECTION("fp4 to bf16") { + auto l_kernel = [] __device__(float* inputs, float* outputs, size_t size, + __hip_fp4_storage_t* mid) { + int i = threadIdx.x; + if (i < size) { + { + __hip_fp4_e2m1 fp4(inputs[i]); + __hip_bfloat16_raw bfr = fp4; + __hip_bfloat16 bf(bfr); + mid[i] = fp4.__x; + outputs[i] = bf; + } + { + __hip_fp4_e2m1 fp4(inputs[i + 16]); + __hip_bfloat16_raw bfr = fp4; + __hip_bfloat16 bf(bfr); + mid[i + 16] = fp4.__x; + outputs[i + 16] = bf; + } + } + }; + + float *d_in, *d_out; + __hip_fp4_storage_t* d_mid; + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * inputs.size())); + HIP_CHECK(hipMalloc(&d_mid, sizeof(__hip_fp4_storage_t) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(float) * inputs.size(), hipMemcpyHostToDevice)); + // lambda_kernel_launch<<<1, 32>>>(l_kernel, d_in, d_out, inputs.size(), d_mid); + lambda_kernel_launch<<<1, 16>>>(l_kernel, d_in, d_out, inputs.size(), d_mid); + lambda_kernel_launch<<<1, 16>>>(l_kernel, d_in + 16, d_out + 16, inputs.size(), d_mid + 16); + + std::vector outputs(inputs.size(), 0.0f); + std::vector<__hip_fp4_storage_t> mid(inputs.size(), 99u); + HIP_CHECK( + hipMemcpy(outputs.data(), d_out, sizeof(float) * inputs.size(), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(mid.data(), d_mid, sizeof(__hip_fp4_storage_t) * inputs.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Index: " << i << " Original: " << inputs[i] << " Output: " << outputs[i] + << " Expected: " << expected[i]); + INFO("mid: " << unsigned(mid[i])); + REQUIRE(expected[i] == outputs[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_mid)); + } + + SECTION("fp4x2 to bf162") { + auto l_kernel = [] __device__(float2 * inputs, float2 * outputs, size_t size, + __hip_fp4x2_storage_t* mid) { + int i = threadIdx.x; + if (i < size) { + { + __hip_fp4x2_e2m1 fp4(inputs[i]); + __hip_bfloat162_raw bfr = fp4; + __hip_bfloat162 bf(bfr); + mid[i] = fp4.__x; + outputs[i] = bf; + } + } + }; + + + float2 *d_in, *d_out; + __hip_fp4x2_storage_t* d_mid; + HIP_CHECK(hipMalloc(&d_in, sizeof(float2) * inputsx2.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float2) * inputsx2.size())); + HIP_CHECK(hipMalloc(&d_mid, sizeof(__hip_fp4x2_storage_t) * inputsx2.size())); + + HIP_CHECK( + hipMemcpy(d_in, inputsx2.data(), sizeof(float2) * inputsx2.size(), hipMemcpyHostToDevice)); + // lambda_kernel_launch<<<1, 32>>>(l_kernel, d_in, d_out, inputsx2.size(), d_mid); + lambda_kernel_launch<<<1, 16>>>(l_kernel, d_in, d_out, inputsx2.size(), d_mid); + lambda_kernel_launch<<<1, 16>>>(l_kernel, d_in + 16, d_out + 16, inputsx2.size(), d_mid + 16); + + std::vector outputs(inputsx2.size(), float2{0.0f, 0.0f}); + std::vector<__hip_fp4x2_storage_t> mid(inputsx2.size(), 99u); + HIP_CHECK( + hipMemcpy(outputs.data(), d_out, sizeof(float2) * inputsx2.size(), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(mid.data(), d_mid, sizeof(__hip_fp4x2_storage_t) * inputsx2.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputsx2.size(); i++) { + INFO("Index: " << i << " Original: " << inputsx2[i].x << ", " << inputsx2[i].y + << " Output: " << outputs[i].x << ", " << outputs[i].y + << " Expected: " << expectedx2[i].x << ", " << expectedx2[i].y); + INFO("mid: " << unsigned(mid[i])); + REQUIRE(expectedx2[i].x == outputs[i].x); + REQUIRE(expectedx2[i].y == outputs[i].y); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_mid)); + } + +} \ No newline at end of file diff --git a/catch/unit/deviceLib/fp6_ocp.cc b/catch/unit/deviceLib/fp6_ocp.cc new file mode 100644 index 000000000..2320831fa --- /dev/null +++ b/catch/unit/deviceLib/fp6_ocp.cc @@ -0,0 +1,700 @@ +/* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +#include +#include +#include + +std::vector<__hip_fp6_storage_t> get_all_fp6_ocp_nums() { + std::vector<__hip_fp6_storage_t> ret; + constexpr unsigned short max_fp6_num = 0b0011'1111; + ret.reserve(max_fp6_num + 1); + + for (unsigned short i = 0; i <= max_fp6_num; i++) { + ret.push_back(static_cast<__hip_fp6_storage_t>(i)); + } + return ret; +} + + +template +__global__ void Type_to_fp6(T* f, __hip_fp6_storage_t* res, size_t size) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + if constexpr (is_e2m3) { + __hip_fp6_e2m3 tmp(f[i]); + res[i] = tmp.__x; + } else { + __hip_fp6_e3m2 tmp(f[i]); + res[i] = tmp.__x; + } + } +} + +/* Test Description +- Generate fp6 numbers +- convert fp6 numbers to float on host +- convert float to fp6 on device +- compare fp6 result from device with original value +*/ +TEMPLATE_TEST_CASE("Unit_all_fp6_ocp", "", float, double) { + bool is_e2m3 = GENERATE(true, false); + std::vector f_vals; + std::vector<__hip_fp6_storage_t> all_vals; + + SECTION("all representable numbers") { + all_vals = get_all_fp6_ocp_nums(); + f_vals.reserve(all_vals.size()); + + for (const auto& fp6 : all_vals) { + TestType f = 0.0f; + if (is_e2m3) { + __hip_fp6_e2m3 tmp; + tmp.__x = fp6; + f = tmp; + } else { + __hip_fp6_e3m2 tmp; + tmp.__x = fp6; + f = tmp; + } + f_vals.push_back(f); + } + } + + SECTION("Range stepped numbers") { + constexpr TestType lhs = -30; + constexpr TestType rhs = 30; + constexpr TestType step = 0.1234f; + + f_vals.reserve(500); + all_vals.reserve(500); + + for (float fval = lhs; fval <= rhs; fval += step) { + if (is_e2m3) { + __hip_fp6_e2m3 tmp(fval); + all_vals.push_back(tmp.__x); + } else { + __hip_fp6_e3m2 tmp(fval); + all_vals.push_back(tmp.__x); + } + f_vals.push_back(fval); + } + } + + TestType* d_f_vals; + __hip_fp6_storage_t* d_res; + + HIP_CHECK(hipMalloc(&d_f_vals, sizeof(TestType) * f_vals.size())); + HIP_CHECK(hipMalloc(&d_res, sizeof(__hip_fp6_storage_t) * f_vals.size())); + + HIP_CHECK( + hipMemcpy(d_f_vals, f_vals.data(), sizeof(TestType) * f_vals.size(), hipMemcpyHostToDevice)); + + auto fp6_kernel = is_e2m3 ? Type_to_fp6 : Type_to_fp6; + fp6_kernel<<<(f_vals.size() / 64) + 1, 64>>>(d_f_vals, d_res, f_vals.size()); + + std::vector<__hip_fp6_storage_t> final_res(f_vals.size(), static_cast<__hip_fp6_storage_t>(0)); + + HIP_CHECK(hipMemcpy(final_res.data(), d_res, sizeof(__hip_fp6_storage_t) * final_res.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < final_res.size(); i++) { + INFO("Checking: " << f_vals[i] << " for: " << (is_e2m3 ? "e2m3" : "e3m2") + << " original: " << (int)all_vals[i] << " convert back: " << (int)final_res[i] + << " Idx : " << i); + TestType gpu_cvt_res = 0.0f, cpu_cvt_res = 0.0f; + if (is_e2m3) { + __hip_fp6_e2m3 gtmp; + gtmp.__x = final_res[i]; + gpu_cvt_res = gtmp; + __hip_fp6_e2m3 ctmp; + ctmp.__x = all_vals[i]; + cpu_cvt_res = ctmp; + } else { + __hip_fp6_e3m2 gtmp; + gtmp.__x = final_res[i]; + gpu_cvt_res = gtmp; + __hip_fp6_e3m2 ctmp; + ctmp.__x = all_vals[i]; + cpu_cvt_res = ctmp; + } + + INFO("cpu cvt val: " << cpu_cvt_res << " gpu cvt val: " << gpu_cvt_res); + CHECK(cpu_cvt_res == gpu_cvt_res); + } + + HIP_CHECK(hipFree(d_f_vals)); + HIP_CHECK(hipFree(d_res)); +} + +/* Test Description +- Sanity testing for cvt x2 and x4 packed types +- convert cvt float2/float4 types back to float2/float4 on host +- compare results with original value +*/ +TEMPLATE_TEST_CASE("Unit_all_fp6_ocp_vector_cvt_host", "", float, double) { + constexpr bool is_float = std::is_same::value; + using vtype2 = typename std::conditional::type; + using vtype4 = typename std::conditional::type; + + std::vector inputx2{ + {-1.0f, 0.0f}, {0.0f, 1.0f}, {1.0f, -1.0f}, {1.0f, 0.0f}, {0.0f, -1.0f}}; + std::vector inputx4{ + {-1.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 1.0f, -0.5f, -1.0f}, {1.0f, 0.0f, 1.0f, -1.0f}}; + + SECTION("e2m3_x2") { + for (const auto val : inputx2) { + __hip_fp6x2_e2m3 fp6x2(val); + vtype2 ret = fp6x2; + INFO("In: " << val.x << " - " << val.y); + INFO("Out: " << ret.x << " - " << ret.y); + REQUIRE(ret.x == val.x); + REQUIRE(ret.y == val.y); + } + } + SECTION("e2m3_x4") { + for (const auto val : inputx4) { + __hip_fp6x4_e2m3 fp6x4(val); + vtype4 ret = fp6x4; + INFO("In: " << val.x << " - " << val.y << " - " << val.z << " - " << val.w); + INFO("Out: " << ret.x << " - " << ret.y << " - " << ret.z << " - " << ret.w); + REQUIRE(ret.x == val.x); + REQUIRE(ret.y == val.y); + REQUIRE(ret.z == val.z); + REQUIRE(ret.w == val.w); + } + } + SECTION("e3m2_x2") { + for (const auto val : inputx2) { + __hip_fp6x2_e3m2 fp6x2(val); + vtype2 ret = fp6x2; + INFO("In: " << val.x << " - " << val.y); + INFO("Out: " << ret.x << " - " << ret.y); + REQUIRE(ret.x == val.x); + REQUIRE(ret.y == val.y); + } + } + SECTION("e3m2_x4") { + for (const auto val : inputx4) { + __hip_fp6x4_e3m2 fp6x4(val); + vtype4 ret = fp6x4; + INFO("In: " << val.x << " - " << val.y << " - " << val.z << " - " << val.w); + INFO("Out: " << ret.x << " - " << ret.y << " - " << ret.z << " - " << ret.w); + REQUIRE(ret.x == val.x); + REQUIRE(ret.y == val.y); + REQUIRE(ret.z == val.z); + REQUIRE(ret.w == val.w); + } + } +} + +template +__global__ void cvt_float4_fp6x4_float4_ocp(T* in, size_t size) { + int i = threadIdx.x; + if (i < size) { + T val = in[i]; + if constexpr (is_e2m3) { + __hip_fp6x4_e2m3 tmp(val); + in[i] = tmp; + } else { + __hip_fp6x4_e3m2 tmp(val); + in[i] = tmp; + } + } +} + +template +__global__ void cvt_float2_fp6x2_float2_ocp(T* in, size_t size) { + int i = threadIdx.x; + if (i < size) { + T val = in[i]; + if constexpr (is_e2m3) { + __hip_fp6x2_e2m3 tmp(val); + in[i] = tmp; + } else { + __hip_fp6x2_e3m2 tmp(val); + in[i] = tmp; + } + } +} + +/* Test Description +- Sanity testing for cvt x2 and x4 packed types on device +- convert cvt float2/float4 types back to float2/float4 on device +- compare results with original value +*/ +TEMPLATE_TEST_CASE("Unit_all_fp6_ocp_vector_cvt_device", "", float, double) { + constexpr bool is_float = std::is_same::value; + using vtype2 = typename std::conditional::type; + using vtype4 = typename std::conditional::type; + + std::vector inputx2{ + {-1.0f, 0.0f}, {0.0f, 1.0f}, {1.0f, -1.0f}, {1.0f, 0.0f}, {0.0f, -1.0f}}; + std::vector inputx4{ + {-1.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 1.0f, -0.5f, -1.0f}, {1.0f, -0.5f, 1.0f, -1.0f}}; + + std::vector outputx2(inputx2.size()); + std::vector outputx4(inputx4.size()); + + vtype2* d_in2; + vtype4* d_in4; + + HIP_CHECK(hipMalloc(&d_in2, sizeof(vtype2) * inputx2.size())); + HIP_CHECK(hipMalloc(&d_in4, sizeof(vtype4) * inputx4.size())); + + HIP_CHECK( + hipMemcpy(d_in2, inputx2.data(), sizeof(vtype2) * inputx2.size(), hipMemcpyHostToDevice)); + + HIP_CHECK( + hipMemcpy(d_in4, inputx4.data(), sizeof(vtype4) * inputx4.size(), hipMemcpyHostToDevice)); + + SECTION("e2m3_x2") { + auto kern_fp6x2 = cvt_float2_fp6x2_float2_ocp; + kern_fp6x2<<<1, 32>>>(d_in2, inputx2.size()); + HIP_CHECK( + hipMemcpy(outputx2.data(), d_in2, sizeof(vtype2) * inputx2.size(), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < inputx2.size(); i++) { + INFO("Original: " << inputx2[i].x << ", " << inputx2[i].y << " Output: " << outputx2[i].x + << ", " << outputx2[i].y); + CHECK(inputx2[i].x == outputx2[i].x); + CHECK(inputx2[i].y == outputx2[i].y); + } + } + SECTION("e2m3_x4") { + auto kern_fp6x4 = cvt_float4_fp6x4_float4_ocp; + kern_fp6x4<<<1, 32>>>(d_in4, inputx4.size()); + HIP_CHECK( + hipMemcpy(outputx4.data(), d_in4, sizeof(vtype4) * inputx4.size(), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < inputx4.size(); i++) { + INFO("Original: " << inputx4[i].x << ", " << inputx4[i].y << ", " << inputx4[i].z << ", " + << inputx4[i].w << " Output: " << outputx4[i].x << ", " << outputx4[i].y + << ", " << outputx4[i].z << ", " << outputx4[i].w); + CHECK(inputx4[i].x == outputx4[i].x); + CHECK(inputx4[i].y == outputx4[i].y); + CHECK(inputx4[i].z == outputx4[i].z); + CHECK(inputx4[i].w == outputx4[i].w); + } + } + SECTION("e3m2_x2") { + auto kern_fp6x2 = cvt_float2_fp6x2_float2_ocp; + kern_fp6x2<<<1, 32>>>(d_in2, inputx2.size()); + HIP_CHECK( + hipMemcpy(outputx2.data(), d_in2, sizeof(vtype2) * inputx2.size(), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < inputx2.size(); i++) { + INFO("Original: " << inputx2[i].x << ", " << inputx2[i].y << " Output: " << outputx2[i].x + << ", " << outputx2[i].y); + CHECK(inputx2[i].x == outputx2[i].x); + CHECK(inputx2[i].y == outputx2[i].y); + } + } + SECTION("e3m2_x4") { + auto kern_fp6x4 = cvt_float4_fp6x4_float4_ocp; + kern_fp6x4<<<1, 32>>>(d_in4, inputx4.size()); + HIP_CHECK( + hipMemcpy(outputx4.data(), d_in4, sizeof(vtype4) * inputx4.size(), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < inputx4.size(); i++) { + INFO("Original: " << inputx4[i].x << ", " << inputx4[i].y << ", " << inputx4[i].z << ", " + << inputx4[i].w << " Output: " << outputx4[i].x << ", " << outputx4[i].y + << ", " << outputx4[i].z << ", " << outputx4[i].w); + CHECK(inputx4[i].x == outputx4[i].x); + CHECK(inputx4[i].y == outputx4[i].y); + CHECK(inputx4[i].z == outputx4[i].z); + CHECK(inputx4[i].w == outputx4[i].w); + } + } + HIP_CHECK(hipFree(d_in2)); + HIP_CHECK(hipFree(d_in4)); +} + +template +__global__ void Type_to_fp6_ocp(T* in, float* cvt1, float* cvt2, size_t size) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + T val = in[i]; + if constexpr (is_e2m3) { + __hip_fp6_e2m3 tmp(val); + cvt1[i] = tmp; + __hip_fp6_e2m3 tmp1; + tmp1.__x = std::is_same::value + ? __hip_cvt_float_to_fp6(val, __HIP_E2M3, hipRoundZero) + : __hip_cvt_double_to_fp6(val, __HIP_E2M3, hipRoundZero); + cvt2[i] = tmp1; + } else { + __hip_fp6_e3m2 tmp(val); + cvt1[i] = tmp; + __hip_fp6_e3m2 tmp1; + tmp1.__x = std::is_same::value + ? __hip_cvt_float_to_fp6(val, __HIP_E3M2, hipRoundZero) + : __hip_cvt_double_to_fp6(val, __HIP_E3M2, hipRoundZero); + cvt2[i] = tmp1; + } + } +} + +/* Test Description +- Test all e2m3/e3m2 numbers on device +- copy all numbers to device and convert back to float +- compare results with original value on host +*/ +TEMPLATE_TEST_CASE("Unit_fp6_ocp_full_range_device", "", float, double) { + SECTION("e2m3_ocp") { + std::vector e2m3_ocp_nums = { + 0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1, 1.125, 1.25, + 1.375, 1.5, 1.625, 1.75, 1.875, 2, 2.25, 2.5, 2.75, 3, 3.25, + 3.5, 3.75, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, -0, + -0.125, -0.25, -0.375, -0.5, -0.625, -0.75, -0.875, -1, -1.125, -1.25, -1.375, + -1.5, -1.625, -1.75, -1.875, -2, -2.25, -2.5, -2.75, -3, -3.25, -3.5, + -3.75, -4, -4.5, -5, -5.5, -6, -6.5, -7, -7.5}; + size_t totalnums = e2m3_ocp_nums.size(); + TestType* fnums; + HIP_CHECK(hipMalloc((void**)&fnums, totalnums * sizeof(TestType))); + float* cvt1_dev; + HIP_CHECK(hipMalloc((void**)&cvt1_dev, totalnums * sizeof(float))); + float* cvt2_dev; + HIP_CHECK(hipMalloc((void**)&cvt2_dev, totalnums * sizeof(float))); + + HIP_CHECK(hipMemcpy(fnums, e2m3_ocp_nums.data(), totalnums * sizeof(TestType), + hipMemcpyHostToDevice)); + + auto fp6_kernel = Type_to_fp6_ocp; + fp6_kernel<<>>(fnums, cvt1_dev, cvt2_dev, totalnums); + + float* cvt1_host = (float*)malloc(sizeof(float) * totalnums); + float* cvt2_host = (float*)malloc(sizeof(float) * totalnums); + + HIP_CHECK(hipMemcpy(cvt1_host, cvt1_dev, totalnums * sizeof(float), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(cvt2_host, cvt2_dev, totalnums * sizeof(float), hipMemcpyDeviceToHost)); + + HIP_CHECK(hipDeviceSynchronize()); + + for (size_t idx = 0; idx < totalnums; idx++) { + TestType orig = e2m3_ocp_nums[idx]; + float cvt1 = cvt1_host[idx]; + float cvt2 = cvt2_host[idx]; + + INFO("Original: " << std::bitset<32>(*reinterpret_cast(&orig))); + INFO("Cvt back: " << std::bitset<32>(*reinterpret_cast(&cvt1))); + REQUIRE(cvt1 == Approx(orig)); + REQUIRE(cvt2 == cvt1); + } + + HIP_CHECK(hipFree(fnums)); + HIP_CHECK(hipFree(cvt1_dev)); + HIP_CHECK(hipFree(cvt2_dev)); + free(cvt1_host); + free(cvt2_host); + } + SECTION("e3m2_ocp") { + std::vector e3m2_ocp_nums = { + 0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375, 0.5, 0.625, 0.75, + 0.875, 1, 1.25, 1.5, 1.75, 2, 2.5, 3, 3.5, 4, 5, + 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, -0, + -0.0625, -0.125, -0.1875, -0.25, -0.3125, -0.375, -0.4375, -0.5, -0.625, -0.75, -0.875, + -1, -1.25, -1.5, -1.75, -2, -2.5, -3, -3.5, -4, -5, -6, + -7, -8, -10, -12, -14, -16, -20, -24, -28}; + size_t totalnums = e3m2_ocp_nums.size(); + TestType* fnums; + HIP_CHECK(hipMalloc((void**)&fnums, totalnums * sizeof(TestType))); + float* cvt1_dev; + HIP_CHECK(hipMalloc((void**)&cvt1_dev, totalnums * sizeof(float))); + float* cvt2_dev; + HIP_CHECK(hipMalloc((void**)&cvt2_dev, totalnums * sizeof(float))); + + HIP_CHECK(hipMemcpy(fnums, e3m2_ocp_nums.data(), totalnums * sizeof(TestType), + hipMemcpyHostToDevice)); + + auto fp6_kernel = Type_to_fp6_ocp; + fp6_kernel<<>>(fnums, cvt1_dev, cvt2_dev, totalnums); + + float* cvt1_host = (float*)malloc(sizeof(float) * totalnums); + float* cvt2_host = (float*)malloc(sizeof(float) * totalnums); + + HIP_CHECK(hipMemcpy(cvt1_host, cvt1_dev, totalnums * sizeof(float), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(cvt2_host, cvt2_dev, totalnums * sizeof(float), hipMemcpyDeviceToHost)); + + HIP_CHECK(hipDeviceSynchronize()); + + for (size_t idx = 0; idx < totalnums; idx++) { + TestType orig = e3m2_ocp_nums[idx]; + float cvt1 = cvt1_host[idx]; + float cvt2 = cvt2_host[idx]; + + INFO("Original: " << std::bitset<32>(*reinterpret_cast(&orig))); + INFO("Cvt back: " << std::bitset<32>(*reinterpret_cast(&cvt1))); + REQUIRE(cvt1 == Approx(orig)); + REQUIRE(cvt2 == cvt1); + } + + HIP_CHECK(hipFree(fnums)); + HIP_CHECK(hipFree(cvt1_dev)); + HIP_CHECK(hipFree(cvt2_dev)); + free(cvt1_host); + free(cvt2_host); + } +} +/* +Test bfloat and half type convertions on host +*/ +TEST_CASE("Unit_fp6_cvt_bfloat_half_host") { + float f1 = 0.75f; + float2 f2 = {1.0f, 2.0f}; + + SECTION("e2m3_ocp_bfloat") { + auto bf16_val = __float2bfloat16(f1); + __hip_fp6_e2m3 tmp(bf16_val); + __hip_fp6_e2m3 tmp1; + tmp1.__x = __hip_cvt_bfloat16raw_to_fp6(bf16_val, __HIP_E2M3, hipRoundZero); + float bf2_1 = tmp1; + float f1_1 = tmp; + REQUIRE(f1 == bf2_1); + REQUIRE(f1 == f1_1); + } + SECTION("e2m3_ocp_bfloat2") { + auto bf162_val = __float22bfloat162_rn(f2); + __hip_fp6x2_e2m3 tmp(bf162_val); + __hip_fp6x2_e2m3 tmp1; + tmp1.__x = __hip_cvt_bfloat16raw2_to_fp6x2(bf162_val, __HIP_E2M3, hipRoundZero); + float2 bf2_2 = tmp1; + float2 f2_2 = tmp; + REQUIRE(f2 == bf2_2); + REQUIRE(f2 == f2_2); + } + SECTION("e3m2_ocp_bfloat") { + auto bf16_val = __float2bfloat16(f1); + __hip_fp6_e3m2 tmp(bf16_val); + __hip_fp6_e3m2 tmp1; + tmp1.__x = __hip_cvt_bfloat16raw_to_fp6(bf16_val, __HIP_E3M2, hipRoundZero); + float bf2_1 = tmp1; + float f1_1 = tmp; + REQUIRE(f1 == bf2_1); + REQUIRE(f1 == f1_1); + } + SECTION("e3m2_ocp_bfloat2") { + auto bf162_val = __float22bfloat162_rn(f2); + __hip_fp6x2_e3m2 tmp(bf162_val); + __hip_fp6x2_e3m2 tmp1; + tmp1.__x = __hip_cvt_bfloat16raw2_to_fp6x2(bf162_val, __HIP_E3M2, hipRoundZero); + float2 bf2_2 = tmp1; + float2 f2_2 = tmp; + REQUIRE(f2 == bf2_2); + REQUIRE(f2 == f2_2); + } + SECTION("e2m3_ocp_half") { + auto half_val = __float2half(f1); + __hip_fp6_e2m3 tmp(half_val); + __hip_fp6_e2m3 tmp1; + tmp1.__x = __hip_cvt_halfraw_to_fp6(half_val, __HIP_E2M3, hipRoundZero); + float half_1 = tmp1; + float f1_1 = tmp; + REQUIRE(f1 == half_1); + REQUIRE(f1 == f1_1); + } + SECTION("e2m3_ocp_half2") { + auto half2_val = __float22half2_rn(f2); + __hip_fp6x2_e2m3 tmp(half2_val); + __hip_fp6x2_e2m3 tmp1; + tmp1.__x = __hip_cvt_halfraw2_to_fp6x2(half2_val, __HIP_E2M3, hipRoundZero); + float2 h2_2 = tmp1; + float2 f2_2 = tmp; + REQUIRE(f2 == h2_2); + REQUIRE(f2 == f2_2); + } + SECTION("e3m2_ocp_half") { + auto half_val = __float2half(f1); + __hip_fp6_e3m2 tmp(half_val); + __hip_fp6_e3m2 tmp1; + tmp1.__x = __hip_cvt_halfraw_to_fp6(half_val, __HIP_E3M2, hipRoundZero); + float half_1 = tmp1; + float f1_1 = tmp; + REQUIRE(f1 == half_1); + REQUIRE(f1 == f1_1); + } + SECTION("e3m2_ocp_half2") { + auto half2_val = __float22half2_rn(f2); + __hip_fp6x2_e3m2 tmp(half2_val); + __hip_fp6x2_e3m2 tmp1; + tmp1.__x = __hip_cvt_halfraw2_to_fp6x2(half2_val, __HIP_E3M2, hipRoundZero); + float2 h2_2 = tmp1; + float2 f2_2 = tmp; + REQUIRE(f2 == h2_2); + REQUIRE(f2 == f2_2); + } +} + +template +__global__ void Type_to_bfloat_half(T* in, float* cvt1, float* cvt2) { + T val = in[0]; + if constexpr (is_e2m3) { + __hip_fp6_e2m3 tmp(val); + __hip_fp6_e2m3 tmp1; + if constexpr (std::is_same::value) + tmp1.__x = __hip_cvt_bfloat16raw_to_fp6(val, __HIP_E2M3, hipRoundZero); + else + tmp1.__x = __hip_cvt_halfraw_to_fp6(val, __HIP_E2M3, hipRoundZero); + *cvt1 = tmp1; + *cvt2 = tmp; + } else { + __hip_fp6_e3m2 tmp(val); + __hip_fp6_e3m2 tmp1; + if constexpr (std::is_same::value) + tmp1.__x = __hip_cvt_bfloat16raw_to_fp6(val, __HIP_E3M2, hipRoundZero); + else + tmp1.__x = __hip_cvt_halfraw_to_fp6(val, __HIP_E3M2, hipRoundZero); + *cvt1 = tmp1; + *cvt2 = tmp; + } +} + +template +__global__ void Type_to_bfloat2_half2(T* in, float2* cvt1, float2* cvt2) { + T val = in[0]; + if constexpr (is_e2m3) { + __hip_fp6x2_e2m3 tmp(val); + __hip_fp6x2_e2m3 tmp1; + if constexpr (std::is_same::value) + tmp1.__x = __hip_cvt_bfloat16raw2_to_fp6x2(val, __HIP_E2M3, hipRoundZero); + else + tmp1.__x = __hip_cvt_halfraw2_to_fp6x2(val, __HIP_E2M3, hipRoundZero); + *cvt1 = tmp1; + *cvt2 = tmp; + } else { + __hip_fp6x2_e3m2 tmp(val); + __hip_fp6x2_e3m2 tmp1; + if constexpr (std::is_same::value) + tmp1.__x = __hip_cvt_bfloat16raw2_to_fp6x2(val, __HIP_E3M2, hipRoundZero); + else + tmp1.__x = __hip_cvt_halfraw2_to_fp6x2(val, __HIP_E3M2, hipRoundZero); + *cvt1 = tmp1; + *cvt2 = tmp; + } +} + +/* +Test bfloat and half type convertions on device +*/ +TEST_CASE("Unit_fp6_cvt_bfloat_half_device") { + float f1 = 0.75f; + float2 f2 = {1.0f, 2.0f}; + bool is_e2m3 = GENERATE(true, false); + + SECTION("fp6_ocp_bfloat") { + auto bf16_val = __float2bfloat16(f1); + float bf1_1, f1_1; + __hip_bfloat16* d_val; + HIP_CHECK(hipMalloc((void**)&d_val, sizeof(__hip_bfloat16))); + float* d_f1; + HIP_CHECK(hipMalloc((void**)&d_f1, sizeof(float))); + float* d_f2; + HIP_CHECK(hipMalloc((void**)&d_f2, sizeof(float))); + HIP_CHECK(hipMemcpy(d_val, &bf16_val, sizeof(__hip_bfloat16), hipMemcpyHostToDevice)); + auto fp6_kernel = is_e2m3 ? Type_to_bfloat_half + : Type_to_bfloat_half; + fp6_kernel<<<1, 1>>>(d_val, d_f1, d_f2); + + HIP_CHECK(hipMemcpy(&bf1_1, d_f1, sizeof(float), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&f1_1, d_f2, sizeof(float), hipMemcpyDeviceToHost)); + + REQUIRE(f1 == bf1_1); + REQUIRE(f1 == f1_1); + + HIP_CHECK(hipFree(d_val)); + HIP_CHECK(hipFree(d_f1)); + HIP_CHECK(hipFree(d_f2)); + } + SECTION("fp6_ocp_bfloat2") { + auto bf162_val = __float22bfloat162_rn(f2); + float2 bf2_2, f2_2; + + __hip_bfloat162* d_val; + HIP_CHECK(hipMalloc((void**)&d_val, sizeof(__hip_bfloat162))); + float2* d_f1; + HIP_CHECK(hipMalloc((void**)&d_f1, sizeof(float2))); + float2* d_f2; + HIP_CHECK(hipMalloc((void**)&d_f2, sizeof(float2))); + + HIP_CHECK(hipMemcpy(d_val, &bf162_val, sizeof(__hip_bfloat162), hipMemcpyHostToDevice)); + auto fp6_kernel = is_e2m3 ? Type_to_bfloat2_half2 + : Type_to_bfloat2_half2; + fp6_kernel<<<1, 1>>>(d_val, d_f1, d_f2); + + HIP_CHECK(hipMemcpy(&bf2_2, d_f1, sizeof(float2), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&f2_2, d_f2, sizeof(float2), hipMemcpyDeviceToHost)); + + REQUIRE(f2 == bf2_2); + REQUIRE(f2 == f2_2); + HIP_CHECK(hipFree(d_val)); + HIP_CHECK(hipFree(d_f1)); + HIP_CHECK(hipFree(d_f2)); + } + SECTION("fp6_ocp_half") { + auto half_val = __float2half(f1); + float h1_1, f1_1; + __half* d_val; + HIP_CHECK(hipMalloc((void**)&d_val, sizeof(__half))); + float* d_f1; + HIP_CHECK(hipMalloc((void**)&d_f1, sizeof(float))); + float* d_f2; + HIP_CHECK(hipMalloc((void**)&d_f2, sizeof(float))); + + HIP_CHECK(hipMemcpy(d_val, &half_val, sizeof(__half), hipMemcpyHostToDevice)); + auto fp6_kernel = + is_e2m3 ? Type_to_bfloat_half : Type_to_bfloat_half; + fp6_kernel<<<1, 1>>>(d_val, d_f1, d_f2); + + HIP_CHECK(hipMemcpy(&h1_1, d_f1, sizeof(float), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&f1_1, d_f2, sizeof(float), hipMemcpyDeviceToHost)); + + REQUIRE(f1 == h1_1); + REQUIRE(f1 == f1_1); + + HIP_CHECK(hipFree(d_val)); + HIP_CHECK(hipFree(d_f1)); + HIP_CHECK(hipFree(d_f2)); + } + SECTION("fp6_ocp_half2") { + auto half2_val = __float22half2_rn(f2); + float2 h2_2, f2_2; + + __half2* d_val; + HIP_CHECK(hipMalloc((void**)&d_val, sizeof(__half2))); + float2* d_f1; + HIP_CHECK(hipMalloc((void**)&d_f1, sizeof(float2))); + float2* d_f2; + HIP_CHECK(hipMalloc((void**)&d_f2, sizeof(float2))); + + HIP_CHECK(hipMemcpy(d_val, &half2_val, sizeof(__half2), hipMemcpyHostToDevice)); + auto fp6_kernel = + is_e2m3 ? Type_to_bfloat2_half2 : Type_to_bfloat2_half2; + fp6_kernel<<<1, 1>>>(d_val, d_f1, d_f2); + + HIP_CHECK(hipMemcpy(&h2_2, d_f1, sizeof(float2), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&f2_2, d_f2, sizeof(float2), hipMemcpyDeviceToHost)); + + REQUIRE(f2 == h2_2); + REQUIRE(f2 == f2_2); + HIP_CHECK(hipFree(d_val)); + HIP_CHECK(hipFree(d_f1)); + HIP_CHECK(hipFree(d_f2)); + } +} \ No newline at end of file