diff --git a/catch/unit/deviceLib/CMakeLists.txt b/catch/unit/deviceLib/CMakeLists.txt index 3a6534724..d79896a2b 100644 --- a/catch/unit/deviceLib/CMakeLists.txt +++ b/catch/unit/deviceLib/CMakeLists.txt @@ -90,6 +90,8 @@ set(AMD_TEST_SRC AtomicsWithRandomActiveLanesInWavefront.cc fp16_ops.cc fp8_host.cc + fp6_ocp.cc + fp4_ocp.cc ) set(AMD_ARCH_SPEC_TEST_SRC diff --git a/catch/unit/deviceLib/fp4_ocp.cc b/catch/unit/deviceLib/fp4_ocp.cc new file mode 100644 index 000000000..3f325f2ca --- /dev/null +++ b/catch/unit/deviceLib/fp4_ocp.cc @@ -0,0 +1,449 @@ +/* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +#include + +#include + +template +static __global__ void lambda_kernel_launch(Lambda l, Type... args) { + l(args...); +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given double type data to FP4 type with E2M1 + * format. + * Test source + * ------------------------ + * - /unit/deviceLib/fp4_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEST_CASE("Unit_all_fp4_from_double") { + SECTION("sanityx1") { + std::vector inputs{-1.0, 0.0, 1.0}; + for (const auto input : inputs) { + __hip_fp4_e2m1 fp4(input); + double ret = fp4; + INFO("Original: " << input << " Return: " << ret); + REQUIRE(ret == input); + } + } + SECTION("sanityx2") { + std::vector inputs{{-1.0, 1.0}, {-2.0, 2.0}}; + for (const auto input : inputs) { + __hip_fp4x2_e2m1 fp4x2(input); + double2 ret = fp4x2; + INFO("Original: " << input.x << " Return: " << ret.x); + INFO("Original: " << input.y << " Return: " << ret.y); + + REQUIRE(ret.x == input.x); + REQUIRE(ret.y == input.y); + } + } + SECTION("sanityx4") { + std::vector inputs{ + {-1.0, 0.5, 1.5, 1.0}, {-2.0, 0.5, 1.5, 2.0}, {-3.0, 0.5, 1.5, 3.0}}; + for (const auto &input : inputs) { + __hip_fp4x4_e2m1 fp4x4(input); + double4 ret = fp4x4; + INFO("Original: " << input.x << ", " << input.y << ", " << input.z << ", " + << input.w << " Return: " << ret.x << ", " << ret.y + << ret.z << ", " << ret.w); + REQUIRE(ret.x == input.x); + REQUIRE(ret.y == input.y); + REQUIRE(ret.z == input.z); + REQUIRE(ret.w == input.w); + } + } +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given double data to FP4 type with E2M1 + * format in the device. + * Test source + * ------------------------ + * - /unit/deviceLib/fp4_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEST_CASE("Unit_all_fp4_from_double_device") { + SECTION("sanityx1") { + auto fp4x1_l = [] __device__(double* inputs, float* outputs, size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs{-1.0, 0.0, 1.0}; + double *d_in; + float *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(double) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(double) * inputs.size(), + hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x1_l, d_in, d_out, inputs.size()); + std::vector outputs(inputs.size(), 0.0f); + HIP_CHECK(hipMemcpy(outputs.data(), d_out, sizeof(float) * inputs.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Original: " << inputs[i] << " Output: " << outputs[i]); + REQUIRE(inputs[i] == outputs[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("sanityx2") { + auto fp4x2_l = [] __device__(double2 * inputs, float2 * outputs, + size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4x2_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs{ + {-1.0, 0.0}, {0.0, 1.0}, {1.0, -1.0}, {1.0, 0.0}, {0.0, -1.0}}; + double2 *d_in; + float2 *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(double2) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float2) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(double2) * inputs.size(), + hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x2_l, d_in, d_out, inputs.size()); + std::vector outputs(inputs.size()); + HIP_CHECK(hipMemcpy(outputs.data(), d_out, sizeof(float2) * inputs.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Original: " << inputs[i].x << ", " << inputs[i].y << " Output: " + << outputs[i].x << ", " << outputs[i].y); + REQUIRE(inputs[i].x == outputs[i].x); + REQUIRE(inputs[i].y == outputs[i].y); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("sanityx4") { + auto fp4x4_l = [] __device__(double4 * inputs, float4 * outputs, + size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4x4_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs{ + {-1.0, 0.0, 1.0, 0.5}, {0.0, 1.0, -0.5, -1.0}, {1.0, 0.0, 1.0, -1.0}}; + double4 *d_in; + float4 *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(double4) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float4) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(double4) * inputs.size(), + hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x4_l, d_in, d_out, inputs.size()); + std::vector outputs(inputs.size()); + HIP_CHECK(hipMemcpy(outputs.data(), d_out, sizeof(float4) * inputs.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + INFO("Original: " << inputs[i].x << ", " << inputs[i].y << ", " + << inputs[i].z << ", " << inputs[i].w << " Output: " + << outputs[i].x << ", " << outputs[i].y + << ", " << outputs[i].z << ", " << outputs[i].w); + REQUIRE(inputs[i].x == outputs[i].x); + REQUIRE(inputs[i].y == outputs[i].y); + REQUIRE(inputs[i].z == outputs[i].z); + REQUIRE(inputs[i].w == outputs[i].w); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given signed interger data to FP4 type with E2M1 + * format. + * Test source + * ------------------------ + * - /unit/deviceLib/fp4_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEMPLATE_TEST_CASE("Unit_all_fp4_from_interger_data", "", int, long int, + long long int, short int) { + SECTION("Fp4 with e2m1") { + std::vector input{-1, 0, 1}; + for (const auto val : input) { + __hip_fp4_e2m1 fp4(val); + float ret = fp4; + INFO("In: " << val); + INFO("Out: " << ret); + REQUIRE(ret == val); + } + } +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given unsigned integer data to FP4 type with E2M1 + * format. + * Test source + * ------------------------ + * - /unit/deviceLib/fp4_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEMPLATE_TEST_CASE("Unit_all_fp4_from__unsigned_integer_data", "", unsigned int, + unsigned long int, unsigned long long int, + unsigned short int) { + SECTION("Fp4 with e2m1") { + std::vector input{1, 2, 3}; + for (const auto val : input) { + __hip_fp4_e2m1 fp4(val); + float ret = fp4; + INFO("In: " << val); + INFO("Out: " << ret); + REQUIRE(ret == val); + } + } +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given signed interger data to FP4 type in device with E2M1 + * format. + * Test source + * ------------------------ + * - /unit/deviceLib/fp4_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ + +TEMPLATE_TEST_CASE("Unit_all_fp4_from_integer_data_device", "", int, + long int, long long int, + short int) { + std::vector all_fp4{-6.0f, -4.0f, -3.0f, -2.0f, -1.5f, + -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, + 1.5f, 2.0f, 3.0f, 4.0f, 6.0f}; + auto fp4x1_l = [] __device__(TestType *inputs, float *outputs, size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs; + inputs.reserve(30); + for (int i = 0; i <= 6; i += 1) { + inputs.push_back(i); + } + TestType *d_in; + float *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(TestType) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(TestType) * inputs.size(), + hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x1_l, d_in, d_out, inputs.size()); + + std::vector outputs(inputs.size(), 0.0f); + HIP_CHECK(hipMemcpy(outputs.data(), d_out, sizeof(float) * inputs.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + auto lbound = std::lower_bound(all_fp4.begin(), all_fp4.end(), outputs[i]); + INFO("Original: " << inputs[i] << " Output: " << *lbound); + REQUIRE(*lbound == outputs[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given unsigned interger data to FP4 type in device with E2M1 + * format. + * Test source + * ------------------------ + * - /unit/deviceLib/fp4_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEMPLATE_TEST_CASE("Unit_all_fp4_from__unsigned_integer_data_device", "", + unsigned int, unsigned long int, unsigned long long int, + unsigned short int) { + std::vector all_fp4{-6.0f, -4.0f, -3.0f, -2.0f, -1.5f, + -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, + 1.5f, 2.0f, 3.0f, 4.0f, 6.0f}; + auto fp4x1_l = [] __device__(TestType *inputs, float *outputs, size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs; + inputs.reserve(30); + for (int i = -6; i <= 6; i += 1) { + inputs.push_back(i); + } + TestType *d_in; + float *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(TestType) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(TestType) * inputs.size(), + hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x1_l, d_in, d_out, inputs.size()); + + std::vector outputs(inputs.size(), 0.0f); + HIP_CHECK(hipMemcpy(outputs.data(), d_out, sizeof(float) * inputs.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + auto lbound = std::lower_bound(all_fp4.begin(), all_fp4.end(), outputs[i]); + INFO("Original: " << inputs[i] << " Output: " << *lbound); + REQUIRE(*lbound == outputs[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given double type data to FP4 type with E2M3 and + * E3M2 formats. + * Test source + * ------------------------ + * - /unit/deviceLib/fp4_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEST_CASE("Unit_ocp_fp4_from_double_full_range_host") { + std::vector in; + in.reserve(30); + for (double i = -6.0; i <= 6.0; i += 0.5) { + in.push_back(i); + } + + std::vector expected{-6.0f, -6.0f, -4.0f, -4.0f, -4.0f, -4.0f, -3.0f, + -2.0f, -2.0f, -1.5f, -1.0f, -0.5f, 0.0f, 0.5f, + 1.0f, 1.5f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, + 4.0f, 4.0f, 6.0f, 6.0f}; + + for (size_t i = 0; i < in.size(); i++) { + __hip_fp4_e2m1 fp4(in[i]); + float fp32 = fp4; + INFO("Original: " << in[i] << " Output: " << fp32 + << " Expected: " << expected[i]); + REQUIRE(expected[i] == fp32); + } +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given double type data to FP4 type in device with + * E2M3 and E3M2 formats. + * Test source + * ------------------------ + * - /unit/deviceLib/fp4_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ + +TEST_CASE("Unit_ocp_fp4_from_double_full_range_device") { + std::vector all_fp4{-6.0f, -4.0f, -3.0f, -2.0f, -1.5f, + -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, + 1.5f, 2.0f, 3.0f, 4.0f, 6.0f}; + auto fp4x1_l = [] __device__(double *inputs, float *outputs, size_t size) { + int i = threadIdx.x; + if (i < size) { + __hip_fp4_e2m1 fp4(inputs[i]); + outputs[i] = fp4; + } + }; + + std::vector inputs; + inputs.reserve(30); + for (double i = -6.0; i <= 6.0; i += 0.5) { + inputs.push_back(i); + } + double *d_in; + float *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(double) * inputs.size())); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * inputs.size())); + + HIP_CHECK(hipMemcpy(d_in, inputs.data(), sizeof(double) * inputs.size(), + hipMemcpyHostToDevice)); + lambda_kernel_launch<<<1, 32>>>(fp4x1_l, d_in, d_out, inputs.size()); + + std::vector outputs(inputs.size(), 0.0f); + HIP_CHECK(hipMemcpy(outputs.data(), d_out, sizeof(float) * inputs.size(), + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < inputs.size(); i++) { + auto lbound = std::lower_bound(all_fp4.begin(), all_fp4.end(), outputs[i]); + INFO("Original: " << inputs[i] << " Output: " << *lbound); + REQUIRE(*lbound == outputs[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); +} diff --git a/catch/unit/deviceLib/fp6_ocp.cc b/catch/unit/deviceLib/fp6_ocp.cc new file mode 100644 index 000000000..428acc58c --- /dev/null +++ b/catch/unit/deviceLib/fp6_ocp.cc @@ -0,0 +1,331 @@ +/* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +#include +#include +#include + +std::vector<__hip_fp6_storage_t> get_all_fp6_ocp_nums() { + std::vector<__hip_fp6_storage_t> ret; + constexpr unsigned short max_fp6_num = 0b0011'1111; + ret.reserve(max_fp6_num + 1); + + for (unsigned short i = 0; i <= max_fp6_num; i++) { + ret.push_back(static_cast<__hip_fp6_storage_t>(i)); + } + return ret; +} + +template +__global__ void Type_to_fp6(T* f, __hip_fp6_storage_t* res, size_t size) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + if constexpr (is_e2m3) { + __hip_fp6_e2m3 tmp(f[i]); + res[i] = tmp.__x; + } else { + __hip_fp6_e3m2 tmp(f[i]); + res[i] = tmp.__x; + } + } +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given interger values to FP6 type in the host + * with E2M3 and E3M2 formats. + * Test source + * ------------------------ + * - /unit/deviceLib/fp6_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEMPLATE_TEST_CASE("Unit_all_fp6_ocp_vector_cvt_interger_data", "", int, + long int, long long int, short int) { + SECTION("Fp6 with e2m3") { + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7, -0, + -1, -2, -3, -4, -5, -6, -7 }; + for (const auto val : input) { + __hip_fp6_e2m3 fp6(val); + float ret = fp6; + INFO("In: " << val); + INFO("Out: " << ret); + REQUIRE(ret == val); + } + } + SECTION("Fp6 with e3m2 ") { + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, + 16, 20, 24, 28, -0, -1, -2, -3, -4, -5, -6, + -7, -8, -10, -12, -14, -16, -20, -24, -28}; + for (const auto val : input) { + __hip_fp6_e3m2 fp6(val); + float ret = fp6; + INFO("In: " << val); + INFO("Out: " << ret); + REQUIRE(ret == val); + } + } +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given unsigned interger values to FP6 type in the host + * with E2M3 and E3M2 formats. + * Test source + * ------------------------ + * - /unit/deviceLib/fp6_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEMPLATE_TEST_CASE("Unit_all_fp6_ocp_vector_cvt_unsigned_interger_data", "", + int, long int, long long int, short int) { + SECTION("Fp6 with e2m3") { + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7}; + for (const auto val : input) { + __hip_fp6_e2m3 fp6(val); + float ret = fp6; + INFO("In: " << val); + INFO("Out: " << ret); + REQUIRE(ret == val); + } + } + SECTION("Fp6 with e3m2 ") { + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, + 16, 20, 24, 28}; + for (const auto val : input) { + __hip_fp6_e3m2 fp6(val); + float ret = fp6; + INFO("In: " << val); + INFO("Out: " << ret); + REQUIRE(ret == val); + } + } +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given unsigned interger values to FP6 type in the device + * with E2M3 and E3M2 formats. + * Test source + * ------------------------ + * - /unit/deviceLib/fp6_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ +TEMPLATE_TEST_CASE("Unit_all_fp6_ocp_vector_cvt_unsigned_integer_device", "", + unsigned int, unsigned long int, unsigned long long int, + unsigned short int) { + bool is_e2m3 = GENERATE(true, false); + std::vector f_vals; + std::vector<__hip_fp6_storage_t> all_vals; + constexpr TestType lhs = 0; + constexpr TestType rhs = 64; + constexpr TestType step = 1; + + f_vals.reserve(500); + all_vals.reserve(500); + + for (TestType fval = lhs; fval <= rhs; fval += step) { + if (is_e2m3) { + __hip_fp6_e2m3 tmp(fval); + all_vals.push_back(tmp.__x); + } else { + __hip_fp6_e3m2 tmp(fval); + all_vals.push_back(tmp.__x); + } + f_vals.push_back(fval); + } + + TestType* d_f_vals; + __hip_fp6_storage_t* d_res; + + HIP_CHECK(hipMalloc(&d_f_vals, sizeof(TestType) * f_vals.size())); + HIP_CHECK(hipMalloc(&d_res, sizeof(__hip_fp6_storage_t) * f_vals.size())); + + HIP_CHECK(hipMemcpy(d_f_vals, f_vals.data(), sizeof(TestType) * + f_vals.size(), hipMemcpyHostToDevice)); + + auto fp6_kernel = is_e2m3 ? Type_to_fp6 : + Type_to_fp6; + fp6_kernel<<<(f_vals.size() / 64) + 1, 64>>>(d_f_vals, d_res, f_vals.size()); + + std::vector<__hip_fp6_storage_t> + final_res(f_vals.size(), static_cast<__hip_fp6_storage_t>(0)); + + HIP_CHECK(hipMemcpy(final_res.data(), d_res, + sizeof(__hip_fp6_storage_t) * final_res.size(), hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < final_res.size(); i++) { + INFO("Checking: " << f_vals[i] << " for: " << (is_e2m3 ? "e2m3" : "e3m2") + << " original: " << (int)all_vals[i] + << " convert back: " << (int)final_res[i] << " Idx : " + << i); + TestType gpu_cvt_res = 0, cpu_cvt_res = 0; + float gpu_cvt_res_ = 0.0, cpu_cvt_res_ = 0.0; + if (is_e2m3) { + __hip_fp6_e2m3 gtmp; + gtmp.__x = final_res[i]; + gpu_cvt_res_ = gtmp; + gpu_cvt_res = gpu_cvt_res_; + __hip_fp6_e2m3 ctmp; + ctmp.__x = all_vals[i]; + cpu_cvt_res_ = ctmp; + cpu_cvt_res = cpu_cvt_res_; + } else { + __hip_fp6_e3m2 gtmp; + gtmp.__x = final_res[i]; + gpu_cvt_res_ = gtmp; + gpu_cvt_res = gpu_cvt_res_; + __hip_fp6_e3m2 ctmp; + ctmp.__x = all_vals[i]; + cpu_cvt_res_ = ctmp; + cpu_cvt_res = cpu_cvt_res_; + } + INFO("cpu cvt val: " << cpu_cvt_res << " gpu cvt val: " << gpu_cvt_res); + REQUIRE(cpu_cvt_res == gpu_cvt_res); + } + + HIP_CHECK(hipFree(d_f_vals)); + HIP_CHECK(hipFree(d_res)); +} + +/** + * Test Description + * ------------------------ + * - Basic test to convert given interger values to FP6 type in device with + * E2M3 and E3M2 formats. + * Test source + * ------------------------ + * - /unit/deviceLib/fp6_ocp.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.5 + */ + +TEMPLATE_TEST_CASE("Unit_all_fp6_ocp_vector_cvt_interger_data_device", "", int, + long int, long long int, short int) { + bool is_e2m3 = GENERATE(true, false); + std::vector f_vals; + std::vector<__hip_fp6_storage_t> all_vals; + SECTION("all representable numbers") { + all_vals = get_all_fp6_ocp_nums(); + f_vals.reserve(all_vals.size()); + + for (const auto& fp6 : all_vals) { + TestType f = 0; + float f_ = 0.0; + if (is_e2m3) { + __hip_fp6_e2m3 tmp; + tmp.__x = fp6; + f_ = tmp; + f = f_; + } else { + __hip_fp6_e3m2 tmp; + tmp.__x = fp6; + f_ = tmp; + f = f_; + } + f_vals.push_back(f); + } + } + SECTION("Range stepped numbers") { + constexpr TestType lhs = -30; + constexpr TestType rhs = 30; + constexpr TestType step = 1; + + f_vals.reserve(500); + all_vals.reserve(500); + + for (TestType fval = lhs; fval <= rhs; fval += step) { + if (is_e2m3) { + __hip_fp6_e2m3 tmp(fval); + all_vals.push_back(tmp.__x); + } else { + __hip_fp6_e3m2 tmp(fval); + all_vals.push_back(tmp.__x); + } + f_vals.push_back(fval); + } + } + + TestType* d_f_vals; + __hip_fp6_storage_t* d_res; + + HIP_CHECK(hipMalloc(&d_f_vals, sizeof(TestType) * f_vals.size())); + HIP_CHECK(hipMalloc(&d_res, sizeof(__hip_fp6_storage_t) * f_vals.size())); + + HIP_CHECK(hipMemcpy(d_f_vals, f_vals.data(), + sizeof(TestType) *f_vals.size(), hipMemcpyHostToDevice)); + + auto fp6_kernel = is_e2m3 ? Type_to_fp6 : + Type_to_fp6; + fp6_kernel<<<(f_vals.size() / 64) + 1, 64>>>(d_f_vals, d_res, f_vals.size()); + + std::vector<__hip_fp6_storage_t> final_res(f_vals.size(), + static_cast<__hip_fp6_storage_t>(0)); + + HIP_CHECK(hipMemcpy(final_res.data(), d_res, sizeof(__hip_fp6_storage_t) * + final_res.size(), hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < final_res.size(); i++) { + INFO("Checking: " << f_vals[i] << " for: " << (is_e2m3 ? "e2m3" : "e3m2") + << " original: " << (int)all_vals[i] + << " convert back: " << (int)final_res[i] << " Idx : " + << i); + TestType gpu_cvt_res = 0.0f, cpu_cvt_res = 0.0f; + float gpu_cvt_res_ = 0.0, cpu_cvt_res_ = 0.0; + if (is_e2m3) { + __hip_fp6_e2m3 gtmp; + gtmp.__x = final_res[i]; + gpu_cvt_res_ = gtmp; + gpu_cvt_res = gpu_cvt_res_; + __hip_fp6_e2m3 ctmp; + ctmp.__x = all_vals[i]; + cpu_cvt_res_ = ctmp; + cpu_cvt_res = cpu_cvt_res_; + } else { + __hip_fp6_e3m2 gtmp; + gtmp.__x = final_res[i]; + gpu_cvt_res_ = gtmp; + gpu_cvt_res = gpu_cvt_res_; + __hip_fp6_e3m2 ctmp; + ctmp.__x = all_vals[i]; + cpu_cvt_res_ = ctmp; + cpu_cvt_res = cpu_cvt_res_; + } + + INFO("cpu cvt val: " << cpu_cvt_res << " gpu cvt val: " << gpu_cvt_res); + REQUIRE(cpu_cvt_res == gpu_cvt_res); + } + + HIP_CHECK(hipFree(d_f_vals)); + HIP_CHECK(hipFree(d_res)); +} +