diff --git a/catch/unit/deviceLib/CMakeLists.txt b/catch/unit/deviceLib/CMakeLists.txt
index 1ad4dba46..35ccfe090 100644
--- a/catch/unit/deviceLib/CMakeLists.txt
+++ b/catch/unit/deviceLib/CMakeLists.txt
@@ -87,6 +87,9 @@ set(AMD_TEST_SRC
   fp8_host.cc
   fp6_ocp.cc
   fp4_ocp.cc
+  ext_ocp.cc
+  ext_extract.cc
+  ext_cxx.cc
 )
 
 set(AMD_ARCH_SPEC_TEST_SRC
diff --git a/catch/unit/deviceLib/ext_cxx.cc b/catch/unit/deviceLib/ext_cxx.cc
new file mode 100644
index 000000000..57108bfc4
--- /dev/null
+++ b/catch/unit/deviceLib/ext_cxx.cc
@@ -0,0 +1,1022 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <cstddef>
+#include <iostream>
+#include <vector>
+
+#include <hip_test_common.hh>
+#include <hip/hip_ext_ocp.h>  // assumed header name for the __hipext_ocp_*/__amd_* extension types
+
+
+// CXX
+
+__global__ void cxx_fp8_e4m3_device_cvt(float* in, float* out, size_t size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8_e4m3 tmp(in[i]);
+    out[i] = tmp;
+  }
+}
+
+__global__ void cxx_fp8_sr_e4m3_device_cvt(float* in, float* out, unsigned int seed, size_t size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8_e4m3 tmp(in[i], seed);
+    out[i] = tmp;
+  }
+}
+
+__global__ void cxx_fp8_e5m2_device_cvt(float* in, float* out, size_t size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8_e5m2 tmp(in[i]);
+    out[i] = tmp;
+  }
+}
+
+__global__ void cxx_fp8_sr_e5m2_device_cvt(float* in, float* out, unsigned int seed, size_t size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8_e5m2 tmp(in[i], seed);
+    out[i] = tmp;
+  }
+}
+
+__global__ void cxx_fp8x2_e4m3_device_cvt(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out,
+                                          size_t size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8x2_e4m3 tmp(in[i]);
+    out[i] = tmp;
+  }
+}
+
+__global__ void cxx_fp8x2_e5m2_device_cvt(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out,
+                                          size_t size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8x2_e5m2 tmp(in[i]);
+    out[i] = tmp;
+  }
+}
+
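+// The host/device parity tests below rely on the fp8 conversions being
+// deterministic under round-to-nearest. A worked example of the expected
+// e4m3 encoding (a sketch from the OCP layout with exponent bias 7, not
+// something this file asserts): 20.0f = 1.25 * 2^4 encodes as sign 0,
+// exponent 0b1011, mantissa 0b010 (byte 0x5A) and decodes back to exactly
+// 20.0f:
+//
+//   __hipext_ocp_fp8_e4m3 v(20.0f);
+//   float back = v;  // == 20.0f, no rounding error
+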
+TEST_CASE("Unit_ocp_cxx_fp8_host_conv") {
+  SECTION("e4m3") {
+    constexpr size_t size = 449 * 2 + 1;
+    std::vector<float> in;
+    in.reserve(size);
+    for (int i = -449; i <= 449; i++) {
+      in.push_back(static_cast<float>(i));
+    }
+    REQUIRE(in.size() == size);
+    float *d_in, *d_out;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
+    HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
+    cxx_fp8_e4m3_device_cvt<<<1, size>>>(d_in, d_out, size);
+    // CPU calc
+    std::vector<float> cpu_res(size, 0.0f);
+    for (size_t i = 0; i < size; i++) {
+      __hipext_ocp_fp8_e4m3 tmp(in[i]);
+      cpu_res[i] = tmp;
+    }
+    std::vector<float> gpu_res(size, 0.0f);
+    HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(float) * size, hipMemcpyDeviceToHost));
+    for (size_t i = 0; i < size; i++) {
+      INFO("Index: " << i << " in: " << in[i] << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]);
+      REQUIRE(cpu_res[i] == gpu_res[i]);
+    }
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+  }
+
+  SECTION("e4m3-sr") {
+    constexpr size_t size = 449 * 2 + 1;
+    constexpr unsigned int seed = 10;
+    std::vector<float> in;
+    in.reserve(size);
+    for (int i = -449; i <= 449; i++) {
+      in.push_back(static_cast<float>(i));
+    }
+    REQUIRE(in.size() == size);
+    float *d_in, *d_out;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
+    HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
+    cxx_fp8_sr_e4m3_device_cvt<<<1, size>>>(d_in, d_out, seed, size);
+    // CPU calc
+    std::vector<float> cpu_res(size, 0.0f);
+    for (size_t i = 0; i < size; i++) {
+      __hipext_ocp_fp8_e4m3 tmp(in[i], seed);
+      cpu_res[i] = tmp;
+    }
+    std::vector<float> gpu_res(size, 0.0f);
+    HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(float) * size, hipMemcpyDeviceToHost));
+    for (size_t i = 0; i < size; i++) {
+      INFO("Index: " << i << " in: " << in[i] << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]);
+      REQUIRE(cpu_res[i] == gpu_res[i]);
+    }
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+  }
+
+  SECTION("e5m2") {
+    constexpr size_t size = 511 * 2 + 1;
+    std::vector<float> in;
+    in.reserve(size);
+    for (int i = -511; i <= 511; i++) {
+      in.push_back(static_cast<float>(i));
+    }
+    REQUIRE(in.size() == size);
+    float *d_in, *d_out;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
+    HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
+    cxx_fp8_e5m2_device_cvt<<<1, size>>>(d_in, d_out, size);
+    // CPU calc
+    std::vector<float> cpu_res(size, 0.0f);
+    for (size_t i = 0; i < size; i++) {
+      __hipext_ocp_fp8_e5m2 tmp(in[i]);
+      cpu_res[i] = tmp;
+    }
+    std::vector<float> gpu_res(size, 0.0f);
+    HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(float) * size, hipMemcpyDeviceToHost));
+    for (size_t i = 0; i < size; i++) {
+      INFO("Index: " << i << " in: " << in[i] << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]);
+      CHECK(cpu_res[i] == gpu_res[i]);
+    }
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+  }
+
+  SECTION("e5m2-sr") {
+    constexpr size_t size = 511 * 2 + 1;
+    constexpr unsigned int seed = 10;
+    std::vector<float> in;
+    in.reserve(size);
+    for (int i = -511; i <= 511; i++) {
+      in.push_back(static_cast<float>(i));
+    }
+    REQUIRE(in.size() == size);
+    float *d_in, *d_out;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
+    HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
+    cxx_fp8_sr_e5m2_device_cvt<<<1, size>>>(d_in, d_out, seed, size);
+    // CPU calc
+    std::vector<float> cpu_res(size, 0.0f);
+    for (size_t i = 0; i < size; i++) {
+      __hipext_ocp_fp8_e5m2 tmp(in[i], seed);
+      cpu_res[i] = tmp;
+    }
+    std::vector<float> gpu_res(size, 0.0f);
+    HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(float) * size, hipMemcpyDeviceToHost));
+    for (size_t i = 0; i < size; i++) {
+      INFO("Index: " << i << " in: " << in[i] << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]);
+      CHECK(cpu_res[i] == gpu_res[i]);
+    }
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+  }
+
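+  // Note on the ranges used above: +/-449 is one past e4m3's largest finite
+  // value of 448, so the scalar sections also compare the saturation path
+  // between host and device; the packed e4m3x2 section below stays within
+  // +/-448.
+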
+  SECTION("e4m3x2") {
+    constexpr size_t size = 448 * 2 + 1;
+    std::vector<__amd_floatx2_storage_t> in;
+    in.reserve(size);
+    for (int i = -448, j = 448; i <= 448; i++, j--) {
+      __amd_floatx2_storage_t tmp{static_cast<float>(i), static_cast<float>(j)};
+      in.push_back(tmp);
+    }
+    REQUIRE(in.size() == size);
+    __amd_floatx2_storage_t *d_in, *d_out;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t) * size));
+    HIP_CHECK(
+        hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * size, hipMemcpyHostToDevice));
+    cxx_fp8x2_e4m3_device_cvt<<<1, size>>>(d_in, d_out, size);
+    // CPU calc
+    std::vector<__amd_floatx2_storage_t> cpu_res(size, 0.0f);
+    for (size_t i = 0; i < size; i++) {
+      __hipext_ocp_fp8x2_e4m3 tmp(in[i]);
+      cpu_res[i] = tmp;
+    }
+    std::vector<__amd_floatx2_storage_t> gpu_res(size, 0.0f);
+    HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx2_storage_t) * size,
+                        hipMemcpyDeviceToHost));
+    for (size_t i = 0; i < size; i++) {
+      INFO("Index: " << i << "\n\tin: a: " << in[i][0] << " b: " << in[i][1]
+                     << "\n\tcpu: a: " << cpu_res[i][0] << " b: " << cpu_res[i][1]
+                     << "\n\tgpu: a: " << gpu_res[i][0] << " b: " << gpu_res[i][1]);
+      REQUIRE(cpu_res[i][0] == gpu_res[i][0]);
+      REQUIRE(cpu_res[i][1] == gpu_res[i][1]);
+    }
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+  }
+
+  SECTION("e5m2x2") {
+    constexpr size_t size = 511 * 2 + 1;
+    std::vector<__amd_floatx2_storage_t> in;
+    in.reserve(size);
+    for (int i = -511, j = 511; i <= 511; i++, j--) {
+      __amd_floatx2_storage_t tmp{static_cast<float>(i), static_cast<float>(j)};
+      in.push_back(tmp);
+    }
+    REQUIRE(in.size() == size);
+    __amd_floatx2_storage_t *d_in, *d_out;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t) * size));
+    HIP_CHECK(
+        hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * size, hipMemcpyHostToDevice));
+    cxx_fp8x2_e5m2_device_cvt<<<1, size>>>(d_in, d_out, size);
+    // CPU calc
+    std::vector<__amd_floatx2_storage_t> cpu_res(size, 0.0f);
+    for (size_t i = 0; i < size; i++) {
+      __hipext_ocp_fp8x2_e5m2 tmp(in[i]);
+      cpu_res[i] = tmp;
+    }
+    std::vector<__amd_floatx2_storage_t> gpu_res(size, 0.0f);
+    HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx2_storage_t) * size,
+                        hipMemcpyDeviceToHost));
+    for (size_t i = 0; i < size; i++) {
+      INFO("Index: " << i << "\n\tin: a: " << in[i][0] << " b: " << in[i][1]
+                     << "\n\tcpu: a: " << cpu_res[i][0] << " b: " << cpu_res[i][1]
+                     << "\n\tgpu: a: " << gpu_res[i][0] << " b: " << gpu_res[i][1]);
+      REQUIRE(cpu_res[i][0] == gpu_res[i][0]);
+      REQUIRE(cpu_res[i][1] == gpu_res[i][1]);
+    }
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+  }
+}
+
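+// The packed x2 wrappers quantize and decode both lanes in one call. A
+// minimal host-side usage sketch, using only constructions exercised above
+// (both inputs are exactly representable in e4m3):
+//
+//   __amd_floatx2_storage_t v{-1.5f, 2.0f};
+//   __hipext_ocp_fp8x2_e4m3 q(v);      // quantize both lanes
+//   __amd_floatx2_storage_t back = q;  // decode both lanes
+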
+__global__ void fp8x2_e4m3_cxx_convert_fp32(__amd_floatx2_storage_t* in,
+                                            __amd_floatx2_storage_t* out, size_t size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8x2_e4m3 tmp(in[i]);
+    out[i] = tmp;
+  }
+}
+
+__global__ void fp8x2_e4m3_cxx_convert_fp16(__amd_fp16x2_storage_t* in, __amd_fp16x2_storage_t* out,
+                                            size_t size, __amd_fp8x2_storage_t* t_out = nullptr) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8x2_e4m3 tmp(in[i], 0);
+    if (t_out != nullptr) {
+      t_out[i] = tmp.__x;
+    }
+    out[i] = tmp.get_scaled_fp16x2(0);
+  }
+}
+
+__global__ void fp8x2_e4m3_cxx_convert_bf16(__amd_bf16x2_storage_t* in, __amd_bf16x2_storage_t* out,
+                                            size_t size, __amd_fp8x2_storage_t* t_out = nullptr) {
+  int i = threadIdx.x;
+  if (i < size) {
+    __hipext_ocp_fp8x2_e4m3 tmp(in[i], 0);
+    if (t_out != nullptr) {
+      t_out[i] = tmp.__x;
+    }
+    out[i] = tmp.get_scaled_bf16x2(0);
+  }
+}
+
+TEST_CASE("Unit_ocp_fp8x2_host_device") {
+  SECTION("floatx2 to e4m3") {
+    __amd_floatx2_storage_t in{-10.0f, 10.0f}, *d_in, *d_out, out{0.0f, 0.0f};
+    HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t)));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t)));
+    HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_floatx2_storage_t), hipMemcpyHostToDevice));
+    fp8x2_e4m3_cxx_convert_fp32<<<1, 32>>>(d_in, d_out, 1);
+    HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+    __hipext_ocp_fp8x2_e4m3 tmp(in);
+    __amd_floatx2_storage_t cpu_out = tmp;
+    INFO(" In : " << in[0] << ", " << in[1]);
+    INFO("GPU Out: " << out[0] << ", " << out[1]);
+    INFO("CPU Out: " << cpu_out[0] << ", " << cpu_out[1]);
+    REQUIRE(out[0] == cpu_out[0]);
+    REQUIRE(out[1] == cpu_out[1]);
+  }
+
+  SECTION("fp16x2 to e4m3") {
+    __amd_fp16x2_storage_t in{-10.0f, 10.0f}, *d_in, *d_out, out{0.0f, 0.0f};
+    __amd_fp8x2_storage_t *t_storage, gt_storage;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x2_storage_t)));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x2_storage_t)));
+    HIP_CHECK(hipMalloc(&t_storage, sizeof(__amd_fp8x2_storage_t)));
+    HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_fp16x2_storage_t), hipMemcpyHostToDevice));
+    fp8x2_e4m3_cxx_convert_fp16<<<1, 32>>>(d_in, d_out, 1, t_storage);
+    HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_fp16x2_storage_t), hipMemcpyDeviceToHost));
+    HIP_CHECK(
+        hipMemcpy(&gt_storage, t_storage, sizeof(__amd_fp8x2_storage_t), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+    HIP_CHECK(hipFree(t_storage));
+    __hipext_ocp_fp8x2_e4m3 tmp(in, 0);
+    __amd_fp16x2_storage_t cpu_out = tmp.get_scaled_fp16x2(0);
+    INFO(" In : " << float(in[0]) << ", " << float(in[1]));
+    INFO("GPU Out: " << float(out[0]) << ", " << float(out[1]));
+    INFO("CPU Out: " << float(cpu_out[0]) << ", " << float(cpu_out[1]));
+    INFO("gpu tmp: " << std::hex << unsigned(gt_storage) << ", cpu: " << unsigned(tmp.__x));
+    REQUIRE(out[0] == cpu_out[0]);
+    REQUIRE(out[1] == cpu_out[1]);
+  }
+
+  // SECTION("bf16x2 to e4m3") {
+  //   __amd_bf16x2_storage_t in{-10.0f, 10.0f}, *d_in, *d_out, out{0.0f, 0.0f};
+  //   HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x2_storage_t)));
+  //   HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x2_storage_t)));
+  //   HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_bf16x2_storage_t),
+  //   hipMemcpyHostToDevice));
+  //   fp8x2_e4m3_cxx_convert_bf16<<<1, 32>>>(d_in, d_out, 1);
+  //   HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_bf16x2_storage_t),
+  //   hipMemcpyDeviceToHost));
+  //   HIP_CHECK(hipFree(d_in));
+  //   HIP_CHECK(hipFree(d_out));
+  //   __hipext_ocp_fp8x2_e4m3 tmp(in, 0);
+  //   __amd_bf16x2_storage_t cpu_out = tmp.get_scaled_bf16x2(0);
+  //   REQUIRE(out[0] == cpu_out[0]);
+  //   REQUIRE(out[1] == cpu_out[1]);
+  // }
+}
+
+namespace cxx_ocp {
+__host__ __device__ static float fp8_e4m3_to_float(const float in) {
+  return __hipext_ocp_fp8_e4m3{in};
+}
+
+__host__ __device__ static float fp8_e4m3_to_float_scale(const float in,
+                                                         const __amd_scale_t scale) {
+  return __hipext_ocp_fp8_e4m3(in, /* seed */ 0, scale).get_scaled_float(scale);
+}
+
+__host__ __device__ static __amd_fp16_storage_t fp8_e4m3_to_fp16(const __amd_fp16_storage_t in) {
+  return __hipext_ocp_fp8_e4m3(in, /* seed */ 0, 0).get_scaled_fp16(0);
+}
+
+__host__ __device__ static __amd_fp16_storage_t fp8_e4m3_to_fp16_scale(
+    const __amd_fp16_storage_t in, const __amd_scale_t scale) {
+  return __hipext_ocp_fp8_e4m3(in, /* seed */ 0, scale).get_scaled_fp16(scale);
+}
+
+__host__ __device__ static __amd_bf16_storage_t fp8_e4m3_to_bf16(const __amd_bf16_storage_t in) {
+  return __hipext_ocp_fp8_e4m3(in, /* seed */ 0, 0).get_scaled_bf16(0);
+}
+
+__host__ __device__ static __amd_bf16_storage_t fp8_e4m3_to_bf16_scale(
+    const __amd_bf16_storage_t in, const __amd_scale_t scale) {
+  return __hipext_ocp_fp8_e4m3(in, /* seed */ 0, scale).get_scaled_bf16(scale);
+}
+
+__global__ void kernel_fp8_e4m3_to_float(float* in, float* out, size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e4m3_to_float(in[i]);
+  }
+}
+
+__global__ void kernel_fp8_e4m3_to_float_scale(float* in, float* out, const __amd_scale_t scale,
+                                               const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e4m3_to_float_scale(in[i], scale);
+  }
+}
+
+__global__ void kernel_fp8_e4m3_to_fp16(__amd_fp16_storage_t* in, __amd_fp16_storage_t* out,
+                                        size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e4m3_to_fp16(in[i]);
+  }
+}
+
+__global__ void kernel_fp8_e4m3_to_fp16_scale(__amd_fp16_storage_t* in, __amd_fp16_storage_t* out,
+                                              const __amd_scale_t scale, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e4m3_to_fp16_scale(in[i], scale);
+  }
+}
+
+__global__ void kernel_fp8_e4m3_to_bf16(__amd_bf16_storage_t* in, __amd_bf16_storage_t* out,
+                                        size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e4m3_to_bf16(in[i]);
+  }
+}
+
+__global__ void kernel_fp8_e4m3_to_bf16_scale(__amd_bf16_storage_t* in, __amd_bf16_storage_t* out,
+                                              const __amd_scale_t scale, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e4m3_to_bf16_scale(in[i], scale);
+  }
+}
+
+__host__ __device__ static float fp8_e5m2_to_float(const float in) {
+  return __hipext_ocp_fp8_e5m2{in};
+}
+
+__host__ __device__ static float fp8_e5m2_to_float_scale(const float in,
+                                                         const __amd_scale_t scale) {
+  return __hipext_ocp_fp8_e5m2(in, /* seed */ 0, scale).get_scaled_float(scale);
+}
+
+__host__ __device__ static __amd_fp16_storage_t fp8_e5m2_to_fp16(const __amd_fp16_storage_t in) {
+  return __hipext_ocp_fp8_e5m2(in, /* seed */ 0, 0).get_scaled_fp16(0);
+}
+
+__host__ __device__ static __amd_fp16_storage_t fp8_e5m2_to_fp16_scale(
+    const __amd_fp16_storage_t in, const __amd_scale_t scale) {
+  return __hipext_ocp_fp8_e5m2(in, /* seed */ 0, scale).get_scaled_fp16(scale);
+}
+
+__global__ void kernel_fp8_e5m2_to_float(float* in, float* out, size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e5m2_to_float(in[i]);
+  }
+}
+
+__global__ void kernel_fp8_e5m2_to_float_scale(float* in, float* out, const __amd_scale_t scale,
+                                               const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e5m2_to_float_scale(in[i], scale);
+  }
+}
+
+__global__ void kernel_fp8_e5m2_to_fp16(__amd_fp16_storage_t* in, __amd_fp16_storage_t* out,
+                                        size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e5m2_to_fp16(in[i]);
+  }
+}
+
+__global__ void kernel_fp8_e5m2_to_fp16_scale(__amd_fp16_storage_t* in, __amd_fp16_storage_t* out,
+                                              const __amd_scale_t scale, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8_e5m2_to_fp16_scale(in[i], scale);
+  }
+}
+} // namespace cxx_ocp
+
+TEST_CASE("Unit_ocp_cxx_fp8") {
+  using namespace cxx_ocp;
+  const float in = 20.0f;
+  float *d_in, *d_out;
+  __amd_fp16_storage_t fp16_in = in, *fp16_d_in, *fp16_d_out;
+  HIP_CHECK(hipMalloc(&d_in, sizeof(float)));
+  HIP_CHECK(hipMalloc(&d_out, sizeof(float)));
+  HIP_CHECK(hipMalloc(&fp16_d_in, sizeof(__amd_fp16_storage_t)));
+  HIP_CHECK(hipMalloc(&fp16_d_out, sizeof(__amd_fp16_storage_t)));
+  HIP_CHECK(hipMemcpy(d_in, &in, sizeof(float), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(fp16_d_in, &fp16_in, sizeof(__amd_fp16_storage_t), hipMemcpyHostToDevice));
+
+  SECTION("E4M3 ") {
+    SECTION("CPU") {
+      float out = fp8_e4m3_to_float(in);
+      INFO("CPU In: " << in << " Out: " << out);
+      REQUIRE(in == out);
+    }
+
+    SECTION("GPU") {
+      kernel_fp8_e4m3_to_float<<<1, 32>>>(d_in, d_out);
+      float out;
+      HIP_CHECK(hipMemcpy(&out, d_out, sizeof(float), hipMemcpyDeviceToHost));
+      INFO("GPU In: " << in << " Out: " << out);
+      REQUIRE(in == out);
+    }
+
+    SECTION("CPU Scale") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        float out = fp8_e4m3_to_float_scale(in, scale);
+        INFO("Scale: " << int(scale));
+        INFO("In: " << in << " Out: " << out);
+        REQUIRE(in == out);
+      }
+    }
+
+    SECTION("GPU Scale") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        kernel_fp8_e4m3_to_float_scale<<<1, 32>>>(d_in, d_out, scale);
+        float out;
+        HIP_CHECK(hipMemcpy(&out, d_out, sizeof(float), hipMemcpyDeviceToHost));
+        INFO("Scale: " << int(scale));
+        INFO("GPU In: " << in << " Out: " << out);
+        CHECK(in == out);
+      }
+    }
+
+    SECTION("CPU fp16") {
+      __amd_fp16_storage_t out = fp8_e4m3_to_fp16(fp16_in);
+      INFO("CPU In: " << in << " Out: " << float(out));
+      REQUIRE(in == out);
+    }
+
+    SECTION("GPU fp16") {
+      kernel_fp8_e4m3_to_fp16<<<1, 32>>>(fp16_d_in, fp16_d_out);
+      __amd_fp16_storage_t out;
+      HIP_CHECK(hipMemcpy(&out, fp16_d_out, sizeof(__amd_fp16_storage_t), hipMemcpyDeviceToHost));
+      INFO("GPU In: " << in << " Out: " << float(out));
+      REQUIRE(in == out);
+    }
+
+    SECTION("CPU fp16 Scale") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        __amd_fp16_storage_t out = fp8_e4m3_to_fp16_scale(in, scale);
+        INFO("Scale: " << int(scale));
+        INFO("In: " << in << " Out: " << float(out));
+        REQUIRE(fp16_in == out);
+      }
+    }
+
+    SECTION("GPU fp16 Scale") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        kernel_fp8_e4m3_to_fp16_scale<<<1, 32>>>(fp16_d_in, fp16_d_out, scale);
+        __amd_fp16_storage_t out;
+        HIP_CHECK(hipMemcpy(&out, fp16_d_out, sizeof(__amd_fp16_storage_t), hipMemcpyDeviceToHost));
+        INFO("Scale: " << int(scale));
+        INFO("GPU In: " << in << " Out: " << float(out));
+        CHECK(in == out);
+      }
+    }
+  }
+
" << int(scale)); + INFO("GPU In: " << in << " Out: " << out); + CHECK(in == out); + } + } + + SECTION("CPU fp16") { + __amd_fp16_storage_t out = fp8_e5m2_to_fp16(fp16_in); + INFO("GPU In: " << in << " Out: " << float(out)); + REQUIRE(in == out); + } + + SECTION("GPU fp16") { + kernel_fp8_e5m2_to_fp16<<<1, 32>>>(fp16_d_in, fp16_d_out); + __amd_fp16_storage_t out; + HIP_CHECK(hipMemcpy(&out, fp16_d_out, sizeof(__amd_fp16_storage_t), hipMemcpyDeviceToHost)); + INFO("GPU In: " << in << " Out: " << float(out)); + REQUIRE(in == out); + } + + SECTION("CPU fp16 Scale") { + std::vector<__amd_scale_t> scales = {0, 1, 2, 3}; + for (const auto scale : scales) { + __amd_fp16_storage_t out = fp8_e5m2_to_fp16_scale(in, scale); + INFO("Scale: " << int(scale)); + INFO("In: " << in << " Out: " << float(out)); + REQUIRE(in == out); + } + } + + SECTION("GPU fp16 Scale") { + std::vector<__amd_scale_t> scales = {0, 1, 2, 3}; + for (const auto scale : scales) { + kernel_fp8_e5m2_to_fp16_scale<<<1, 32>>>(fp16_d_in, fp16_d_out, scale); + __amd_fp16_storage_t out; + HIP_CHECK(hipMemcpy(&out, fp16_d_out, sizeof(__amd_fp16_storage_t), hipMemcpyDeviceToHost)); + INFO("Scale: " << int(scale)); + INFO("GPU In: " << in << " Out: " << float(out)); + CHECK(in == out); + } + } + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(fp16_d_in)); + HIP_CHECK(hipFree(fp16_d_out)); +} + +namespace cxx_ocp { +__host__ __device__ static __amd_floatx2_storage_t fp8x2_e4m3_to_float( + const __amd_floatx2_storage_t in) { + return __hipext_ocp_fp8x2_e4m3{in}; +} + +__host__ __device__ static __amd_floatx2_storage_t fp8x2_e4m3_to_float_scale( + const __amd_floatx2_storage_t in, const __amd_scale_t scale) { + return __hipext_ocp_fp8x2_e4m3(in, scale).get_scaled_floatx2(scale); +} + +__host__ __device__ static __amd_fp16x2_storage_t fp8x2_e4m3_to_fp16_scale( + const __amd_fp16x2_storage_t in, const __amd_scale_t scale) { + return __hipext_ocp_fp8x2_e4m3(in, scale).get_scaled_fp16x2(scale); +} + +__global__ void kernel_fp8x2_e4m3_to_float(__amd_floatx2_storage_t* in, + __amd_floatx2_storage_t* out, const size_t size = 1) { + int i = threadIdx.x; + if (i < size) { + out[i] = fp8x2_e4m3_to_float(in[i]); + } +} + +__global__ void kernel_fp8x2_e4m3_to_float_scale(__amd_floatx2_storage_t* in, + __amd_floatx2_storage_t* out, + const __amd_scale_t scale, const size_t size = 1) { + int i = threadIdx.x; + if (i < size) { + out[i] = fp8x2_e4m3_to_float_scale(in[i], scale); + } +} + +__global__ void kernel_fp8x2_e4m3_to_fp16_scale(__amd_fp16x2_storage_t* in, + __amd_fp16x2_storage_t* out, + const __amd_scale_t scale, const size_t size = 1) { + int i = threadIdx.x; + if (i < size) { + out[i] = fp8x2_e4m3_to_fp16_scale(in[i], scale); + } +} + +__host__ __device__ static __amd_floatx2_storage_t fp8x2_e5m2_to_float( + const __amd_floatx2_storage_t in) { + return __hipext_ocp_fp8x2_e5m2{in}; +} + +__host__ __device__ static __amd_floatx2_storage_t fp8x2_e5m2_to_float_scale( + const __amd_floatx2_storage_t in, const __amd_scale_t scale) { + return __hipext_ocp_fp8x2_e5m2(in, scale).get_scaled_floatx2(scale); +} + +__host__ __device__ static __amd_fp16x2_storage_t fp8x2_e5m2_to_fp16_scale( + const __amd_fp16x2_storage_t in, const __amd_scale_t scale) { + return __hipext_ocp_fp8x2_e5m2(in, scale).get_scaled_fp16x2(scale); +} + +__global__ void kernel_fp8x2_e5m2_to_float(__amd_floatx2_storage_t* in, + __amd_floatx2_storage_t* out, size_t size = 1) { + int i = threadIdx.x; + if (i < size) { + out[i] = fp8x2_e5m2_to_float(in[i]); + 
+namespace cxx_ocp {
+__host__ __device__ static __amd_floatx2_storage_t fp8x2_e4m3_to_float(
+    const __amd_floatx2_storage_t in) {
+  return __hipext_ocp_fp8x2_e4m3{in};
+}
+
+__host__ __device__ static __amd_floatx2_storage_t fp8x2_e4m3_to_float_scale(
+    const __amd_floatx2_storage_t in, const __amd_scale_t scale) {
+  return __hipext_ocp_fp8x2_e4m3(in, scale).get_scaled_floatx2(scale);
+}
+
+__host__ __device__ static __amd_fp16x2_storage_t fp8x2_e4m3_to_fp16_scale(
+    const __amd_fp16x2_storage_t in, const __amd_scale_t scale) {
+  return __hipext_ocp_fp8x2_e4m3(in, scale).get_scaled_fp16x2(scale);
+}
+
+__global__ void kernel_fp8x2_e4m3_to_float(__amd_floatx2_storage_t* in,
+                                           __amd_floatx2_storage_t* out, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8x2_e4m3_to_float(in[i]);
+  }
+}
+
+__global__ void kernel_fp8x2_e4m3_to_float_scale(__amd_floatx2_storage_t* in,
+                                                 __amd_floatx2_storage_t* out,
+                                                 const __amd_scale_t scale, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8x2_e4m3_to_float_scale(in[i], scale);
+  }
+}
+
+__global__ void kernel_fp8x2_e4m3_to_fp16_scale(__amd_fp16x2_storage_t* in,
+                                                __amd_fp16x2_storage_t* out,
+                                                const __amd_scale_t scale, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8x2_e4m3_to_fp16_scale(in[i], scale);
+  }
+}
+
+__host__ __device__ static __amd_floatx2_storage_t fp8x2_e5m2_to_float(
+    const __amd_floatx2_storage_t in) {
+  return __hipext_ocp_fp8x2_e5m2{in};
+}
+
+__host__ __device__ static __amd_floatx2_storage_t fp8x2_e5m2_to_float_scale(
+    const __amd_floatx2_storage_t in, const __amd_scale_t scale) {
+  return __hipext_ocp_fp8x2_e5m2(in, scale).get_scaled_floatx2(scale);
+}
+
+__host__ __device__ static __amd_fp16x2_storage_t fp8x2_e5m2_to_fp16_scale(
+    const __amd_fp16x2_storage_t in, const __amd_scale_t scale) {
+  return __hipext_ocp_fp8x2_e5m2(in, scale).get_scaled_fp16x2(scale);
+}
+
+__global__ void kernel_fp8x2_e5m2_to_float(__amd_floatx2_storage_t* in,
+                                           __amd_floatx2_storage_t* out, size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8x2_e5m2_to_float(in[i]);
+  }
+}
+
+__global__ void kernel_fp8x2_e5m2_to_float_scale(__amd_floatx2_storage_t* in,
+                                                 __amd_floatx2_storage_t* out,
+                                                 const __amd_scale_t scale, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8x2_e5m2_to_float_scale(in[i], scale);
+  }
+}
+
+__global__ void kernel_fp8x2_e5m2_to_fp16_scale(__amd_fp16x2_storage_t* in,
+                                                __amd_fp16x2_storage_t* out,
+                                                const __amd_scale_t scale, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp8x2_e5m2_to_fp16_scale(in[i], scale);
+  }
+}
+} // namespace cxx_ocp
+
+TEST_CASE("Unit_ocp_cxx_fp8x2") {
+  using namespace cxx_ocp;
+  const __amd_floatx2_storage_t in = {-5.0f, 7.0f};
+  __amd_floatx2_storage_t *d_in, *d_out;
+  __amd_fp16x2_storage_t fp16_in{static_cast<_Float16>(in[0]), static_cast<_Float16>(in[1])},
+      *fp16_d_in, *fp16_d_out;
+  HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t)));
+  HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t)));
+  HIP_CHECK(hipMalloc(&fp16_d_in, sizeof(__amd_fp16x2_storage_t)));
+  HIP_CHECK(hipMalloc(&fp16_d_out, sizeof(__amd_fp16x2_storage_t)));
+  HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_floatx2_storage_t), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(fp16_d_in, &fp16_in, sizeof(__amd_fp16x2_storage_t), hipMemcpyHostToDevice));
+
+  SECTION("E4M3") {
+    SECTION("CPU") {
+      auto ret = fp8x2_e4m3_to_float(in);
+      INFO("In : " << in[0] << ", " << in[1]);
+      INFO("Out: " << ret[0] << ", " << ret[1]);
+      REQUIRE(in[0] == ret[0]);
+      REQUIRE(in[1] == ret[1]);
+    }
+
+    SECTION("GPU") {
+      kernel_fp8x2_e4m3_to_float<<<1, 32>>>(d_in, d_out);
+      __amd_floatx2_storage_t ret;
+      HIP_CHECK(hipMemcpy(&ret, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost));
+      INFO("In : " << in[0] << ", " << in[1]);
+      INFO("Out: " << ret[0] << ", " << ret[1]);
+      REQUIRE(in[0] == ret[0]);
+      REQUIRE(in[1] == ret[1]);
+    }
+
+    SECTION("CPU Scaled") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        __amd_floatx2_storage_t ret = fp8x2_e4m3_to_float_scale(in, scale);
+        INFO("In : " << in[0] << ", " << in[1]);
+        INFO("Out: " << ret[0] << ", " << ret[1]);
+        REQUIRE(in[0] == ret[0]);
+        REQUIRE(in[1] == ret[1]);
+      }
+    }
+
+    SECTION("GPU Scaled") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        __amd_floatx2_storage_t ret;
+        kernel_fp8x2_e4m3_to_float_scale<<<1, 32>>>(d_in, d_out, scale);
+        HIP_CHECK(hipMemcpy(&ret, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost));
+        INFO("In : " << in[0] << ", " << in[1]);
+        INFO("Out: " << ret[0] << ", " << ret[1]);
+        REQUIRE(in[0] == ret[0]);
+        REQUIRE(in[1] == ret[1]);
+      }
+    }
+
+    SECTION("CPU fp16 scale") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        auto ret = fp8x2_e4m3_to_fp16_scale(fp16_in, scale);
+        INFO("In : " << in[0] << ", " << in[1]);
+        INFO("Out: " << float(ret[0]) << ", " << float(ret[1]));
+        REQUIRE(fp16_in[0] == ret[0]);
+        REQUIRE(fp16_in[1] == ret[1]);
+      }
+    }
+
+    SECTION("GPU fp16 scale") {
+      __amd_fp16x2_storage_t ret;
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        kernel_fp8x2_e4m3_to_fp16_scale<<<1, 32>>>(fp16_d_in, fp16_d_out, scale);
+        HIP_CHECK(
+            hipMemcpy(&ret, fp16_d_out, sizeof(__amd_fp16x2_storage_t), hipMemcpyDeviceToHost));
+        INFO("In : " << in[0] << ", " << in[1]);
+        INFO("Out: " << float(ret[0]) << ", " << float(ret[1]));
+        REQUIRE(fp16_in[0] == ret[0]);
+        REQUIRE(fp16_in[1] == ret[1]);
+      }
+    }
+  }
+
+  SECTION("E5M2") {
+    SECTION("CPU") {
+      auto ret = fp8x2_e5m2_to_float(in);
+      INFO("In : " << in[0] << ", " << in[1]);
+      INFO("Out: " << ret[0] << ", " << ret[1]);
+      REQUIRE(in[0] == ret[0]);
+      REQUIRE(in[1] == ret[1]);
+    }
+
+    SECTION("GPU") {
+      kernel_fp8x2_e5m2_to_float<<<1, 32>>>(d_in, d_out);
+      __amd_floatx2_storage_t ret;
+      HIP_CHECK(hipMemcpy(&ret, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost));
+      INFO("In : " << in[0] << ", " << in[1]);
+      INFO("Out: " << ret[0] << ", " << ret[1]);
+      REQUIRE(in[0] == ret[0]);
+      REQUIRE(in[1] == ret[1]);
+    }
+
+    SECTION("CPU Scaled") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        __amd_floatx2_storage_t ret = fp8x2_e5m2_to_float_scale(in, scale);
+        INFO("In : " << in[0] << ", " << in[1]);
+        INFO("Out: " << ret[0] << ", " << ret[1]);
+        REQUIRE(in[0] == ret[0]);
+        REQUIRE(in[1] == ret[1]);
+      }
+    }
+
+    SECTION("GPU Scaled") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        __amd_floatx2_storage_t ret;
+        kernel_fp8x2_e5m2_to_float_scale<<<1, 32>>>(d_in, d_out, scale);
+        HIP_CHECK(hipMemcpy(&ret, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost));
+        INFO("In : " << in[0] << ", " << in[1]);
+        INFO("Out: " << ret[0] << ", " << ret[1]);
+        REQUIRE(in[0] == ret[0]);
+        REQUIRE(in[1] == ret[1]);
+      }
+    }
+
+    SECTION("CPU fp16 scale") {
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        auto ret = fp8x2_e5m2_to_fp16_scale(fp16_in, scale);
+        INFO("In : " << in[0] << ", " << in[1]);
+        INFO("Out: " << float(ret[0]) << ", " << float(ret[1]));
+        REQUIRE(fp16_in[0] == ret[0]);
+        REQUIRE(fp16_in[1] == ret[1]);
+      }
+    }
+
+    SECTION("GPU fp16 scale") {
+      __amd_fp16x2_storage_t ret;
+      std::vector<__amd_scale_t> scales = {0, 1, 2, 3};
+      for (const auto scale : scales) {
+        kernel_fp8x2_e5m2_to_fp16_scale<<<1, 32>>>(fp16_d_in, fp16_d_out, scale);
+        HIP_CHECK(
+            hipMemcpy(&ret, fp16_d_out, sizeof(__amd_fp16x2_storage_t), hipMemcpyDeviceToHost));
+        INFO("In : " << in[0] << ", " << in[1]);
+        INFO("Out: " << float(ret[0]) << ", " << float(ret[1]));
+        REQUIRE(fp16_in[0] == ret[0]);
+        REQUIRE(fp16_in[1] == ret[1]);
+      }
+    }
+  }
+
+  HIP_CHECK(hipFree(d_in));
+  HIP_CHECK(hipFree(d_out));
+  HIP_CHECK(hipFree(fp16_d_in));
+  HIP_CHECK(hipFree(fp16_d_out));
+}
+
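+// __amd_floatx32_storage_t is a 32-float (1024-bit) vector, and the host
+// side of the x32 conversions below is only compiled when AVX-512 is
+// available; the fp8x8/fp4x8 host helpers elsewhere in these tests are gated
+// on __AVX__ for the same reason.
+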
+namespace cxx_ocp {
+#if __AVX512F__
+__host__ __device__ __amd_floatx32_storage_t
+fp6x32_e3m2_to_float_scale(const __amd_floatx32_storage_t in, __amd_scale_t scale) {
+  __hipext_ocp_fp6x32_e3m2 tmp(in, 0, scale);
+  return tmp.get_scaled_floatx32(scale);
+}
+
+__global__ void kernel_fp6x32_e3m2_to_float_scale(__amd_floatx32_storage_t* in,
+                                                  __amd_floatx32_storage_t* out,
+                                                  const __amd_scale_t scale,
+                                                  const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp6x32_e3m2_to_float_scale(in[i], scale);
+  }
+}
+
+__host__ __device__ __amd_floatx32_storage_t
+fp6x32_e2m3_to_float_scale(const __amd_floatx32_storage_t in, __amd_scale_t scale) {
+  __hipext_ocp_fp6x32_e2m3 tmp(in, 0, scale);
+  return tmp.get_scaled_floatx32(scale);
+}
+
+__global__ void kernel_fp6x32_e2m3_to_float_scale(__amd_floatx32_storage_t* in,
+                                                  __amd_floatx32_storage_t* out,
+                                                  const __amd_scale_t scale,
+                                                  const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp6x32_e2m3_to_float_scale(in[i], scale);
+  }
+}
+#endif
+} // namespace cxx_ocp
+
+#if __AVX512F__
+TEST_CASE("Unit_ocp_cxx_fp6x32") {
+  using namespace cxx_ocp;
+  __amd_floatx32_storage_t in;
+  float iter = -8.0f;
+  for (int i = 0; i < 32; i++) {
+    in[i] = iter;
+    iter++;
+    if (iter > 8.0f) {
+      iter = -8.0f;
+    }
+  }
+  __amd_floatx32_storage_t *d_in, *d_out;
+  HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx32_storage_t)));
+  HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t)));
+  HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_floatx32_storage_t), hipMemcpyHostToDevice));
+
+  SECTION("E3M2") {
+    std::vector<__amd_scale_t> scales{0, 1, 2};
+    for (const auto scale : scales) {
+      auto cpu_res = fp6x32_e3m2_to_float_scale(in, scale);
+      kernel_fp6x32_e3m2_to_float_scale<<<1, 32>>>(d_in, d_out, scale);
+      __amd_floatx32_storage_t gpu_res;
+      HIP_CHECK(
+          hipMemcpy(&gpu_res, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost));
+      for (size_t i = 0; i < 32; i++) {
+        INFO("Index: " << i << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]);
+        REQUIRE(cpu_res[i] == gpu_res[i]);
+      }
+    }
+  }
+
+  SECTION("E2M3") {
+    std::vector<__amd_scale_t> scales{0, 1, 2};
+    for (const auto scale : scales) {
+      auto cpu_res = fp6x32_e2m3_to_float_scale(in, scale);
+      kernel_fp6x32_e2m3_to_float_scale<<<1, 32>>>(d_in, d_out, scale);
+      __amd_floatx32_storage_t gpu_res;
+      HIP_CHECK(
+          hipMemcpy(&gpu_res, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost));
+      for (size_t i = 0; i < 32; i++) {
+        INFO("Index: " << i << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]);
+        REQUIRE(cpu_res[i] == gpu_res[i]);
+      }
+    }
+  }
+
+  HIP_CHECK(hipFree(d_in));
+  HIP_CHECK(hipFree(d_out));
+}
+#endif
+
+namespace cxx_ocp {
+__host__ __device__ __amd_floatx2_storage_t
+fp4x2_e2m1_to_float_scale(const __amd_floatx2_storage_t in, __amd_scale_t scale) {
+  __hipext_ocp_fp4x2_e2m1 tmp(in, 0, scale);
+  return tmp.get_scaled_floatx2(scale);
+}
+
+__global__ void kernel_fp4x2_e2m1_to_float_scale(__amd_floatx2_storage_t* in,
+                                                 __amd_floatx2_storage_t* out,
+                                                 const __amd_scale_t scale, const size_t size = 1) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = fp4x2_e2m1_to_float_scale(in[i], scale);
+  }
+}
+} // namespace cxx_ocp
+
+TEST_CASE("Unit_ocp_cxx_fp4x2") {
+  using namespace cxx_ocp;
+  const __amd_floatx2_storage_t in = {-2.0f, 3.0f};
+  __amd_floatx2_storage_t *d_in, *d_out;
+  HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t)));
+  HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t)));
+  HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_floatx2_storage_t), hipMemcpyHostToDevice));
+
+  SECTION("CPU-GPU compare") {
+    std::vector<__amd_scale_t> scales{0, 1, 2};
+    for (const auto scale : scales) {
+      auto cpu_res = fp4x2_e2m1_to_float_scale(in, scale);
+      __amd_floatx2_storage_t gpu_res;
+      kernel_fp4x2_e2m1_to_float_scale<<<1, 32>>>(d_in, d_out, scale);
+      HIP_CHECK(hipMemcpy(&gpu_res, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost));
+      INFO("CPU: " << cpu_res[0] << ", " << cpu_res[1]);
+      INFO("GPU: " << gpu_res[0] << ", " << gpu_res[1]);
+      REQUIRE(cpu_res[0] == gpu_res[0]);
+      REQUIRE(cpu_res[1] == gpu_res[1]);
+    }
+  }
+
+  HIP_CHECK(hipFree(d_in));
+  HIP_CHECK(hipFree(d_out));
+}
diff --git a/catch/unit/deviceLib/ext_extract.cc b/catch/unit/deviceLib/ext_extract.cc
new file mode 100644
index 000000000..fd9a27617
--- /dev/null
+++ b/catch/unit/deviceLib/ext_extract.cc
@@ -0,0 +1,243 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+
+#include <hip_test_common.hh>
+#include <hip/hip_ext_ocp.h>  // assumed header name for the __hipext_ocp_*/__amd_* extension types
+
+TEST_CASE("Unit_amd_ocp_type_to_hip_type") {
+  SECTION("fp16") {
+    float in = 10.0f;
+    __half hf = in;
+    auto fp16 = __amd_cvt_half_to_fp16(hf);
+    auto hf_back = __amd_cvt_fp16_to_half(fp16);
+    float out = fp16;
+    REQUIRE(out == in);
+    REQUIRE(float(hf_back) == float(hf));
+  }
+
+  SECTION("fp16x2") {
+    float a = -10.0f, b = 10.0f;
+    __half2 hf = {a, b};
+    auto fp16x2 = __amd_cvt_half2_to_fp16x2(hf);
+    auto hf_back = __amd_cvt_fp16x2_to_half2(fp16x2);
+    float o_a = fp16x2[0], o_b = fp16x2[1];
+    REQUIRE(o_a == a);
+    REQUIRE(o_b == b);
+    REQUIRE(__hbeq2(hf_back, hf));
+  }
+
+  SECTION("floatx2 to float2") {
+    __amd_floatx2_storage_t in = {-10.0f, 10.0f};
+    auto f2 = __amd_cvt_floatx2_to_float2(in);
+    REQUIRE(f2.x == in[0]);
+    REQUIRE(f2.y == in[1]);
+  }
+
+  SECTION("bf16") {
+    float in = 10.0f;
+    __hip_bfloat16 bf = in;
+    auto bf16 = __amd_cvt_hipbf16_to_bf16(bf);
+    auto bf_back = __amd_cvt_bf16_to_hipbf16(bf16);
+    float out = bf16;
+    REQUIRE(out == in);
+    REQUIRE(float(bf_back) == float(bf));
+  }
+
+  SECTION("bf16x2") {
+    float a = -10.0f, b = 10.0f;
+    __hip_bfloat162 bf = {a, b};
+    auto bf16x2 = __amd_cvt_hipbf162_to_bf16x2(bf);
+    auto bf_back = __amd_cvt_bf16x2_to_hipbf162(bf16x2);
+    float o_a = bf16x2[0], o_b = bf16x2[1];
+    REQUIRE(o_a == a);
+    REQUIRE(o_b == b);
+    REQUIRE(__hbeq2(bf_back, bf));
+  }
+}
+
+template <typename Kernel, typename... Args>
+static __global__ void t_lambda_launch(Kernel k, Args... args) {
+  int i = threadIdx.x;
+  if (i == 0) {
+    k(args...);
+  }
+}
+
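+// Flat-index sketch (a hypothetical convenience, not part of the extension
+// API; the __amd_fp8x8_storage_t name is assumed by analogy with
+// __amd_fp8x2_storage_t): the checks below address an fp8x8 as four fp8x2
+// pairs, so flat element k lives at pair k / 2, lane k % 2.
+//
+//   __host__ __device__ inline float extract_fp8x8_element(
+//       const __amd_fp8x8_storage_t v, const int k,
+//       const __amd_fp8_interpretation_t interpret) {
+//     return __amd_cvt_fp8_to_float(
+//         __amd_extract_fp8(__amd_extract_fp8x2(v, k / 2), k % 2), interpret);
+//   }
+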
+TEST_CASE("Unit_amd_ocp_extract_tests") {
+  SECTION("fp8x2 host") {
+    constexpr auto interpret = __AMD_OCP_E4M3;
+    __amd_floatx2_storage_t in{-10.0f, 10.0f};
+    auto fp8x2 = __amd_cvt_floatx2_to_fp8x2(in, interpret);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(fp8x2, 0), interpret) == -10.0f);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(fp8x2, 1), interpret) == 10.0f);
+  }
+
+#if __AVX__
+  SECTION("fp8x8 host") {
+    constexpr auto interpret = __AMD_OCP_E4M3;
+    __amd_floatx8_storage_t in{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+    auto fp8x8 = __amd_cvt_floatx8_to_fp8x8_scale(in, interpret, 0);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 0), 0),
+                                   interpret) == 1.0f);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 0), 1),
+                                   interpret) == 2.0f);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 1), 0),
+                                   interpret) == 3.0f);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 1), 1),
+                                   interpret) == 4.0f);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 2), 0),
+                                   interpret) == 5.0f);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 2), 1),
+                                   interpret) == 6.0f);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 3), 0),
+                                   interpret) == 7.0f);
+    REQUIRE(__amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 3), 1),
+                                   interpret) == 8.0f);
+  }
+#endif
+
+  SECTION("fp8x2 device") {
+    auto l = [] __device__(float a, float b, float* o_a, float* o_b) {
+      constexpr auto interpret = __AMD_OCP_E4M3;
+      __amd_floatx2_storage_t in{a, b};
+      auto fp8x2 = __amd_cvt_floatx2_to_fp8x2(in, interpret);
+      *o_a = __amd_cvt_fp8_to_float(__amd_extract_fp8(fp8x2, 0), interpret);
+      *o_b = __amd_cvt_fp8_to_float(__amd_extract_fp8(fp8x2, 1), interpret);
+    };
+
+    float a = -10.0f, b = 10.0f, *res_a, *res_b;
+
+    HIP_CHECK(hipMallocManaged(&res_a, sizeof(float)));
+    HIP_CHECK(hipMallocManaged(&res_b, sizeof(float)));
+    t_lambda_launch<<<1, 32>>>(l, a, b, res_a, res_b);
+    HIP_CHECK(hipDeviceSynchronize());
+    REQUIRE(*res_a == a);
+    REQUIRE(*res_b == b);
+
+    HIP_CHECK(hipFree(res_a));
+    HIP_CHECK(hipFree(res_b));
+  }
+
+  SECTION("fp8x8 device") {
+    auto l = [] __device__(float* res) {
+      constexpr auto interpret = __AMD_OCP_E4M3;
+      __amd_floatx8_storage_t in{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+      auto fp8x8 = __amd_cvt_floatx8_to_fp8x8_scale(in, interpret, 0);
+      res[0] =
+          __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 0), 0), interpret);
+      res[1] =
+          __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 0), 1), interpret);
+      res[2] =
+          __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 1), 0), interpret);
+      res[3] =
+          __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 1), 1), interpret);
+      res[4] =
+          __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 2), 0), interpret);
+      res[5] =
+          __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 2), 1), interpret);
+      res[6] =
+          __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 3), 0), interpret);
+      res[7] =
+          __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(fp8x8, 3), 1), interpret);
+    };
+
+    float* res;
+
+    // The lambda fills res[0..7], so eight floats are needed here.
+    HIP_CHECK(hipMallocManaged(&res, sizeof(float) * 8));
+    t_lambda_launch<<<1, 32>>>(l, res);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    for (size_t i = 0; i < 8; i++) {
+      INFO("Index: " << i << " res: " << res[i]);
+      REQUIRE(res[i] == static_cast<float>(i + 1));
+    }
+
+    HIP_CHECK(hipFree(res));
+  }
+
+  SECTION("fp4x2 host") {
+    constexpr auto interpret = __AMD_OCP_E2M1;
+    __amd_floatx2_storage_t in{-1.0f, 1.0f};
+    auto fp4x2 = __amd_cvt_floatx2_to_fp4x2_scale(in, interpret, 0 /* scale */);
+    REQUIRE((__amd_extract_fp4(fp4x2, 0) & 0b1000) != 0);
+    REQUIRE((__amd_extract_fp4(fp4x2, 1) & 0b1000) == 0);
+  }
+
+#if __AVX__
+  SECTION("fp4x8 host") {
+    constexpr auto interpret = __AMD_OCP_E2M1;
+    __amd_floatx8_storage_t in{0, 1, 1, 0, 1, 0, 0, 1};
+    auto fp4x8 = __amd_cvt_floatx8_to_fp4x8_scale(in, interpret, 0);
+    for (size_t i = 0; i < 4; i++) {
+      auto r1 = __amd_cvt_fp4x2_to_floatx2_scale(__amd_extract_fp4x2(fp4x8, i), interpret, 0);
+      INFO("Index: " << i << " vals: " << in[i * 2] << ", " << in[i * 2 + 1]);
+      CHECK(r1[0] == in[i * 2]);
+      CHECK(r1[1] == in[i * 2 + 1]);
+    }
+  }
+#endif
+
+  SECTION("fp4x2 device") {
+    auto l = [] __device__(__amd_fp4x2_storage_t* res) {
+      constexpr auto interpret = __AMD_OCP_E2M1;
+      __amd_floatx2_storage_t in{-1.0f, 1.0f};
+      *res = __amd_cvt_floatx2_to_fp4x2_scale(in, interpret, 0 /* scale */);
+    };
+
+    __amd_fp4x2_storage_t* fp4x2;
+    HIP_CHECK(hipMallocManaged(&fp4x2, sizeof(__amd_fp4x2_storage_t)));
+    t_lambda_launch<<<1, 32>>>(l, fp4x2);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    REQUIRE((__amd_extract_fp4(*fp4x2, 0) & 0b1000) != 0);
+    REQUIRE((__amd_extract_fp4(*fp4x2, 1) & 0b1000) == 0);
+    HIP_CHECK(hipFree(fp4x2));
+  }
+
+  SECTION("fp4x8 device") {
+    auto l = [] __device__(__amd_fp4x2_storage_t* res) {
+      __amd_floatx8_storage_t in{0, 1, 1, 0, 1, 0, 0, 1};
+      auto fp4x8 = __amd_cvt_floatx8_to_fp4x8_scale(in, __AMD_OCP_E2M1, 0);
+      for (size_t i = 0; i < 4; i++) {
+        res[i] = __amd_extract_fp4x2(fp4x8, i);
+      }
+    };
+
+    __amd_fp4x2_storage_t* fp4x2;
+    HIP_CHECK(hipMallocManaged(&fp4x2, sizeof(__amd_fp4x2_storage_t) * 4));
+    t_lambda_launch<<<1, 32>>>(l, fp4x2);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    __amd_floatx8_storage_t in{0, 1, 1, 0, 1, 0, 0, 1};
+    for (size_t i = 0; i < 4; i++) {
+      auto r1 = __amd_cvt_fp4x2_to_floatx2_scale(fp4x2[i], __AMD_OCP_E2M1, 0);
+      INFO("Index: " << i << " vals: " << in[i * 2] << ", " << in[i * 2 + 1]);
+      CHECK(r1[0] == in[i * 2]);
+      CHECK(r1[1] == in[i * 2 + 1]);
+    }
+    HIP_CHECK(hipFree(fp4x2));
+  }
+}
diff --git a/catch/unit/deviceLib/ext_ocp.cc b/catch/unit/deviceLib/ext_ocp.cc
new file mode 100644
index 000000000..cd493b47d
--- /dev/null
+++ b/catch/unit/deviceLib/ext_ocp.cc
@@ -0,0 +1,3056 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include <hip_test_common.hh>
+#include <hip/hip_ext_ocp.h>  // assumed header name for the __hipext_ocp_*/__amd_* extension types
+
+// List of all gfx which support OCP HW capabilities, append here
+static const std::vector<std::string> ocp_capable_hw{"gfx950"};
+
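+// The _sr converters below take an rng value that seeds stochastic rounding:
+// a value falling between two representable fp8 numbers rounds up or down
+// randomly rather than always to the nearest, so results stay within one ulp
+// of the input instead of being bit-identical across seeds. The <= 2.0f
+// tolerances used by the test cases further down budget for that one-ulp
+// slack at the largest inputs used here.
+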
+static __global__ void float_to_fp8_sr(float* in, __amd_fp8_storage_t* out,
+                                       __amd_fp8_interpretation_t interpret, int size,
+                                       unsigned int rng = 0) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = __amd_cvt_float_to_fp8_sr(in[i], interpret, rng);
+  }
+}
+
+static __global__ void float_to_fp8_sr_scale(float* in, __amd_fp8_storage_t* out,
+                                             __amd_fp8_interpretation_t interpret, int size,
+                                             unsigned int rng, __amd_scale_t scale) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = __amd_cvt_float_to_fp8_sr_scale(in[i], interpret, rng, scale);
+  }
+}
+
+static __global__ void fp8_to_float(__amd_fp8_storage_t* in, __amd_fp8_interpretation_t interpret,
+                                    float* out, int size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = __amd_cvt_fp8_to_float(in[i], interpret);
+  }
+}
+
+static __global__ void fp8_to_float_scale(__amd_fp8_storage_t* in,
+                                          __amd_fp8_interpretation_t interpret, float* out,
+                                          int size, __amd_scale_t scale) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = __amd_cvt_fp8_to_float_scale(in[i], interpret, scale);
+  }
+}
+
+static __global__ void floatx2_to_fp8x2(__amd_floatx2_storage_t* in, __amd_fp8x2_storage_t* out,
+                                        __amd_fp8_interpretation_t interpret, int size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = __amd_cvt_floatx2_to_fp8x2(in[i], interpret);
+  }
+}
+
+static __global__ void fp8x2_to_floatx2(__amd_fp8x2_storage_t* in,
+                                        __amd_fp8_interpretation_t interpret,
+                                        __amd_floatx2_storage_t* out, int size) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = __amd_cvt_fp8x2_to_floatx2(in[i], interpret);
+  }
+}
+
+static __global__ void floatx2_to_fp8x2_scale(__amd_floatx2_storage_t* in,
+                                              __amd_fp8x2_storage_t* out,
+                                              __amd_fp8_interpretation_t interpret, int size,
+                                              __amd_scale_t scale) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = __amd_cvt_floatx2_to_fp8x2_scale(in[i], interpret, scale);
+  }
+}
+
+static __global__ void fp8x2_to_floatx2_scale(__amd_fp8x2_storage_t* in,
+                                              __amd_fp8_interpretation_t interpret,
+                                              __amd_floatx2_storage_t* out, int size,
+                                              __amd_scale_t scale) {
+  int i = threadIdx.x;
+  if (i < size) {
+    out[i] = __amd_cvt_fp8x2_to_floatx2_scale(in[i], interpret, scale);
+  }
+}
+
+static __global__ void cxx_fp8_to_float_e4m3(float* res1, float* res2, float* res3, float* res4,
+                                             float* res5, float aa, __amd_scale_t scale,
+                                             unsigned int seed) {
+  int i = threadIdx.x;
+  float a = aa + i;
+  __half hf = a;
+  __hip_bfloat16 bf16 = a;
+  auto fp8_e4m3 = __hipext_ocp_fp8_e4m3(a);
+  auto fp8_e4m3_seed = __hipext_ocp_fp8_e4m3(a, seed);
+  auto fp8_e4m3_scale_seed = __hipext_ocp_fp8_e4m3(a, seed, scale);
+  auto fp8_from_half = __hipext_ocp_fp8_e4m3(__amd_cvt_half_to_fp16(hf), seed, scale);
+  auto fp8_from_bf16 = __hipext_ocp_fp8_e4m3(__amd_cvt_hipbf16_to_bf16(bf16), seed, scale);
+  res1[i] = fp8_e4m3;
+  res2[i] = fp8_e4m3_seed;
+  res3[i] = fp8_e4m3_scale_seed.get_scaled_float(scale);
+  res4[i] = fp8_from_half.get_scaled_float(scale);
+  res5[i] = fp8_from_bf16.get_scaled_float(scale);
+}
+
+static __global__ void cxx_fp8_to_float_e5m2(float* res1, float* res2, float* res3, float* res4,
+                                             float* res5, float aa, __amd_scale_t scale,
+                                             unsigned int seed) {
+  int i = threadIdx.x;
+  float a = aa + i;
+  __half hf = a;
+  __hip_bfloat16 bf16 = a;
+  auto fp8_e5m2 = __hipext_ocp_fp8_e5m2(a);
+  auto fp8_e5m2_seed = __hipext_ocp_fp8_e5m2(a, seed);
+  auto fp8_e5m2_scale_seed = __hipext_ocp_fp8_e5m2(a, seed, scale);
+  auto fp8_from_half = __hipext_ocp_fp8_e5m2(__amd_cvt_half_to_fp16(hf), seed, scale);
+  auto fp8_from_bf16 = __hipext_ocp_fp8_e5m2(__amd_cvt_hipbf16_to_bf16(bf16), seed, scale);
+  res1[i] = fp8_e5m2;
+  res2[i] = fp8_e5m2_seed;
+  res3[i] = fp8_e5m2_scale_seed.get_scaled_float(scale);
+  res4[i] = fp8_from_half.get_scaled_float(scale);
+  res5[i] = fp8_from_bf16.get_scaled_float(scale);
+}
+
+static __global__ void cxx_fp8x2_to_floatx2_e4m3(
+    __amd_floatx2_storage_t* res1, __amd_floatx2_storage_t* res2, __amd_floatx2_storage_t* res3,
+    __amd_floatx2_storage_t* res4, __amd_floatx2_storage_t* res5, __amd_floatx2_storage_t* res6,
+    float aa, float bb, __amd_scale_t scale) {
+  int i = threadIdx.x;
+  float a = aa + i;
+  float b = bb + i;
+  __amd_floatx2_storage_t fpx2{a, b};
+  __half2 fp16x2{a, b};
+  __hip_bfloat162 bf16x2{a, b};
+  auto fp8x2_e4m3_from_float = __hipext_ocp_fp8x2_e4m3(a, b);
+  auto fp8x2_e4m3_from_floatx2 = __hipext_ocp_fp8x2_e4m3(fpx2);
+  auto fp8x2_e4m3_scale = __hipext_ocp_fp8x2_e4m3(a, b, scale);
+  auto fp8x2_e4m3_from_floatx2_scale = __hipext_ocp_fp8x2_e4m3(fpx2, scale);
+  auto fp8x2_from_half = __hipext_ocp_fp8x2_e4m3(__amd_cvt_half2_to_fp16x2(fp16x2), scale);
+  auto fp8x2_from_bf16 = __hipext_ocp_fp8x2_e4m3(__amd_cvt_hipbf162_to_bf16x2(bf16x2), scale);
+  res1[i] = fp8x2_e4m3_from_float;
+  res2[i] = fp8x2_e4m3_from_floatx2;
+  res3[i] = fp8x2_e4m3_scale.get_scaled_floatx2(scale);
+  res4[i] = fp8x2_e4m3_from_floatx2_scale.get_scaled_floatx2(scale);
+  res5[i] = fp8x2_from_half.get_scaled_floatx2(scale);
+  res6[i] = fp8x2_from_bf16.get_scaled_floatx2(scale);
+}
+
+static __global__ void cxx_fp8x2_to_floatx2_e5m2(
+    __amd_floatx2_storage_t* res1, __amd_floatx2_storage_t* res2, __amd_floatx2_storage_t* res3,
+    __amd_floatx2_storage_t* res4, __amd_floatx2_storage_t* res5, __amd_floatx2_storage_t* res6,
+    float aa, float bb, __amd_scale_t scale) {
+  int i = threadIdx.x;
+  float a = aa + i;
+  float b = bb + i;
+  __amd_floatx2_storage_t fpx2{a, b};
+  __half2 fp16x2{a, b};
+  __hip_bfloat162 bf16x2{a, b};
+  auto fp8x2_e5m2_from_float = __hipext_ocp_fp8x2_e5m2(a, b);
+  auto fp8x2_e5m2_from_floatx2 = __hipext_ocp_fp8x2_e5m2(fpx2);
+  auto fp8x2_e5m2_scale = __hipext_ocp_fp8x2_e5m2(a, b, scale);
+  auto fp8x2_e5m2_from_floatx2_scale = __hipext_ocp_fp8x2_e5m2(fpx2, scale);
+  auto fp8x2_from_half = __hipext_ocp_fp8x2_e5m2(__amd_cvt_half2_to_fp16x2(fp16x2), scale);
+  auto fp8x2_from_bf16 = __hipext_ocp_fp8x2_e5m2(__amd_cvt_hipbf162_to_bf16x2(bf16x2), scale);
+  res1[i] = fp8x2_e5m2_from_float;
+  res2[i] = fp8x2_e5m2_from_floatx2;
+  res3[i] = fp8x2_e5m2_scale.get_scaled_floatx2(scale);
+  res4[i] = fp8x2_e5m2_from_floatx2_scale.get_scaled_floatx2(scale);
+  res5[i] = fp8x2_from_half.get_scaled_floatx2(scale);
+  res6[i] = fp8x2_from_bf16.get_scaled_floatx2(scale);
+}
+
+static __global__ void cxx_fp6x32_to_floatx32_e2m3(__amd_floatx32_storage_t* res,
+                                                   unsigned int round = 0,
+                                                   __amd_scale_t scale = 0) {
+  if (threadIdx.x == 0) {
+    __amd_floatx32_storage_t in;
+    for (int i = 0; i < 32; i++) {
+      in[i] = static_cast<float>(i % 8);
+    }
+    __hipext_ocp_fp6x32_e2m3 fp6(in, round, scale);
+    *res = fp6.get_scaled_floatx32(scale);
+  }
+}
+
+static __global__ void cxx_fp6x32_to_floatx32_e3m2(__amd_floatx32_storage_t* res,
+                                                   unsigned int round = 0,
+                                                   __amd_scale_t scale = 0) {
+  if (threadIdx.x == 0) {
+    __amd_floatx32_storage_t in;
+    for (int i = 0; i < 32; i++) {
+      in[i] = static_cast<float>(i % 8);
+    }
+    __hipext_ocp_fp6x32_e3m2 fp6(in, round, scale);
+    *res = fp6.get_scaled_floatx32(scale);
+  }
+}
+
+static __global__ void cxx_fp4x2_to_floatx2_e2m1(__amd_floatx2_storage_t* res1,
+                                                 __amd_floatx2_storage_t* res2,
+                                                 __amd_floatx2_storage_t* res3,
+                                                 __amd_floatx2_storage_t* res4, float a, float b,
+                                                 __amd_scale_t scale) {
+  int i = threadIdx.x;
+  __amd_floatx2_storage_t fpx2{a, b};
+  __half2 fp16x2{a, b};
+  __hip_bfloat162 bf16x2{a, b};
+  auto fp4x2_scale = __hipext_ocp_fp4x2_e2m1(a, b, scale);
+  auto fp4x2_from_floatx2_scale = __hipext_ocp_fp4x2_e2m1(fpx2, scale);
+  auto fp4x2_from_half = __hipext_ocp_fp4x2_e2m1(__amd_cvt_half2_to_fp16x2(fp16x2), scale);
+  auto fp4x2_from_bf16 = __hipext_ocp_fp4x2_e2m1(__amd_cvt_hipbf162_to_bf16x2(bf16x2), scale);
+  res1[i] = fp4x2_scale.get_scaled_floatx2(scale);
+  res2[i] = fp4x2_from_floatx2_scale.get_scaled_floatx2(scale);
+  res3[i] = fp4x2_from_half.get_scaled_floatx2(scale);
+  res4[i] = fp4x2_from_bf16.get_scaled_floatx2(scale);
+}
+
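+// pack_and_unpack_fp8x4 exercises __amd_create_fp8x2/__amd_create_fp8x8 and
+// round-trips the first four lanes; note the fp8x8 is built from the same
+// two pairs twice, so only indices 0-3 carry distinct data.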
+static __global__ void pack_and_unpack_fp8x4(float* a) {
+  int i = threadIdx.x;
+  if (i == 0) {
+    auto fp8_a = __amd_cvt_float_to_fp8_sr(a[0], __AMD_OCP_E4M3, 0 /* seed */);
+    auto fp8_b = __amd_cvt_float_to_fp8_sr(a[1], __AMD_OCP_E4M3, 0 /* seed */);
+    auto fp8_c = __amd_cvt_float_to_fp8_sr(a[2], __AMD_OCP_E4M3, 0 /* seed */);
+    auto fp8_d = __amd_cvt_float_to_fp8_sr(a[3], __AMD_OCP_E4M3, 0 /* seed */);
+    auto packed =
+        __amd_create_fp8x8(__amd_create_fp8x2(fp8_a, fp8_b), __amd_create_fp8x2(fp8_c, fp8_d),
+                           __amd_create_fp8x2(fp8_a, fp8_b), __amd_create_fp8x2(fp8_c, fp8_d));
+    a[0] = __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(packed, 0), 0),
+                                  __AMD_OCP_E4M3);
+    a[1] = __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(packed, 0), 1),
+                                  __AMD_OCP_E4M3);
+    a[2] = __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(packed, 1), 0),
+                                  __AMD_OCP_E4M3);
+    a[3] = __amd_cvt_fp8_to_float(__amd_extract_fp8(__amd_extract_fp8x2(packed, 1), 1),
+                                  __AMD_OCP_E4M3);
+  }
+}
+
+TEST_CASE("Unit_amd_ocp_fp8") {
+  constexpr int size = 32;
+  SECTION("E4M3") {
+    constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3;
+    std::vector<float> in;
+    in.reserve(size);
+    for (int i = -15; i <= (size / 2); i++) {
+      in.push_back(i * 1.0f);
+    }
+    float* d_ptr;
+    __amd_fp8_storage_t* d_out;
+    HIP_CHECK(hipMalloc(&d_ptr, sizeof(float) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp8_storage_t) * size));
+    HIP_CHECK(hipMemcpy(d_ptr, in.data(), sizeof(float) * in.size(), hipMemcpyHostToDevice));
+    float_to_fp8_sr<<<1, size>>>(d_ptr, d_out, interpret, size);
+    // d_out is populated, cvt back and populate in d_ptr
+    HIP_CHECK(hipMemset(d_ptr, 0, sizeof(float) * size));
+    fp8_to_float<<<1, size>>>(d_out, interpret, d_ptr, size);
+    std::vector<float> res(size, 0.0f);
+    HIP_CHECK(hipMemcpy(res.data(), d_ptr, sizeof(float) * res.size(), hipMemcpyDeviceToHost));
+    for (int i = 0; i < size; i++) {
+      INFO("Result: " << res[i] << " input: " << in[i]);
+      REQUIRE(std::fabs(res[i] - in[i]) <= 2.0f);
+    }
+    HIP_CHECK(hipFree(d_ptr));
+    HIP_CHECK(hipFree(d_out));
+  }
+
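+  // Why a tolerance of 2.0f suffices above and below (a back-of-the-envelope
+  // bound, not an API guarantee): the inputs span [-15, 16], the coarsest
+  // spacing any of them sees is 2 (e5m2 on [8, 16); e4m3 is finer there, and
+  // 16 itself is exact in both formats), and even stochastic rounding moves a
+  // value by strictly less than one spacing step.
+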
+  SECTION("E5M2") {
+    constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2;
+    std::vector<float> in;
+    in.reserve(size);
+    for (int i = -15; i <= (size / 2); i++) {
+      in.push_back(i * 1.0f);
+    }
+    float* d_ptr;
+    __amd_fp8_storage_t* d_out;
+    HIP_CHECK(hipMalloc(&d_ptr, sizeof(float) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp8_storage_t) * size));
+    HIP_CHECK(hipMemcpy(d_ptr, in.data(), sizeof(float) * in.size(), hipMemcpyHostToDevice));
+    float_to_fp8_sr<<<1, size>>>(d_ptr, d_out, interpret, size);
+    // d_out is populated, cvt back and populate in d_ptr
+    HIP_CHECK(hipMemset(d_ptr, 0, sizeof(float) * size));
+    fp8_to_float<<<1, size>>>(d_out, interpret, d_ptr, size);
+    std::vector<float> res(size, 0.0f);
+    HIP_CHECK(hipMemcpy(res.data(), d_ptr, sizeof(float) * res.size(), hipMemcpyDeviceToHost));
+    for (int i = 0; i < size; i++) {
+      INFO("Result: " << res[i] << " input: " << in[i]);
+      REQUIRE(std::fabs(res[i] - in[i]) <= 2.0f);
+    }
+    HIP_CHECK(hipFree(d_ptr));
+    HIP_CHECK(hipFree(d_out));
+  }
+
+  SECTION("E4M3x2") {
+    constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3;
+    std::vector<__amd_floatx2_storage_t> in;
+    for (int i = 0; i < size; i++) {
+      __amd_floatx2_storage_t tmp{i + 1.0f, i * 1.0f};
+      in.push_back(tmp);
+    }
+    __amd_floatx2_storage_t* d_in;
+    __amd_fp8x2_storage_t* d_out;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp8x2_storage_t) * size));
+    HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * in.size(),
+                        hipMemcpyHostToDevice));
+    floatx2_to_fp8x2<<<1, size>>>(d_in, d_out, interpret, size);
+    HIP_CHECK(hipMemset(d_in, 0, sizeof(__amd_floatx2_storage_t) * size));
+    fp8x2_to_floatx2<<<1, size>>>(d_out, interpret, d_in, size);
+    std::vector<__amd_floatx2_storage_t> out(size);
+    HIP_CHECK(hipMemcpy(out.data(), d_in, sizeof(__amd_floatx2_storage_t) * out.size(),
+                        hipMemcpyDeviceToHost));
+    for (int i = 0; i < size; i++) {
+      auto in1 = in[i][0];
+      auto in2 = in[i][1];
+      auto out1 = out[i][0];
+      auto out2 = out[i][1];
+      INFO("Input: " << in1 << ", " << in2);
+      INFO("Output: " << out1 << ", " << out2);
+      REQUIRE(std::fabs(in1 - out1) <= 2.0f);
+      REQUIRE(std::fabs(in2 - out2) <= 2.0f);
+    }
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+  }
+
+  SECTION("E5M2x2") {
+    constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2;
+    std::vector<__amd_floatx2_storage_t> in;
+    for (int i = 0; i < size; i++) {
+      __amd_floatx2_storage_t tmp{i + 1.0f, i * 1.0f};
+      in.push_back(tmp);
+    }
+    __amd_floatx2_storage_t* d_in;
+    __amd_fp8x2_storage_t* d_out;
+    HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp8x2_storage_t) * size));
+    HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * in.size(),
+                        hipMemcpyHostToDevice));
+    floatx2_to_fp8x2<<<1, size>>>(d_in, d_out, interpret, size);
+    HIP_CHECK(hipMemset(d_in, 0, sizeof(__amd_floatx2_storage_t) * size));
+    fp8x2_to_floatx2<<<1, size>>>(d_out, interpret, d_in, size);
+    std::vector<__amd_floatx2_storage_t> out(size);
+    HIP_CHECK(hipMemcpy(out.data(), d_in, sizeof(__amd_floatx2_storage_t) * out.size(),
+                        hipMemcpyDeviceToHost));
+    for (int i = 0; i < size; i++) {
+      auto in1 = in[i][0];
+      auto in2 = in[i][1];
+      auto out1 = out[i][0];
+      auto out2 = out[i][1];
+      INFO("Input: " << in1 << ", " << in2);
+      INFO("Output: " << out1 << ", " << out2);
+      REQUIRE(std::fabs(in1 - out1) <= 2.0f);
+      REQUIRE(std::fabs(in2 - out2) <= 2.0f);
+    }
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+  }
+
+ __amd_floatx2_storage_t tmp{i + 1.0f, i * 1.0f}; + in.push_back(tmp); + } + __amd_floatx2_storage_t* d_in; + __amd_fp8x2_storage_t* d_out; + __amd_scale_t scale = 1; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp8x2_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * in.size(), + hipMemcpyHostToDevice)); + floatx2_to_fp8x2_scale<<<1, size>>>(d_in, d_out, interpret, size, scale); + HIP_CHECK(hipMemset(d_in, 0, sizeof(__amd_floatx2_storage_t) * size)); + fp8x2_to_floatx2_scale<<<1, size>>>(d_out, interpret, d_in, size, scale); + std::vector<__amd_floatx2_storage_t> out(size); + HIP_CHECK(hipMemcpy(out.data(), d_in, sizeof(__amd_floatx2_storage_t) * out.size(), + hipMemcpyDeviceToHost)); + for (int i = 0; i < size; i++) { + auto in1 = in[i][0]; + auto in2 = in[i][1]; + auto out1 = out[i][0]; + auto out2 = out[i][1]; + INFO("Input: " << in1 << ", " << in2); + INFO("Output: " << out1 << ", " << out2); + INFO("Scale: " << (int)scale); + REQUIRE(std::fabs(in1 - out1) <= 2.0f); + REQUIRE(std::fabs(in2 - out2) <= 2.0f); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("E5M2x2 scale") { + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_floatx2_storage_t> in; + for (int i = 0; i < size; i++) { + __amd_floatx2_storage_t tmp{i + 1.0f, i * 1.0f}; + in.push_back(tmp); + } + __amd_floatx2_storage_t* d_in; + __amd_fp8x2_storage_t* d_out; + __amd_scale_t scale = 1; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp8x2_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * in.size(), + hipMemcpyHostToDevice)); + floatx2_to_fp8x2_scale<<<1, size>>>(d_in, d_out, interpret, size, scale); + HIP_CHECK(hipMemset(d_in, 0, sizeof(__amd_floatx2_storage_t) * size)); + fp8x2_to_floatx2_scale<<<1, size>>>(d_out, interpret, d_in, size, scale); + std::vector<__amd_floatx2_storage_t> out(size); + HIP_CHECK(hipMemcpy(out.data(), d_in, sizeof(__amd_floatx2_storage_t) * out.size(), + hipMemcpyDeviceToHost)); + for (int i = 0; i < size; i++) { + auto in1 = in[i][0]; + auto in2 = in[i][1]; + auto out1 = out[i][0]; + auto out2 = out[i][1]; + INFO("Input: " << in1 << ", " << in2); + INFO("Output: " << out1 << ", " << out2); + INFO("Scale: " << (int)scale); + REQUIRE(std::fabs(in1 - out1) <= 2.0f); + REQUIRE(std::fabs(in2 - out2) <= 2.0f); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("E4M3 sr scale") { + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<float> in; + in.reserve(size); + for (int i = -15; i <= (size / 2); i++) { + in.push_back(i * 1.0f); + } + float* d_ptr; + __amd_fp8_storage_t* d_out; + HIP_CHECK(hipMalloc(&d_ptr, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp8_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_ptr, in.data(), sizeof(float) * in.size(), hipMemcpyHostToDevice)); + __amd_scale_t scale = 1; + float_to_fp8_sr_scale<<<1, size>>>(d_ptr, d_out, interpret, size, 0 /* seed */, scale); + // d_out is populated, cvt back and populate in d_ptr + HIP_CHECK(hipMemset(d_ptr, 0, sizeof(float) * size)); + fp8_to_float_scale<<<1, size>>>(d_out, interpret, d_ptr, size, scale); + std::vector<float> res(size, 0.0f); + HIP_CHECK(hipMemcpy(res.data(), d_ptr, sizeof(float) * res.size(), hipMemcpyDeviceToHost)); + for (int i = 0; i < size; i++) { + INFO("Result: " << res[i] << " input: 
" << in[i]); + REQUIRE(std::fabs(res[i] - in[i]) <= 2.0f); + } + HIP_CHECK(hipFree(d_ptr)); + HIP_CHECK(hipFree(d_out)); + } +} + +TEST_CASE("Unit_fp8_pack_unpack") { + float* d_a; + HIP_CHECK(hipMalloc(&d_a, sizeof(float) * 4)); + std::vector a(4, 0.0f); + a[0] = 1.0f; + a[1] = 2.0f; + a[2] = 3.0f; + a[3] = 4.0f; + HIP_CHECK(hipMemcpy(d_a, a.data(), sizeof(float) * 4, hipMemcpyHostToDevice)); + pack_and_unpack_fp8x4<<<1, 32>>>(d_a); + std::vector res(4, 0.0f); + HIP_CHECK(hipMemcpy(res.data(), d_a, sizeof(float) * 4, hipMemcpyDeviceToHost)); + for (size_t i = 0; i < a.size(); i++) { + REQUIRE(a[i] == res[i]); + } + HIP_CHECK(hipFree(d_a)); +} + +static __global__ void float_to_fp6(__amd_floatx32_storage_t* in, __amd_floatx32_storage_t* out, + __amd_fp6_interpretation_t interpret, unsigned int round = 0, + __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp6 = __amd_cvt_floatx32_to_fp6x32_sr_scale(*in, interpret, round, scale); + *out = __amd_cvt_fp6x32_to_floatx32_scale(fp6, interpret, scale); + } +} + +static __global__ void bf16_to_fp6(__amd_floatx32_storage_t* in, __amd_floatx32_storage_t* out, + __amd_fp6_interpretation_t interpret, __amd_scale_t scale = 0) { + if (threadIdx.x == 0) { + __amd_bf16x32_storage_t bf16_in, bf16_out; + for (size_t i = 0; i < 32; i++) { + bf16_in[i] = (*in)[i]; + } + auto fp6 = __amd_cvt_bf16x32_to_fp6x32_scale(bf16_in, interpret, scale); + bf16_out = __amd_cvt_fp6x32_to_bf16x32_scale(fp6, interpret, scale); + for (size_t i = 0; i < 32; i++) { + (*out)[i] = bf16_out[i]; + } + } +} + +static __global__ void fp16_to_fp6(__amd_floatx32_storage_t* in, __amd_floatx32_storage_t* out, + __amd_fp6_interpretation_t interpret, __amd_scale_t scale = 0) { + if (threadIdx.x == 0) { + __amd_fp16x32_storage_t fp16_in, fp16_out; + for (size_t i = 0; i < 32; i++) { + fp16_in[i] = (*in)[i]; + } + auto fp6 = __amd_cvt_fp16x32_to_fp6x32_scale(fp16_in, interpret, scale); + fp16_out = __amd_cvt_fp6x32_to_fp16x32_scale(fp6, interpret, scale); + for (size_t i = 0; i < 32; i++) { + (*out)[i] = fp16_out[i]; + } + } +} + +static __global__ void float_halves_to_fp6(__amd_floatx32_storage_t* in, + __amd_floatx32_storage_t* out, + __amd_fp6_interpretation_t interpret, + __amd_scale_t scale = 0) { + if (threadIdx.x == 0) { + __amd_floatx16_storage_t fpx16_1, fpx16_2; + for (size_t i = 0; i < 16; i++) { + fpx16_1[i] = (*in)[i * 2]; + fpx16_2[i] = (*in)[i * 2 + 1]; + } + auto fp6 = __amd_cvt_floatx16_floatx16_to_fp6x32_scale(fpx16_1, fpx16_2, interpret, scale); + *out = __amd_cvt_fp6x32_to_floatx32_scale(fp6, interpret, scale); + } +} + +static __global__ void floatx32_to_fp6(__amd_floatx32_storage_t* in, __amd_floatx32_storage_t* out, + __amd_fp6_interpretation_t interpret, + __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp6 = __amd_cvt_floatx32_to_fp6x32_scale(*in, interpret, scale); + *out = __amd_cvt_fp6x32_to_floatx32_scale(fp6, interpret, scale); + } +} + +static __global__ void bf16_to_fp6_sr(__amd_floatx32_storage_t* in, __amd_floatx32_storage_t* out, + __amd_fp6_interpretation_t interpret, unsigned int round, + __amd_scale_t scale = 0) { + if (threadIdx.x == 0) { + __amd_bf16x32_storage_t bf16_in, bf16_out; + for (size_t i = 0; i < 32; i++) { + bf16_in[i] = (*in)[i]; + } + auto fp6 = __amd_cvt_bf16x32_to_fp6x32_sr_scale(bf16_in, interpret, round, scale); + bf16_out = __amd_cvt_fp6x32_to_bf16x32_scale(fp6, interpret, scale); + for (size_t i = 0; i < 32; i++) { + (*out)[i] = bf16_out[i]; + } + } +} + +static __global__ void 
fp16_to_fp6_sr(__amd_floatx32_storage_t* in, __amd_floatx32_storage_t* out, + __amd_fp6_interpretation_t interpret, unsigned int round, + __amd_scale_t scale = 0) { + if (threadIdx.x == 0) { + __amd_fp16x32_storage_t fp16_in, fp16_out; + for (size_t i = 0; i < 32; i++) { + fp16_in[i] = (*in)[i]; + } + auto fp6 = __amd_cvt_fp16x32_to_fp6x32_sr_scale(fp16_in, interpret, round, scale); + fp16_out = __amd_cvt_fp6x32_to_fp16x32_scale(fp6, interpret, scale); + for (size_t i = 0; i < 32; i++) { + (*out)[i] = fp16_out[i]; + } + } +} + +TEST_CASE("Unit_amd_ocp_fp6") { + __amd_floatx32_storage_t fpx32, *d_in; + float iter = 0.0f; + for (size_t i = 0; i < 32; i++) { + iter++; + fpx32[i] = iter; + if (iter >= 8) { + iter = 0.0f; + } + } + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx32_storage_t))); + HIP_CHECK(hipMemcpy(d_in, &fpx32, sizeof(__amd_floatx32_storage_t), hipMemcpyHostToDevice)); + + SECTION("float to fp6 E2M3") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + float_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E2M3); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 1.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("bf16 to fp6 E2M3") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + bf16_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E2M3); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 1.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("fp16 to fp6 E2M3") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + fp16_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E2M3); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 1.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("float halves to fp6 E2M3") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + float_halves_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E2M3); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 1.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("floatx32 to fp6 E2M3") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + floatx32_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E2M3); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("float to fp6 E2M3 sr") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + __amd_scale_t scale = 1; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + 
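+ // The 4th argument of float_to_fp6 below (0) is forwarded as the `round` input of
+ // __amd_cvt_floatx32_to_fp6x32_sr_scale (presumably the stochastic-rounding seed), and
+ // `scale` biases the shared exponent on both the encode and decode sides, so the
+ // round trip is still expected to land within the fp6 tolerance checked below.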
float_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E2M3, 0, scale); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("bf16 to fp6 E2M3 sr") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + __amd_scale_t scale = 1; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + bf16_to_fp6_sr<<<1, 32>>>(d_in, d_out, __AMD_OCP_E2M3, 0, scale); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("fp16 to fp6 E2M3 sr") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + __amd_scale_t scale = 1; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + fp16_to_fp6_sr<<<1, 32>>>(d_in, d_out, __AMD_OCP_E2M3, 0, scale); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("float to fp6 E3M2") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + float_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E3M2); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("bf16 to fp6 E3M2") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + bf16_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E3M2); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("fp16 to fp6 E3M2") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + fp16_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E3M2); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("float halves to fp6 E3M2") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + float_halves_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E3M2); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("floatx32 to fp6 E3M2") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + floatx32_to_fp6<<<1, 32>>>(d_in, 
d_out, __AMD_OCP_E3M2); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("float to fp6 E3M2 sr") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + __amd_scale_t scale = 1; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + float_to_fp6<<<1, 32>>>(d_in, d_out, __AMD_OCP_E3M2, 0, scale); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("bf16 to fp6 E3M2 sr") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + __amd_scale_t scale = 1; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + bf16_to_fp6_sr<<<1, 32>>>(d_in, d_out, __AMD_OCP_E3M2, 0, scale); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); + } + HIP_CHECK(hipFree(d_out)); + } + + SECTION("fp16 to fp6 E3M2 sr") { + __amd_floatx32_storage_t* d_out; + __amd_floatx32_storage_t out; + __amd_scale_t scale = 1; + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + fp16_to_fp6_sr<<<1, 32>>>(d_in, d_out, __AMD_OCP_E3M2, 0, scale); + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < 32; i++) { + INFO("In: " << fpx32[i] << " out: " << out[i]); + CHECK(std::fabs(fpx32[i] - out[i]) <= 3.0f); // Max error range + } + HIP_CHECK(hipFree(d_out)); + } + + HIP_CHECK(hipFree(d_in)); +} + +static __global__ void float_to_fp4(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp4 = __amd_cvt_floatx2_to_fp4x2_scale(*in, __AMD_OCP_E2M1, scale); + *out = __amd_cvt_fp4x2_to_floatx2_scale(fp4, __AMD_OCP_E2M1, scale); + } +} + +static __global__ void fp16_to_fp4(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + __amd_fp16x2_storage_t fp16; + fp16[0] = (*in)[0]; + fp16[1] = (*in)[1]; + auto fp4 = __amd_cvt_fp16x2_to_fp4x2_scale(fp16, __AMD_OCP_E2M1, scale); + auto fp16_cvt = __amd_cvt_fp4x2_to_fp16x2_scale(fp4, __AMD_OCP_E2M1, scale); + (*out)[0] = fp16_cvt[0]; + (*out)[1] = fp16_cvt[1]; + } +} + +static __global__ void bf16_to_fp4(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + __amd_bf16x2_storage_t bf16; + bf16[0] = (*in)[0]; + bf16[1] = (*in)[1]; + auto fp4 = __amd_cvt_bf16x2_to_fp4x2_scale(bf16, __AMD_OCP_E2M1, scale); + auto bf16_cvt = __amd_cvt_fp4x2_to_bf16x2_scale(fp4, __AMD_OCP_E2M1, scale); + (*out)[0] = bf16_cvt[0]; + (*out)[1] = bf16_cvt[1]; + } +} + +static __global__ void float_to_fp4_sr(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + unsigned int round, const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp4 = __amd_cvt_floatx2_to_fp4x2_sr_scale(*in, __AMD_OCP_E2M1, round, scale); + *out = __amd_cvt_fp4x2_to_floatx2_scale(fp4, __AMD_OCP_E2M1, scale); + } +} + +static __global__ 
void fp16_to_fp4_sr(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + unsigned int round, const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + __amd_fp16x2_storage_t fp16; + fp16[0] = (*in)[0]; + fp16[1] = (*in)[1]; + auto fp4 = __amd_cvt_fp16x2_to_fp4x2_sr_scale(fp16, __AMD_OCP_E2M1, round, scale); + auto fp16_cvt = __amd_cvt_fp4x2_to_fp16x2_scale(fp4, __AMD_OCP_E2M1, scale); + (*out)[0] = fp16_cvt[0]; + (*out)[1] = fp16_cvt[1]; + } +} + +static __global__ void bf16_to_fp4_sr(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + unsigned int round, const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + __amd_bf16x2_storage_t bf16; + bf16[0] = (*in)[0]; + bf16[1] = (*in)[1]; + auto fp4 = __amd_cvt_bf16x2_to_fp4x2_sr_scale(bf16, __AMD_OCP_E2M1, round, scale); + auto bf16_cvt = __amd_cvt_fp4x2_to_bf16x2_scale(fp4, __AMD_OCP_E2M1, scale); + (*out)[0] = bf16_cvt[0]; + (*out)[1] = bf16_cvt[1]; + } +} + +TEST_CASE("Unit_amd_ocp_fp4") { + __amd_floatx2_storage_t fpx2{4.0f, 2.0f}, *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t))); + HIP_CHECK(hipMemcpy(d_in, &fpx2, sizeof(__amd_floatx2_storage_t), hipMemcpyHostToDevice)); + + SECTION("float to fp4") { + float_to_fp4<<<1, 32>>>(d_in, d_out); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + SECTION("float to fp4 scale") { + __amd_scale_t scale = 1; + float_to_fp4<<<1, 32>>>(d_in, d_out, scale); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + SECTION("fp16 to fp4") { + fp16_to_fp4<<<1, 32>>>(d_in, d_out); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + SECTION("fp16 to fp4 scale") { + __amd_scale_t scale = 1; + fp16_to_fp4<<<1, 32>>>(d_in, d_out, scale); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + SECTION("bf16 to fp4") { + bf16_to_fp4<<<1, 32>>>(d_in, d_out); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + SECTION("bf16 to fp4 scale") { + __amd_scale_t scale = 1; + bf16_to_fp4<<<1, 32>>>(d_in, d_out, scale); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + SECTION("float to fp4 sr scale") { + __amd_scale_t scale = 0; + unsigned int round = 
1; + float_to_fp4_sr<<<1, 32>>>(d_in, d_out, round, scale); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + SECTION("fp16 to fp4 sr scale") { + __amd_scale_t scale = 0; + unsigned int round = 1; + fp16_to_fp4_sr<<<1, 32>>>(d_in, d_out, round, scale); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + SECTION("bf16 to fp4 sr scale") { + __amd_scale_t scale = 0; + unsigned int round = 1; + bf16_to_fp4_sr<<<1, 32>>>(d_in, d_out, round, scale); + __amd_floatx2_storage_t out; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(__amd_floatx2_storage_t), hipMemcpyDeviceToHost)); + INFO("In: " << fpx2[0] << ", " << fpx2[1]); + INFO("Out: " << out[0] << ", " << out[1]); + CHECK(fpx2[0] == out[0]); + CHECK(fpx2[1] == out[1]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); +} + +static __global__ void fp16x8_to_fp4x8_sr_scale(__amd_fp16x8_storage_t* in, + __amd_fp16x8_storage_t* out, unsigned int round, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp4 = __amd_cvt_fp16x8_to_fp4x8_sr_scale(in[i], __AMD_OCP_E2M1, round, scale); + out[i] = __amd_cvt_fp4x8_to_fp16x8_scale(fp4, __AMD_OCP_E2M1, scale); + } +} + +static __global__ void fp16x8_to_fp4x8_scale(__amd_fp16x8_storage_t* in, + __amd_fp16x8_storage_t* out, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp4 = __amd_cvt_fp16x8_to_fp4x8_scale(in[i], __AMD_OCP_E2M1, scale); + out[i] = __amd_cvt_fp4x8_to_fp16x8_scale(fp4, __AMD_OCP_E2M1, scale); + } +} + + +static __global__ void bf16x8_to_fp4x8_sr_scale(__amd_bf16x8_storage_t* in, + __amd_bf16x8_storage_t* out, unsigned int round, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp4 = __amd_cvt_bf16x8_to_fp4x8_sr_scale(in[i], __AMD_OCP_E2M1, round, scale); + out[i] = __amd_cvt_fp4x8_to_bf16x8_scale(fp4, __AMD_OCP_E2M1, scale); + } +} + + +static __global__ void bf16x8_to_fp4x8_scale(__amd_bf16x8_storage_t* in, + __amd_bf16x8_storage_t* out, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp4 = __amd_cvt_bf16x8_to_fp4x8_scale(in[i], __AMD_OCP_E2M1, scale); + out[i] = __amd_cvt_fp4x8_to_bf16x8_scale(fp4, __AMD_OCP_E2M1, scale); + } +} + +#if __AVX512F__ +static __global__ void floatx8_to_fp4x8_sr_scale(__amd_floatx8_storage_t* in, + __amd_floatx8_storage_t* out, unsigned int round, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp4 = __amd_cvt_floatx8_to_fp4x8_sr_scale(in[i], __AMD_OCP_E2M1, round, scale); + out[i] = __amd_cvt_fp4x8_to_floatx8_scale(fp4, __AMD_OCP_E2M1, scale); + } +} + +static __global__ void floatx8_to_fp4x8_scale(__amd_floatx8_storage_t* in, + __amd_floatx8_storage_t* out, + const __amd_scale_t scale = 0) { + int i = threadIdx.x; + if (i == 0) { + auto fp4 = __amd_cvt_floatx8_to_fp4x8_scale(in[i], __AMD_OCP_E2M1, scale); + out[i] = __amd_cvt_fp4x8_to_floatx8_scale(fp4, __AMD_OCP_E2M1, scale); + } +} +#endif + +TEST_CASE("Unit_amd_ocp_fp4x8") { + __amd_fp4x8_storage_t* d_tmp; + __amd_floatx8_storage_t in; + + HIP_CHECK(hipMalloc(&d_tmp, 
sizeof(__amd_fp4x8_storage_t))); + + for (size_t i = 0; i < 8; i++) { + in[i] = int(i) - 4; + } + + SECTION("fp16x8 sr scale") { + __amd_fp16x8_storage_t tmp_in, *d_in, *d_out, tmp_out; + for (size_t i = 0; i < 8; i++) { + tmp_in[i] = in[i]; + } + + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x8_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x8_storage_t))); + + HIP_CHECK(hipMemcpy(d_in, &tmp_in, sizeof(__amd_fp16x8_storage_t), hipMemcpyHostToDevice)); + fp16x8_to_fp4x8_sr_scale<<<1, 32>>>(d_in, d_out, 0 /* round */, 0 /* scale */); + HIP_CHECK(hipMemcpy(&tmp_out, d_out, sizeof(__amd_fp16x8_storage_t), hipMemcpyDeviceToHost)); + + auto cpu_fp4 = + __amd_cvt_fp16x8_to_fp4x8_sr_scale(tmp_in, __AMD_OCP_E2M1, 0 /* round */, 0 /* scale */); + auto cpu_fp16 = __amd_cvt_fp4x8_to_fp16x8_scale(cpu_fp4, __AMD_OCP_E2M1, 0 /* scale */); + + for (size_t i = 0; i < 8; i++) { + INFO("index: " << i << " in: " << float(tmp_in[i]) << " cpu: " << float(cpu_fp16[i]) + << " gpu: " << float(tmp_out[i])); + REQUIRE(cpu_fp16[i] == tmp_out[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("fp16x8 scale") { + __amd_fp16x8_storage_t tmp_in, *d_in, *d_out, tmp_out; + for (size_t i = 0; i < 8; i++) { + tmp_in[i] = in[i]; + } + + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x8_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x8_storage_t))); + + HIP_CHECK(hipMemcpy(d_in, &tmp_in, sizeof(__amd_fp16x8_storage_t), hipMemcpyHostToDevice)); + fp16x8_to_fp4x8_scale<<<1, 32>>>(d_in, d_out, 0 /* scale */); + HIP_CHECK(hipMemcpy(&tmp_out, d_out, sizeof(__amd_fp16x8_storage_t), hipMemcpyDeviceToHost)); + + auto cpu_fp4 = __amd_cvt_fp16x8_to_fp4x8_scale(tmp_in, __AMD_OCP_E2M1, 0 /* scale */); + auto cpu_fp16 = __amd_cvt_fp4x8_to_fp16x8_scale(cpu_fp4, __AMD_OCP_E2M1, 0 /* scale */); + + for (size_t i = 0; i < 8; i++) { + INFO("index: " << i << " in: " << float(tmp_in[i]) << " cpu: " << float(cpu_fp16[i]) + << " gpu: " << float(tmp_out[i])); + REQUIRE(cpu_fp16[i] == tmp_out[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("bf16x8 sr scale") { + __amd_bf16x8_storage_t tmp_in, *d_in, *d_out, tmp_out; + for (size_t i = 0; i < 8; i++) { + tmp_in[i] = in[i]; + } + + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x8_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x8_storage_t))); + + HIP_CHECK(hipMemcpy(d_in, &tmp_in, sizeof(__amd_bf16x8_storage_t), hipMemcpyHostToDevice)); + bf16x8_to_fp4x8_sr_scale<<<1, 32>>>(d_in, d_out, 0 /* round */, 0 /* scale */); + HIP_CHECK(hipMemcpy(&tmp_out, d_out, sizeof(__amd_bf16x8_storage_t), hipMemcpyDeviceToHost)); + + auto cpu_fp4 = + __amd_cvt_bf16x8_to_fp4x8_sr_scale(tmp_in, __AMD_OCP_E2M1, 0 /* round */, 0 /* scale */); + auto cpu_bf16 = __amd_cvt_fp4x8_to_bf16x8_scale(cpu_fp4, __AMD_OCP_E2M1, 0 /* scale */); + + for (size_t i = 0; i < 8; i++) { + INFO("index: " << i << " in: " << float(tmp_in[i]) << " cpu: " << float(cpu_bf16[i]) + << " gpu: " << float(tmp_out[i])); + REQUIRE(cpu_bf16[i] == tmp_out[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("bf16x8 scale") { + __amd_bf16x8_storage_t tmp_in, *d_in, *d_out, tmp_out; + for (size_t i = 0; i < 8; i++) { + tmp_in[i] = in[i]; + } + + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x8_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x8_storage_t))); + + HIP_CHECK(hipMemcpy(d_in, &tmp_in, sizeof(__amd_bf16x8_storage_t), hipMemcpyHostToDevice)); + bf16x8_to_fp4x8_scale<<<1, 32>>>(d_in, d_out, 0 /* scale */); 
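+ // As in the fp16x8 sections above, the device result is compared below against a host
+ // replay of the identical intrinsic chain, which is why exact equality is REQUIREd
+ // rather than an error tolerance.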
+ HIP_CHECK(hipMemcpy(&tmp_out, d_out, sizeof(__amd_bf16x8_storage_t), hipMemcpyDeviceToHost)); + + auto cpu_fp4 = __amd_cvt_bf16x8_to_fp4x8_scale(tmp_in, __AMD_OCP_E2M1, 0 /* scale */); + auto cpu_bf16 = __amd_cvt_fp4x8_to_bf16x8_scale(cpu_fp4, __AMD_OCP_E2M1, 0 /* scale */); + + for (size_t i = 0; i < 8; i++) { + INFO("index: " << i << " in: " << float(tmp_in[i]) << " cpu: " << float(cpu_bf16[i]) + << " gpu: " << float(tmp_out[i])); + REQUIRE(cpu_bf16[i] == tmp_out[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + +#if __AVX512F__ + SECTION("floatx8 sr scale") { + __amd_floatx8_storage_t *d_in, *d_out, tmp_out; + + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx8_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx8_storage_t))); + + HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_floatx8_storage_t), hipMemcpyHostToDevice)); + floatx8_to_fp4x8_sr_scale<<<1, 32>>>(d_in, d_out, 0 /* round */, 0 /* scale */); + HIP_CHECK(hipMemcpy(&tmp_out, d_out, sizeof(__amd_floatx8_storage_t), hipMemcpyDeviceToHost)); + + auto cpu_fp4 = + __amd_cvt_floatx8_to_fp4x8_sr_scale(in, __AMD_OCP_E2M1, 0 /* round */, 0 /* scale */); + auto cpu_out = __amd_cvt_fp4x8_to_floatx8_scale(cpu_fp4, __AMD_OCP_E2M1, 0 /* scale */); + + for (size_t i = 0; i < 8; i++) { + INFO("index: " << i << " in: " << float(in[i]) << " cpu: " << float(cpu_out[i]) + << " gpu: " << float(tmp_out[i])); + REQUIRE(cpu_out[i] == tmp_out[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("floatx8 scale") { + __amd_floatx8_storage_t *d_in, *d_out, tmp_out; + + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx8_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx8_storage_t))); + + HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_floatx8_storage_t), hipMemcpyHostToDevice)); + floatx8_to_fp4x8_scale<<<1, 32>>>(d_in, d_out, 0 /* scale */); + HIP_CHECK(hipMemcpy(&tmp_out, d_out, sizeof(__amd_floatx8_storage_t), hipMemcpyDeviceToHost)); + + auto cpu_fp4 = __amd_cvt_floatx8_to_fp4x8_scale(in, __AMD_OCP_E2M1, 0 /* scale */); + auto cpu_out = __amd_cvt_fp4x8_to_floatx8_scale(cpu_fp4, __AMD_OCP_E2M1, 0 /* scale */); + + for (size_t i = 0; i < 8; i++) { + INFO("index: " << i << " in: " << float(in[i]) << " cpu: " << float(cpu_out[i]) + << " gpu: " << float(tmp_out[i])); + REQUIRE(cpu_out[i] == tmp_out[i]); + } + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } +#endif + + HIP_CHECK(hipFree(d_tmp)); +} + +TEST_CASE("Unit_amd_ocp_cpp_types") { + SECTION("fp8 to float e4m3") { + constexpr size_t size = 32; + float *d_res1, *d_res2, *d_res3, *d_res4, *d_res5; + HIP_CHECK(hipMalloc(&d_res1, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_res2, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_res3, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_res4, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_res5, sizeof(float) * size)); + float a = -15.0f; + __amd_scale_t scale = 0; + unsigned int seed = 10; + cxx_fp8_to_float_e4m3<<<1, size>>>(d_res1, d_res2, d_res3, d_res4, d_res5, a, scale, seed); + std::vector<float> res1(size, 0.0f), res2(size, 0.0f), res3(size, 0.0f), res4(size, 0.0f), + res5(size, 0.0f); + HIP_CHECK(hipMemcpy(res1.data(), d_res1, sizeof(float) * size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res2.data(), d_res2, sizeof(float) * size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res3.data(), d_res3, sizeof(float) * size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res4.data(), d_res4, sizeof(float) * size, hipMemcpyDeviceToHost)); + 
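+ // res1..res5 collect the outputs of cxx_fp8_to_float_e4m3 (defined earlier in this
+ // file), presumably one per constructor/rounding variant given the scale and seed
+ // arguments; each is only required to match the input within fp8 quantization error.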
HIP_CHECK(hipMemcpy(res5.data(), d_res5, sizeof(float) * size, hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + auto input_val = a + i; + INFO("Input: " << input_val); + INFO("Output: " << res1[i] << ", " << res2[i] << ", " << res3[i] << ", " << res4[i] << ", " + << res5[i]); + REQUIRE(std::fabs(res1[i] - input_val) <= 2.0f); + REQUIRE(std::fabs(res2[i] - input_val) <= 2.0f); + REQUIRE(std::fabs(res3[i] - input_val) <= 2.0f); + REQUIRE(std::fabs(res4[i] - input_val) <= 2.0f); + REQUIRE(std::fabs(res5[i] - input_val) <= 2.0f); + } + HIP_CHECK(hipFree(d_res1)); + HIP_CHECK(hipFree(d_res2)); + HIP_CHECK(hipFree(d_res3)); + HIP_CHECK(hipFree(d_res4)); + HIP_CHECK(hipFree(d_res5)); + } + + SECTION("fp8 to float e5m2") { + constexpr size_t size = 32; + float *d_res1, *d_res2, *d_res3, *d_res4, *d_res5; + HIP_CHECK(hipMalloc(&d_res1, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_res2, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_res3, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_res4, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_res5, sizeof(float) * size)); + float a = -15.0f; + __amd_scale_t scale = 0; + unsigned int seed = 10; + cxx_fp8_to_float_e5m2<<<1, size>>>(d_res1, d_res2, d_res3, d_res4, d_res5, a, scale, seed); + std::vector<float> res1(size, 0.0f), res2(size, 0.0f), res3(size, 0.0f), res4(size, 0.0f), + res5(size, 0.0f); + HIP_CHECK(hipMemcpy(res1.data(), d_res1, sizeof(float) * size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res2.data(), d_res2, sizeof(float) * size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res3.data(), d_res3, sizeof(float) * size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res4.data(), d_res4, sizeof(float) * size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res5.data(), d_res5, sizeof(float) * size, hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + auto input_val = a + i; + INFO("Input: " << input_val); + INFO("Output: " << res1[i] << ", " << res2[i] << ", " << res3[i] << ", " << res4[i] << ", " + << res5[i]); + REQUIRE(std::fabs(res1[i] - input_val) <= 2.0f); + REQUIRE(std::fabs(res2[i] - input_val) <= 2.0f); + REQUIRE(std::fabs(res3[i] - input_val) <= 2.0f); + REQUIRE(std::fabs(res4[i] - input_val) <= 2.0f); + REQUIRE(std::fabs(res5[i] - input_val) <= 2.0f); + } + HIP_CHECK(hipFree(d_res1)); + HIP_CHECK(hipFree(d_res2)); + HIP_CHECK(hipFree(d_res3)); + HIP_CHECK(hipFree(d_res4)); + HIP_CHECK(hipFree(d_res5)); + } + + SECTION("fp8x2 to floatx2 e4m3") { + constexpr size_t size = 32; + __amd_floatx2_storage_t *d_res1, *d_res2, *d_res3, *d_res4, *d_res5, *d_res6; + HIP_CHECK(hipMalloc(&d_res1, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res2, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res3, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res4, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res5, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res6, sizeof(__amd_floatx2_storage_t) * size)); + float a = -15.0f, b = -14.0f; + __amd_scale_t scale = 0; + cxx_fp8x2_to_floatx2_e4m3<<<1, size>>>(d_res1, d_res2, d_res3, d_res4, d_res5, d_res6, a, b, + scale); + std::vector<__amd_floatx2_storage_t> res1(size), res2(size), res3(size), res4(size), res5(size), + res6(size); + HIP_CHECK(hipMemcpy(res1.data(), d_res1, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res2.data(), d_res2, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + 
HIP_CHECK(hipMemcpy(res3.data(), d_res3, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res4.data(), d_res4, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res5.data(), d_res5, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res6.data(), d_res6, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + auto input_val1 = a + i; + auto input_val2 = b + i; + INFO("Input val: " << input_val1 << ", " << input_val2); + INFO("Output1: " << res1[i][0] << ", " << res1[i][1]); + INFO("Output2: " << res2[i][0] << ", " << res2[i][1]); + INFO("Output3: " << res3[i][0] << ", " << res3[i][1]); + INFO("Output4: " << res4[i][0] << ", " << res4[i][1]); + INFO("Output5: " << res5[i][0] << ", " << res5[i][1]); + INFO("Output6: " << res6[i][0] << ", " << res6[i][1]); + REQUIRE(std::fabs(res1[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res1[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res2[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res2[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res3[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res3[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res4[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res4[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res5[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res5[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res6[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res6[i][1] - input_val2) <= 2.0f); + } + HIP_CHECK(hipFree(d_res1)); + HIP_CHECK(hipFree(d_res2)); + HIP_CHECK(hipFree(d_res3)); + HIP_CHECK(hipFree(d_res4)); + HIP_CHECK(hipFree(d_res5)); + HIP_CHECK(hipFree(d_res6)); + } + + SECTION("fp8x2 to floatx2 e5m2") { + constexpr size_t size = 32; + __amd_floatx2_storage_t *d_res1, *d_res2, *d_res3, *d_res4, *d_res5, *d_res6; + HIP_CHECK(hipMalloc(&d_res1, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res2, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res3, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res4, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res5, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res6, sizeof(__amd_floatx2_storage_t) * size)); + float a = -15.0f, b = -14.0f; + __amd_scale_t scale = 0; + cxx_fp8x2_to_floatx2_e5m2<<<1, size>>>(d_res1, d_res2, d_res3, d_res4, d_res5, d_res6, a, b, + scale); + std::vector<__amd_floatx2_storage_t> res1(size), res2(size), res3(size), res4(size), res5(size), + res6(size); + HIP_CHECK(hipMemcpy(res1.data(), d_res1, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res2.data(), d_res2, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res3.data(), d_res3, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res4.data(), d_res4, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res5.data(), d_res5, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res6.data(), d_res6, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + auto input_val1 = a + i; + auto input_val2 = b + i; + INFO("Input val: " << input_val1 << ", " << input_val2); + INFO("Output1: " << res1[i][0] << ", " << res1[i][1]); + INFO("Output2: " << res2[i][0] << ", " << 
res2[i][1]); + INFO("Output3: " << res3[i][0] << ", " << res3[i][1]); + INFO("Output4: " << res4[i][0] << ", " << res4[i][1]); + INFO("Output5: " << res5[i][0] << ", " << res5[i][1]); + INFO("Output6: " << res6[i][0] << ", " << res6[i][1]); + REQUIRE(std::fabs(res1[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res1[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res2[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res2[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res3[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res3[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res4[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res4[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res5[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res5[i][1] - input_val2) <= 2.0f); + REQUIRE(std::fabs(res6[i][0] - input_val1) <= 2.0f); + REQUIRE(std::fabs(res6[i][1] - input_val2) <= 2.0f); + } + HIP_CHECK(hipFree(d_res1)); + HIP_CHECK(hipFree(d_res2)); + HIP_CHECK(hipFree(d_res3)); + HIP_CHECK(hipFree(d_res4)); + HIP_CHECK(hipFree(d_res5)); + HIP_CHECK(hipFree(d_res6)); + } + + SECTION("fp6 to float e2m3") { + __amd_floatx32_storage_t* d_res; + __amd_floatx32_storage_t res; + HIP_CHECK(hipMalloc(&d_res, sizeof(__amd_floatx32_storage_t))); + const __amd_scale_t scale = 0; + cxx_fp6x32_to_floatx32_e2m3<<<1, 32>>>(d_res, scale); + HIP_CHECK(hipMemcpy(&res, d_res, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (int i = 0; i < 32; i++) { + INFO("Index: " << i << " res: " << res[i]); + CHECK(res[i] == (i % 8)); + } + HIP_CHECK(hipFree(d_res)); + } + + SECTION("fp6 to float e3m2") { + __amd_floatx32_storage_t* d_res; + __amd_floatx32_storage_t res; + HIP_CHECK(hipMalloc(&d_res, sizeof(__amd_floatx32_storage_t))); + const __amd_scale_t scale = 1; + cxx_fp6x32_to_floatx32_e3m2<<<1, 32>>>(d_res, scale); + HIP_CHECK(hipMemcpy(&res, d_res, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + for (int i = 0; i < 32; i++) { + INFO("Index: " << i << " res: " << res[i]); + CHECK(res[i] == (i % 8)); + } + HIP_CHECK(hipFree(d_res)); + } + + SECTION("fp4 to float e2m1") { + constexpr size_t size = 32; + __amd_floatx2_storage_t *d_res1, *d_res2, *d_res3, *d_res4; + HIP_CHECK(hipMalloc(&d_res1, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res2, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res3, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_res4, sizeof(__amd_floatx2_storage_t) * size)); + float a = 1.0f, b = 2.0f; + __amd_scale_t scale = 0; + cxx_fp4x2_to_floatx2_e2m1<<<1, size>>>(d_res1, d_res2, d_res3, d_res4, a, b, scale); + std::vector<__amd_floatx2_storage_t> res1(size), res2(size), res3(size), res4(size); + HIP_CHECK(hipMemcpy(res1.data(), d_res1, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res2.data(), d_res2, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res3.data(), d_res3, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(res4.data(), d_res4, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + CHECK(res1[i][0] == a); + CHECK(res1[i][1] == b); + } + HIP_CHECK(hipFree(d_res1)); + HIP_CHECK(hipFree(d_res2)); + HIP_CHECK(hipFree(d_res3)); + HIP_CHECK(hipFree(d_res4)); + } +} + +TEST_CASE("Unit_amd_ocp_hip_to_compiler_types") { + SECTION("bf16") { + const float f_in = 1.5f; + __amd_bf16_storage_t in = f_in; + auto hip_bf = 
__amd_cvt_bf16_to_hipbf16(in); + float cvt_back = hip_bf; + REQUIRE(f_in == cvt_back); + auto bf_cvt = __amd_cvt_hipbf16_to_bf16(hip_bf); + float f_res = bf_cvt; + REQUIRE(f_res == f_in); + } + + SECTION("bf16x2") { + const float f_in1 = 1.5f, f_in2 = 2.5f; + __amd_bf16x2_storage_t in{static_cast<__bf16>(f_in1), static_cast<__bf16>(f_in2)}; + auto hip_bf = __amd_cvt_bf16x2_to_hipbf162(in); + float cvt_back1 = hip_bf.x; + float cvt_back2 = hip_bf.y; + REQUIRE(f_in1 == cvt_back1); + REQUIRE(f_in2 == cvt_back2); + auto bf162_cvt = __amd_cvt_hipbf162_to_bf16x2(hip_bf); + float f_out1 = bf162_cvt[0]; + float f_out2 = bf162_cvt[1]; + REQUIRE(f_in1 == f_out1); + REQUIRE(f_in2 == f_out2); + } + + SECTION("half") { + const float f_in = 1.5f; + __amd_fp16_storage_t in = f_in; + auto hip_half = __amd_cvt_fp16_to_half(in); + float cvt_back = hip_half; + REQUIRE(f_in == cvt_back); + auto fp16_cvt = __amd_cvt_half_to_fp16(hip_half); + float f_res = fp16_cvt; + REQUIRE(f_in == f_res); + } + + SECTION("halfx2") { + const float f_in1 = 1.5f, f_in2 = 2.5f; + __amd_fp16x2_storage_t in{static_cast<_Float16>(f_in1), static_cast<_Float16>(f_in2)}; + auto hip_half = __amd_cvt_fp16x2_to_half2(in); + float cvt_back1 = hip_half.x; + float cvt_back2 = hip_half.y; + REQUIRE(f_in1 == cvt_back1); + REQUIRE(f_in2 == cvt_back2); + auto fp16_cvt = __amd_cvt_half2_to_fp16x2(hip_half); + float f_res1 = fp16_cvt[0]; + float f_res2 = fp16_cvt[1]; + REQUIRE(f_in1 == f_res1); + REQUIRE(f_in2 == f_res2); + } +} + +__global__ void fp8_device_cvt(float* in, float* out, __amd_fp8_interpretation_t interpret, + size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_float_to_fp8_sr(in[i], interpret, 0 /*seed*/); + out[i] = __amd_cvt_fp8_to_float(tmp, interpret); + } +} + +__global__ void fp8_sr_scale_device_cvt(float* in, float* out, __amd_fp8_interpretation_t interpret, + size_t size, __amd_scale_t scale) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_float_to_fp8_sr_scale(in[i], interpret, 1 /*seed*/, scale); + out[i] = __amd_cvt_fp8_to_float_scale(tmp, interpret, scale); + } +} + +__global__ void fp8_fp16_sr_scale_device_cvt(__amd_fp16_storage_t* in, __amd_fp16_storage_t* out, + __amd_fp8_interpretation_t interpret, size_t size, + __amd_scale_t scale) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_fp16_to_fp8_sr_scale(in[i], interpret, 0 /*seed*/, scale); + out[i] = __amd_cvt_fp8_to_fp16_scale(tmp, interpret, scale); + } +} + +__global__ void fp8_fp16_sr_device_cvt(__amd_fp16_storage_t* in, __amd_fp16_storage_t* out, + __amd_fp8_interpretation_t interpret, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_fp16_to_fp8_sr(in[i], interpret, 0 /*seed*/); + out[i] = __amd_cvt_fp8_to_fp16(tmp, interpret); + } +} + +__global__ void fp8x2_device_cvt(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + __amd_fp8_interpretation_t interpret, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_floatx2_to_fp8x2(in[i], interpret); + out[i] = __amd_cvt_fp8x2_to_floatx2(tmp, interpret); + } +} + +__global__ void fp8x2_device_cvt_scale(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + __amd_fp8_interpretation_t interpret, __amd_scale_t scale, + size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_floatx2_to_fp8x2_scale(in[i], interpret, scale); + out[i] = __amd_cvt_fp8x2_to_floatx2_scale(tmp, interpret, scale); + } +} + +__global__ void 
fp8x2_fp16x2_device_cvt_scale(__amd_fp16x2_storage_t* in, + __amd_fp16x2_storage_t* out, + __amd_fp8_interpretation_t interpret, + __amd_scale_t scale, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_fp16x2_to_fp8x2_scale(in[i], interpret, scale); + out[i] = __amd_cvt_fp8x2_to_fp16x2_scale(tmp, interpret, scale); + } +} + +__global__ void fp8x2_fp16x2_device_cvt(__amd_fp16x2_storage_t* in, __amd_fp16x2_storage_t* out, + __amd_fp8_interpretation_t interpret, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_fp16x2_to_fp8x2(in[i], interpret); + out[i] = __amd_cvt_fp8x2_to_fp16x2(tmp, interpret); + } +} + +__global__ void fp8_bf16_device_cvt_sr_scale(__amd_bf16_storage_t* in, __amd_bf16_storage_t* out, + __amd_fp8_interpretation_t interpret, + unsigned int round, __amd_scale_t scale, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_bf16_to_fp8_sr_scale(in[i], interpret, round, scale); + out[i] = __amd_cvt_fp8_to_bf16_scale(tmp, interpret, scale); + } +} + +__global__ void fp8x2_bf16x2_device_cvt_scale(__amd_bf16x2_storage_t* in, + __amd_bf16x2_storage_t* out, + __amd_fp8_interpretation_t interpret, + __amd_scale_t scale, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_bf16x2_to_fp8x2_scale(in[i], interpret, scale); + out[i] = __amd_cvt_fp8x2_to_bf16x2_scale(tmp, interpret, scale); + } +} + +__global__ void fp8x8_bf16x8_device_cvt_scale(__amd_bf16x8_storage_t* in, + __amd_bf16x8_storage_t* out, + __amd_fp8_interpretation_t interpret, + __amd_scale_t scale, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_bf16x8_to_fp8x8_scale(in[i], interpret, scale); + out[i] = __amd_cvt_fp8x8_to_bf16x8_scale(tmp, interpret, scale); + } +} + +__global__ void fp8x8_floatx8_device_cvt_sr_scale(__amd_floatx8_storage_t* in, + __amd_floatx8_storage_t* out, + __amd_fp8_interpretation_t interpret, + unsigned int round, __amd_scale_t scale, + size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_floatx8_to_fp8x8_sr_scale(in[i], interpret, round, scale); + out[i] = __amd_cvt_fp8x8_to_floatx8_scale(tmp, interpret, scale); + } +} + +__global__ void fp8x8_floatx8_device_cvt_scale(__amd_floatx8_storage_t* in, + __amd_floatx8_storage_t* out, + __amd_fp8_interpretation_t interpret, + __amd_scale_t scale, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_floatx8_to_fp8x8_scale(in[i], interpret, scale); + out[i] = __amd_cvt_fp8x8_to_floatx8_scale(tmp, interpret, scale); + } +} + +__global__ void fp8x8_fp16x8_device_cvt_sr_scale(__amd_fp16x8_storage_t* in, + __amd_fp16x8_storage_t* out, + __amd_fp8_interpretation_t interpret, + unsigned int round, __amd_scale_t scale, + size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_fp16x8_to_fp8x8_sr_scale(in[i], interpret, round, scale); + out[i] = __amd_cvt_fp8x8_to_fp16x8_scale(tmp, interpret, scale); + } +} + +__global__ void fp8x8_bf16x8_device_cvt_sr_scale(__amd_bf16x8_storage_t* in, + __amd_bf16x8_storage_t* out, + __amd_fp8_interpretation_t interpret, + unsigned int round, __amd_scale_t scale, + size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_bf16x8_to_fp8x8_sr_scale(in[i], interpret, round, scale); + out[i] = __amd_cvt_fp8x8_to_bf16x8_scale(tmp, interpret, scale); + } +} + +__global__ void fp8x8_fp16x8_device_cvt_scale(__amd_fp16x8_storage_t* in, + __amd_fp16x8_storage_t* out, + __amd_fp8_interpretation_t interpret, + 
__amd_scale_t scale, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_fp16x8_to_fp8x8_scale(in[i], interpret, scale); + out[i] = __amd_cvt_fp8x8_to_fp16x8_scale(tmp, interpret, scale); + } +} + +__global__ void fp6x32_convert(__amd_floatx32_storage_t* in, __amd_floatx32_storage_t* out, + __amd_fp6_interpretation_t interpret, unsigned int seed, + unsigned int scale) { + int i = threadIdx.x; + if (i == 0) { + auto tmp = __amd_cvt_floatx32_to_fp6x32_sr_scale(*in, interpret, seed, scale); + *out = __amd_cvt_fp6x32_to_floatx32_scale(tmp, interpret, scale); + } +} + +__global__ void fp4x2_convert(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + __amd_scale_t scale, size_t size, + __amd_fp4x2_storage_t* tmp_out = nullptr) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_floatx2_to_fp4x2_scale(in[i], __AMD_OCP_E2M1, scale); + out[i] = __amd_cvt_fp4x2_to_floatx2_scale(tmp, __AMD_OCP_E2M1, scale); + if (tmp_out != nullptr) { + tmp_out[i] = tmp; + } + } +} + +__global__ void fp4x2_sr_scale_convert(__amd_floatx2_storage_t* in, __amd_floatx2_storage_t* out, + const unsigned int seed, __amd_scale_t scale, size_t size) { + int i = threadIdx.x; + if (i < size) { + auto tmp = __amd_cvt_floatx2_to_fp4x2_sr_scale(in[i], __AMD_OCP_E2M1, seed, scale); + out[i] = __amd_cvt_fp4x2_to_floatx2_scale(tmp, __AMD_OCP_E2M1, scale); + } +} + +TEST_CASE("Unit_ocp_host_fp8_device_compare") { + hipDeviceProp_t prop; + HIP_CHECK(hipGetDeviceProperties(&prop, 0)); + bool found = false; + + for (const auto& device : ocp_capeable_hw) { + if (std::string(prop.gcnArchName).find(device) != std::string::npos) { + found = true; + break; + } + } + + if (!found) { + std::string skip_string = "Need OCP HW to run this test: " + std::string(prop.name); + HipTest::HIP_SKIP_TEST(skip_string.c_str()); + return; + } + + SECTION("e4m3") { + constexpr size_t size = 447 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<float> in; + in.reserve(size); + for (int i = -447; i <= 447; i++) { + in.push_back(static_cast<float>(i)); + } + REQUIRE(in.size() == size); + float *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice)); + fp8_device_cvt<<<1, size>>>(d_in, d_out, interpret, size); + // CPU calc + std::vector<float> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_float_to_fp8_sr(in[i], interpret, 0 /*seed*/); + cpu_res[i] = __amd_cvt_fp8_to_float(tmp, interpret); + } + std::vector<float> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(float) * size, hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << in[i] << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e5m2") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<float> in; + in.reserve(size); + for (int i = -511; i <= 511; i++) { + in.push_back(static_cast<float>(i)); + } + REQUIRE(in.size() == size); + float *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice)); + fp8_device_cvt<<<1, size>>>(d_in, d_out, interpret, size); + // CPU calc + 
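+ // (host-side reference: the same __amd_cvt_* chain with the same seed the kernel uses,
+ // so any mismatch below isolates a host/device divergence in the conversion intrinsics)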
std::vector<float> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_float_to_fp8_sr(in[i], interpret, 0 /*seed*/); + cpu_res[i] = __amd_cvt_fp8_to_float(tmp, interpret); + } + std::vector<float> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(float) * size, hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << in[i] << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e4m3x2") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_floatx2_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_floatx2_storage_t tmp{static_cast<float>(i), static_cast<float>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_floatx2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * size, hipMemcpyHostToDevice)); + fp8x2_device_cvt<<<1, size>>>(d_in, d_out, interpret, size); + // CPU calc + std::vector<__amd_floatx2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_floatx2_to_fp8x2(in[i], interpret); + cpu_res[i] = __amd_cvt_fp8x2_to_floatx2(tmp, interpret); + } + std::vector<__amd_floatx2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << "\n\tin: a: " << in[i][0] << " b: " << in[i][1] + << "\n\tcpu: a: " << cpu_res[i][0] << " b: " << cpu_res[i][1] + << "\n\tgpu: a: " << gpu_res[i][0] << " b: " << gpu_res[i][1]); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e5m2x2") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_floatx2_storage_t> in; + in.reserve(size); + for (int i = -511, j = 511; i <= 511; i++, j--) { + __amd_floatx2_storage_t tmp{static_cast<float>(i), static_cast<float>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_floatx2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * size, hipMemcpyHostToDevice)); + fp8x2_device_cvt<<<1, size>>>(d_in, d_out, interpret, size); + // CPU calc + std::vector<__amd_floatx2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_floatx2_to_fp8x2(in[i], interpret); + cpu_res[i] = __amd_cvt_fp8x2_to_floatx2(tmp, interpret); + } + std::vector<__amd_floatx2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << "\n\tin: a: " << in[i][0] << " b: " << in[i][1] + << "\n\tcpu: a: " << cpu_res[i][0] << " b: " << cpu_res[i][1] + << "\n\tgpu: a: " << gpu_res[i][0] << " b: " << gpu_res[i][1]); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + 
HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("fp4x2 to float") { + constexpr __amd_fp4_interpretation_t interpret = __AMD_OCP_E2M1; + const std::vector<__amd_scale_t> scales = {0, 1, 2}; + std::vector<__amd_floatx2_storage_t> in_vals = { + __amd_floatx2_storage_t{-3.0f, 3.0f}, __amd_floatx2_storage_t{-2.0f, 2.0f}, + __amd_floatx2_storage_t{-1.0f, 1.0f}, __amd_floatx2_storage_t{-0.0f, 0.0f}, + __amd_floatx2_storage_t{1.0f, -1.0f}, __amd_floatx2_storage_t{2.0f, -2.0f}, + __amd_floatx2_storage_t{3.0f, -3.0f}}; + const size_t size = in_vals.size(); + for (const auto scale : scales) { + __amd_floatx2_storage_t *d_in, *d_out; + __amd_fp4x2_storage_t* d_tmp_out; + std::vector<__amd_fp4x2_storage_t> gpu_tmp_out(size); + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_tmp_out, sizeof(__amd_fp4x2_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in_vals.data(), sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyHostToDevice)); + fp4x2_convert<<<1, 32>>>(d_in, d_out, scale, size, d_tmp_out); + HIP_CHECK(hipMemcpy(gpu_tmp_out.data(), d_tmp_out, sizeof(__amd_fp4x2_storage_t) * size, + hipMemcpyDeviceToHost)); + std::vector<__amd_floatx2_storage_t> gpu_out(size); + HIP_CHECK(hipMemcpy(gpu_out.data(), d_out, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + auto gpu_res = gpu_out[i]; + auto cpu_fp4_temp = __amd_cvt_floatx2_to_fp4x2_scale(in_vals[i], interpret, scale); + auto cpu_res = __amd_cvt_fp4x2_to_floatx2_scale(cpu_fp4_temp, interpret, scale); + INFO("Index: " << i << " Scale: " << int(scale) << "\n Input l: " << in_vals[i][0] + << " r: " << in_vals[i][1] << "\n cpu l: " << cpu_res[0] + << " r: " << cpu_res[1] << "\n gpu l: " << gpu_res[0] + << " r: " << gpu_res[1] << "\n cpu_tmp: " << std::hex + << (unsigned)cpu_fp4_temp << " gpu_tmp: " << (unsigned)gpu_tmp_out[i]); + CHECK(cpu_res[0] == gpu_res[0]); + CHECK(cpu_res[1] == gpu_res[1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_tmp_out)); + } + } + + SECTION("e4m3x2_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_floatx2_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_floatx2_storage_t tmp{static_cast<float>(i), static_cast<float>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_floatx2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyHostToDevice)); + fp8x2_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_floatx2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_floatx2_to_fp8x2_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x2_to_floatx2_scale(tmp, interpret, scale); + } + std::vector<__amd_floatx2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Scale: " << int(scale) << " Index: " << i << "\n\tin: a: " << in[i][0] << " b: " + << in[i][1] << "\n\tcpu: 
a: " << cpu_res[i][0] << " b: " << cpu_res[i][1] + << "\n\tgpu: a: " << gpu_res[i][0] << " b: " << gpu_res[i][0]); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e4m3x2_fp16x2_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_fp16x2_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_fp16x2_storage_t tmp{static_cast<__amd_fp16_storage_t>(i), + static_cast<__amd_fp16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_fp16x2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x2_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16x2_storage_t) * size, hipMemcpyHostToDevice)); + fp8x2_fp16x2_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_fp16x2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16x2_to_fp8x2_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x2_to_fp16x2_scale(tmp, interpret, scale); + } + std::vector<__amd_fp16x2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16x2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Scale: " << int(scale) << " Index: " << i << "\n\tin: a: " << float(in[i][0]) + << " b: " << float(in[i][1]) << "\n\tcpu: a: " << float(cpu_res[i][0]) + << " b: " << float(cpu_res[i][1]) << "\n\tgpu: a: " << float(gpu_res[i][0]) + << " b: " << float(gpu_res[i][0])); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e4m3x2_fp16x2") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_fp16x2_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_fp16x2_storage_t tmp{static_cast<__amd_fp16_storage_t>(i), + static_cast<__amd_fp16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_fp16x2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x2_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16x2_storage_t) * size, hipMemcpyHostToDevice)); + fp8x2_fp16x2_device_cvt<<<1, size>>>(d_in, d_out, interpret, size); + // CPU calc + std::vector<__amd_fp16x2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16x2_to_fp8x2(in[i], interpret); + cpu_res[i] = __amd_cvt_fp8x2_to_fp16x2(tmp, interpret); + } + std::vector<__amd_fp16x2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16x2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << "\n\tin: a: " << float(in[i][0]) << " b: " << float(in[i][1]) + << "\n\tcpu: a: " << float(cpu_res[i][0]) << " b: " << float(cpu_res[i][1]) + << "\n\tgpu: a: " << float(gpu_res[i][0]) << " b: " << float(gpu_res[i][0])); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + 
REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e4m3x2_bf16x2_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_bf16x2_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_bf16x2_storage_t tmp{static_cast<__amd_bf16_storage_t>(i), + static_cast<__amd_bf16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_bf16x2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x2_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_bf16x2_storage_t) * size, hipMemcpyHostToDevice)); + fp8x2_bf16x2_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_bf16x2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_bf16x2_to_fp8x2_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x2_to_bf16x2_scale(tmp, interpret, scale); + } + std::vector<__amd_bf16x2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_bf16x2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Scale: " << scale << " Index: " << i << "\n\tin: a: " << float(in[i][0]) + << " b: " << float(in[i][1]) << "\n\tcpu: a: " << float(cpu_res[i][0]) + << " b: " << float(cpu_res[i][1]) << "\n\tgpu: a: " << float(gpu_res[i][0]) + << " b: " << float(gpu_res[i][1])); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e4m3_bf16_sr_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_bf16_storage_t> in; + in.reserve(size); + for (int i = -448; i <= 448; i++) { + in.push_back(static_cast<__amd_bf16_storage_t>(i)); + } + REQUIRE(in.size() == size); + __amd_bf16_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_bf16_storage_t) * size, hipMemcpyHostToDevice)); + fp8_bf16_device_cvt_sr_scale<<<1, size>>>(d_in, d_out, interpret, 0, scale, size); + // CPU calc + std::vector<__amd_bf16_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_bf16_to_fp8_sr_scale(in[i], interpret, 0, scale); + cpu_res[i] = __amd_cvt_fp8_to_bf16_scale(tmp, interpret, scale); + } + std::vector<__amd_bf16_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_bf16_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Scale: " << scale << " Index: " << i << "\n\tin: a: " << float(in[i]) + << "\n\tcpu: a: " << float(cpu_res[i]) + << "\n\tgpu: a: " << float(gpu_res[i])); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x2_scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for 
(const auto scale : scales) { + std::vector<__amd_floatx2_storage_t> in; + in.reserve(size); + for (int i = -511, j = 511; i <= 511; i++, j--) { + __amd_floatx2_storage_t tmp{static_cast<float>(i), static_cast<float>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_floatx2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyHostToDevice)); + fp8x2_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_floatx2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_floatx2_to_fp8x2_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x2_to_floatx2_scale(tmp, interpret, scale); + } + std::vector<__amd_floatx2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Scale: " << scale << " Index: " << i << "\n\tin: a: " << in[i][0] << " b: " + << in[i][1] << "\n\tcpu: a: " << cpu_res[i][0] << " b: " << cpu_res[i][1] + << "\n\tgpu: a: " << gpu_res[i][0] << " b: " << gpu_res[i][1]); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x2_fp16x2_scale") { + constexpr size_t size = 400 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_fp16x2_storage_t> in; + in.reserve(size); + for (int i = -400, j = 400; i <= 400; i++, j--) { + __amd_fp16x2_storage_t tmp{static_cast<__amd_fp16_storage_t>(i), + static_cast<__amd_fp16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_fp16x2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x2_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16x2_storage_t) * size, hipMemcpyHostToDevice)); + fp8x2_fp16x2_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_fp16x2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16x2_to_fp8x2_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x2_to_fp16x2_scale(tmp, interpret, scale); + } + std::vector<__amd_fp16x2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16x2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Scale: " << scale << " Index: " << i << "\n\tin: a: " << float(in[i][0]) + << " b: " << float(in[i][1]) << "\n\tcpu: a: " << float(cpu_res[i][0]) + << " b: " << float(cpu_res[i][1]) << "\n\tgpu: a: " << float(gpu_res[i][0]) + << " b: " << float(gpu_res[i][1])); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x2_bf16x2_scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_bf16x2_storage_t> in; + in.reserve(size); + for (int i = -511, j = 
511; i <= 511; i++, j--) { + __amd_bf16x2_storage_t tmp{static_cast<__amd_bf16_storage_t>(i), + static_cast<__amd_bf16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_bf16x2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x2_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_bf16x2_storage_t) * size, hipMemcpyHostToDevice)); + fp8x2_bf16x2_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_bf16x2_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_bf16x2_to_fp8x2_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x2_to_bf16x2_scale(tmp, interpret, scale); + } + std::vector<__amd_bf16x2_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_bf16x2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Scale: " << scale << " Index: " << i << "\n\tin: a: " << float(in[i][0]) + << " b: " << float(in[i][1]) << "\n\tcpu: a: " << float(cpu_res[i][0]) + << " b: " << float(cpu_res[i][1]) << "\n\tgpu: a: " << float(gpu_res[i][0]) + << " b: " << float(gpu_res[i][1])); + REQUIRE(cpu_res[i][0] == gpu_res[i][0]); + REQUIRE(cpu_res[i][1] == gpu_res[i][1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + +// To enable these tests we need to pass -mavx512f since we pass floatx8 as return types +#if __AVX512F__ + SECTION("e4m3x8_floatx8_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_floatx8_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_floatx8_storage_t tmp{static_cast<float>(i), static_cast<float>(j), + static_cast<float>(j), static_cast<float>(i), + static_cast<float>(i), static_cast<float>(i), + static_cast<float>(j), static_cast<float>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_floatx8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx8_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx8_storage_t) * size, + hipMemcpyHostToDevice)); + fp8x8_floatx8_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_floatx8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_floatx8_to_fp8x8_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_floatx8_scale(tmp, interpret, scale); + } + std::vector<__amd_floatx8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x8_floatx8_scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + 
std::vector<__amd_floatx8_storage_t> in; + in.reserve(size); + for (int i = -511, j = 511; i <= 511; i++, j--) { + __amd_floatx8_storage_t tmp{static_cast<float>(i), static_cast<float>(j), + static_cast<float>(j), static_cast<float>(i), + static_cast<float>(i), static_cast<float>(i), + static_cast<float>(j), static_cast<float>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_floatx8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx8_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx8_storage_t) * size, + hipMemcpyHostToDevice)); + fp8x8_floatx8_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_floatx8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_floatx8_to_fp8x8_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_floatx8_scale(tmp, interpret, scale); + } + std::vector<__amd_floatx8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e4m3x8_floatx8_sr_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_floatx8_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_floatx8_storage_t tmp{static_cast<float>(i), static_cast<float>(j), + static_cast<float>(j), static_cast<float>(i), + static_cast<float>(i), static_cast<float>(i), + static_cast<float>(j), static_cast<float>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_floatx8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx8_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx8_storage_t) * size, + hipMemcpyHostToDevice)); + fp8x8_floatx8_device_cvt_sr_scale<<<1, size>>>(d_in, d_out, interpret, 0 /* round */, scale, + size); + // CPU calc + std::vector<__amd_floatx8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_floatx8_to_fp8x8_sr_scale(in[i], interpret, 0, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_floatx8_scale(tmp, interpret, scale); + } + std::vector<__amd_floatx8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + CHECK(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x8_floatx8_sr_scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_floatx8_storage_t> in; + 
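// eight-lane inputs sweep -511..511; host and device apply the same SR conversion with the same fixed seed, so results must match exactly +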
in.reserve(size); + for (int i = -511, j = 511; i <= 511; i++, j--) { + __amd_floatx8_storage_t tmp{static_cast<float>(i), static_cast<float>(j), + static_cast<float>(j), static_cast<float>(i), + static_cast<float>(i), static_cast<float>(i), + static_cast<float>(j), static_cast<float>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_floatx8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx8_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(__amd_floatx8_storage_t) * size, + hipMemcpyHostToDevice)); + fp8x8_floatx8_device_cvt_sr_scale<<<1, size>>>(d_in, d_out, interpret, 0 /* round */, scale, + size); + // CPU calc + std::vector<__amd_floatx8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_floatx8_to_fp8x8_sr_scale(in[i], interpret, 0, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_floatx8_scale(tmp, interpret, scale); + } + std::vector<__amd_floatx8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_floatx8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + CHECK(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } +#endif + + SECTION("e4m3x8_bf16x8_scale") { + constexpr size_t size = 400 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_bf16x8_storage_t> in; + in.reserve(size); + for (int i = -400, j = 400; i <= 400; i++, j--) { + __amd_bf16x8_storage_t tmp{ + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_bf16x8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x8_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_bf16x8_storage_t) * size, hipMemcpyHostToDevice)); + fp8x8_bf16x8_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_bf16x8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_bf16x8_to_fp8x8_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_bf16x8_scale(tmp, interpret, scale); + } + std::vector<__amd_bf16x8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_bf16x8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x8_bf16x8_scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + 
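// the scale values {0, 1, 2} used below are presumably power-of-two scale exponents consumed by the *_scale converters +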
std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_bf16x8_storage_t> in; + in.reserve(size); + for (int i = -511, j = 511; i <= 511; i++, j--) { + __amd_bf16x8_storage_t tmp{ + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_bf16x8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x8_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_bf16x8_storage_t) * size, hipMemcpyHostToDevice)); + fp8x8_bf16x8_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_bf16x8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_bf16x8_to_fp8x8_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_bf16x8_scale(tmp, interpret, scale); + } + std::vector<__amd_bf16x8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_bf16x8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e4m3x8_fp16x8_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_fp16x8_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_fp16x8_storage_t tmp{ + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_fp16x8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x8_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16x8_storage_t) * size, hipMemcpyHostToDevice)); + fp8x8_fp16x8_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_fp16x8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16x8_to_fp8x8_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_fp16x8_scale(tmp, interpret, scale); + } + std::vector<__amd_fp16x8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16x8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + 
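// Catch2 only reports the INFO context above if the assertion below fails +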
REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x8_fp16x8_scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_fp16x8_storage_t> in; + in.reserve(size); + for (int i = -511, j = 511; i <= 511; i++, j--) { + __amd_fp16x8_storage_t tmp{ + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_fp16x8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x8_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16x8_storage_t) * size, hipMemcpyHostToDevice)); + fp8x8_fp16x8_device_cvt_scale<<<1, size>>>(d_in, d_out, interpret, scale, size); + // CPU calc + std::vector<__amd_fp16x8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16x8_to_fp8x8_scale(in[i], interpret, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_fp16x8_scale(tmp, interpret, scale); + } + std::vector<__amd_fp16x8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16x8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e4m3x8_fp16x8_sr_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_fp16x8_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_fp16x8_storage_t tmp{ + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_fp16x8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x8_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16x8_storage_t) * size, hipMemcpyHostToDevice)); + fp8x8_fp16x8_device_cvt_sr_scale<<<1, size>>>(d_in, d_out, interpret, 0, scale, size); + // CPU calc + std::vector<__amd_fp16x8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16x8_to_fp8x8_sr_scale(in[i], interpret, 0, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_fp16x8_scale(tmp, interpret, scale); + } + std::vector<__amd_fp16x8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16x8_storage_t) * size, + hipMemcpyDeviceToHost)); + 
for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x8_fp16x8_sr_scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_fp16x8_storage_t> in; + in.reserve(size); + for (int i = -511, j = 511; i <= 511; i++, j--) { + __amd_fp16x8_storage_t tmp{ + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j), + static_cast<__amd_fp16_storage_t>(i), static_cast<__amd_fp16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_fp16x8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16x8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16x8_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16x8_storage_t) * size, hipMemcpyHostToDevice)); + fp8x8_fp16x8_device_cvt_sr_scale<<<1, size>>>(d_in, d_out, interpret, 0, scale, size); + // CPU calc + std::vector<__amd_fp16x8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16x8_to_fp8x8_sr_scale(in[i], interpret, 0, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_fp16x8_scale(tmp, interpret, scale); + } + std::vector<__amd_fp16x8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16x8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e4m3x8_bf16x8_sr_scale") { + constexpr size_t size = 448 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_bf16x8_storage_t> in; + in.reserve(size); + for (int i = -448, j = 448; i <= 448; i++, j--) { + __amd_bf16x8_storage_t tmp{ + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_bf16x8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x8_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_bf16x8_storage_t) * size, hipMemcpyHostToDevice)); + fp8x8_bf16x8_device_cvt_sr_scale<<<1, size>>>(d_in, d_out, interpret, 0, scale, size); + // CPU calc + std::vector<__amd_bf16x8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = 
__amd_cvt_bf16x8_to_fp8x8_sr_scale(in[i], interpret, 0, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_bf16x8_scale(tmp, interpret, scale); + } + std::vector<__amd_bf16x8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_bf16x8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e5m2x8_bf16x8_sr_scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_scale_t> scales{0, 1, 2}; + for (const auto scale : scales) { + std::vector<__amd_bf16x8_storage_t> in; + in.reserve(size); + for (int i = -511, j = 511; i <= 511; i++, j--) { + __amd_bf16x8_storage_t tmp{ + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j), + static_cast<__amd_bf16_storage_t>(i), static_cast<__amd_bf16_storage_t>(j)}; + in.push_back(tmp); + } + REQUIRE(in.size() == size); + __amd_bf16x8_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_bf16x8_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_bf16x8_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_bf16x8_storage_t) * size, hipMemcpyHostToDevice)); + fp8x8_bf16x8_device_cvt_sr_scale<<<1, size>>>(d_in, d_out, interpret, 0, scale, size); + // CPU calc + std::vector<__amd_bf16x8_storage_t> cpu_res(size); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_bf16x8_to_fp8x8_sr_scale(in[i], interpret, 0, scale); + cpu_res[i] = __amd_cvt_fp8x8_to_bf16x8_scale(tmp, interpret, scale); + } + std::vector<__amd_bf16x8_storage_t> gpu_res(size); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_bf16x8_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < 8; j++) { + INFO("Scale: " << int(scale) << " Index: i: " << i << " subindex: " << j + << " In: " << float(in[i][j]) << " cpu res: " << float(cpu_res[i][j]) + << " gpu res: " << float(gpu_res[i][j])); + REQUIRE(cpu_res[i][j] == gpu_res[i][j]); + } + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } + + SECTION("e4m3-sr") { + constexpr size_t size = 449 * 2 + 1; + constexpr __amd_scale_t scale = 2; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<float> in; + in.reserve(size); + for (int i = -449; i <= 449; i++) { + in.push_back(static_cast<float>(i)); + } + REQUIRE(in.size() == size); + float *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice)); + fp8_sr_scale_device_cvt<<<1, size>>>(d_in, d_out, interpret, size, scale); + // CPU calc + std::vector<float> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_float_to_fp8_sr_scale(in[i], interpret, 1 /*seed*/, scale); + cpu_res[i] = __amd_cvt_fp8_to_float_scale(tmp, interpret, scale); + } + std::vector<float> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(float) * size, 
hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << in[i] << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e5m2-sr") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_scale_t scale = 2; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<float> in; + in.reserve(size); + for (int i = -511; i <= 511; i++) { + in.push_back(static_cast<float>(i)); + } + REQUIRE(in.size() == size); + float *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size)); + HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice)); + fp8_sr_scale_device_cvt<<<1, size>>>(d_in, d_out, interpret, size, scale); + // CPU calc + std::vector<float> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_float_to_fp8_sr_scale(in[i], interpret, 1 /*seed*/, scale); + cpu_res[i] = __amd_cvt_fp8_to_float_scale(tmp, interpret, scale); + } + std::vector<float> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(float) * size, hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << in[i] << " cpu: " << cpu_res[i] << " gpu: " << gpu_res[i]); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e4m3-fp16-sr-scale") { + constexpr size_t size = 449 * 2 + 1; + constexpr __amd_scale_t scale = 2; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_fp16_storage_t> in; + in.reserve(size); + for (int i = -449; i <= 449; i++) { + in.push_back(static_cast<__amd_fp16_storage_t>(i)); + } + REQUIRE(in.size() == size); + __amd_fp16_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16_storage_t) * size, hipMemcpyHostToDevice)); + fp8_fp16_sr_scale_device_cvt<<<1, size>>>(d_in, d_out, interpret, size, scale); + // CPU calc + std::vector<__amd_fp16_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16_to_fp8_sr_scale(in[i], interpret, 0 /*seed*/, scale); + cpu_res[i] = __amd_cvt_fp8_to_fp16_scale(tmp, interpret, scale); + } + std::vector<__amd_fp16_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << float(in[i]) << " cpu: " << float(cpu_res[i]) + << " gpu: " << float(gpu_res[i])); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e5m2-fp16-sr-scale") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_scale_t scale = 2; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_fp16_storage_t> in; + in.reserve(size); + for (int i = -511; i <= 511; i++) { + in.push_back(static_cast<__amd_fp16_storage_t>(i)); + } + REQUIRE(in.size() == size); + __amd_fp16_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16_storage_t) * size, hipMemcpyHostToDevice)); + 
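// launch one thread per element: size is 1023 here, which fits within a single block +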
fp8_fp16_sr_scale_device_cvt<<<1, size>>>(d_in, d_out, interpret, size, scale); + // CPU calc + std::vector<__amd_fp16_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16_to_fp8_sr_scale(in[i], interpret, 0 /*seed*/, scale); + cpu_res[i] = __amd_cvt_fp8_to_fp16_scale(tmp, interpret, scale); + } + std::vector<__amd_fp16_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << float(in[i]) << " cpu: " << float(cpu_res[i]) + << " gpu: " << float(gpu_res[i])); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e4m3-fp16-sr") { + constexpr size_t size = 449 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E4M3; + std::vector<__amd_fp16_storage_t> in; + in.reserve(size); + for (int i = -449; i <= 449; i++) { + in.push_back(static_cast<__amd_fp16_storage_t>(i)); + } + REQUIRE(in.size() == size); + __amd_fp16_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16_storage_t) * size, hipMemcpyHostToDevice)); + fp8_fp16_sr_device_cvt<<<1, size>>>(d_in, d_out, interpret, size); + // CPU calc + std::vector<__amd_fp16_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16_to_fp8_sr(in[i], interpret, 0 /*seed*/); + cpu_res[i] = __amd_cvt_fp8_to_fp16(tmp, interpret); + } + std::vector<__amd_fp16_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << float(in[i]) << " cpu: " << float(cpu_res[i]) + << " gpu: " << float(gpu_res[i])); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + + SECTION("e5m2-fp16-sr") { + constexpr size_t size = 511 * 2 + 1; + constexpr __amd_fp8_interpretation_t interpret = __AMD_OCP_E5M2; + std::vector<__amd_fp16_storage_t> in; + in.reserve(size); + for (int i = -511; i <= 511; i++) { + in.push_back(static_cast<__amd_fp16_storage_t>(i)); + } + REQUIRE(in.size() == size); + __amd_fp16_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_fp16_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_fp16_storage_t) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(__amd_fp16_storage_t) * size, hipMemcpyHostToDevice)); + fp8_fp16_sr_device_cvt<<<1, size>>>(d_in, d_out, interpret, size); + // CPU calc + std::vector<__amd_fp16_storage_t> cpu_res(size, 0.0f); + for (size_t i = 0; i < size; i++) { + auto tmp = __amd_cvt_fp16_to_fp8_sr(in[i], interpret, 0 /*seed*/); + cpu_res[i] = __amd_cvt_fp8_to_fp16(tmp, interpret); + } + std::vector<__amd_fp16_storage_t> gpu_res(size, 0.0f); + HIP_CHECK(hipMemcpy(gpu_res.data(), d_out, sizeof(__amd_fp16_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << float(in[i]) << " cpu: " << float(cpu_res[i]) + << " gpu: " << float(gpu_res[i])); + REQUIRE(cpu_res[i] == gpu_res[i]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + +#if __AVX512F__ + SECTION("fp6x32 cvt e2m3") { + constexpr __amd_fp6_interpretation_t interpret = __AMD_OCP_E2M3; + 
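// OCP fp6 E2M3 has a largest finite magnitude of 7.5, so the +/-7.5 sweep below covers its full range +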
constexpr unsigned int seed = 1; + __amd_scale_t scale = 0; + __amd_floatx32_storage_t in, *d_in, *d_out, gpu_out, cpu_out; + float counter = -7.5f; + for (size_t i = 0; i < 31; i++, counter += 0.5f) { + in[i] = counter; + } + in[31] = -0.0f; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx32_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_floatx32_storage_t), hipMemcpyHostToDevice)); + fp6x32_convert<<<1, 32>>>(d_in, d_out, interpret, seed, scale); + HIP_CHECK(hipMemcpy(&gpu_out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + auto tmp = __amd_cvt_floatx32_to_fp6x32_sr_scale(in, interpret, seed, scale); + cpu_out = __amd_cvt_fp6x32_to_floatx32_scale(tmp, interpret, scale); + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + for (size_t i = 0; i < 32; i++) { + INFO("Index: " << i << " In: " << in[i] << " cpu: " << cpu_out[i] << " gpu: " << gpu_out[i]); + CHECK(cpu_out[i] == gpu_out[i]); + } + } + + SECTION("fp6x32 cvt e3m2") { + constexpr __amd_fp6_interpretation_t interpret = __AMD_OCP_E3M2; + constexpr unsigned int seed = 1; + __amd_scale_t scale = 1; + __amd_floatx32_storage_t in, *d_in, *d_out, gpu_out, cpu_out; + float counter = -28.0f; + for (size_t i = 0; i < 32; i++, counter += 1.0f) { + in[i] = counter; + } + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx32_storage_t))); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx32_storage_t))); + HIP_CHECK(hipMemcpy(d_in, &in, sizeof(__amd_floatx32_storage_t), hipMemcpyHostToDevice)); + fp6x32_convert<<<1, 32>>>(d_in, d_out, interpret, seed, scale); + HIP_CHECK(hipMemcpy(&gpu_out, d_out, sizeof(__amd_floatx32_storage_t), hipMemcpyDeviceToHost)); + auto tmp = __amd_cvt_floatx32_to_fp6x32_sr_scale(in, interpret, seed, scale); + cpu_out = __amd_cvt_fp6x32_to_floatx32_scale(tmp, interpret, scale); + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + for (size_t i = 0; i < 32; i++) { + INFO("Index: " << i << " In: " << in[i] << " cpu: " << cpu_out[i] << " gpu: " << gpu_out[i]); + CHECK(cpu_out[i] == gpu_out[i]); + } + } +#endif + + SECTION("fp4x2 sr") { + constexpr __amd_fp4_interpretation_t interpret = __AMD_OCP_E2M1; + std::vector<__amd_scale_t> scales{0, 1, 2}; + std::vector<__amd_floatx2_storage_t> in_vals = { + __amd_floatx2_storage_t{-3.0f, 3.0f}, __amd_floatx2_storage_t{-2.5f, 2.5f}, + __amd_floatx2_storage_t{-2.0f, 2.0f}, __amd_floatx2_storage_t{-1.5f, 1.5f}, + __amd_floatx2_storage_t{-1.0f, 1.0f}, __amd_floatx2_storage_t{-0.5f, 0.5f}, + __amd_floatx2_storage_t{-0.0f, 0.0f}, __amd_floatx2_storage_t{0.5f, -0.5f}, + __amd_floatx2_storage_t{1.0f, -1.0f}, __amd_floatx2_storage_t{1.5f, -1.5f}, + __amd_floatx2_storage_t{2.0f, -2.0f}, __amd_floatx2_storage_t{2.5f, -2.5f}, + __amd_floatx2_storage_t{3.0f, -3.0f}}; + const size_t size = in_vals.size(); + for (const auto scale : scales) { + __amd_floatx2_storage_t *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMalloc(&d_out, sizeof(__amd_floatx2_storage_t) * size)); + HIP_CHECK(hipMemcpy(d_in, in_vals.data(), sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyHostToDevice)); + fp4x2_sr_scale_convert<<<1, 32>>>(d_in, d_out, 1 /*seed*/, scale, size); + std::vector<__amd_floatx2_storage_t> gpu_out(size); + HIP_CHECK(hipMemcpy(gpu_out.data(), d_out, sizeof(__amd_floatx2_storage_t) * size, + hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + auto gpu_res = gpu_out[i]; + auto cpu_fp4_temp = 
__amd_cvt_floatx2_to_fp4x2_sr_scale(in_vals[i], interpret, 1, scale); + auto cpu_res = __amd_cvt_fp4x2_to_floatx2_scale(cpu_fp4_temp, interpret, scale); + INFO("Scale: " << scale << " Input l: " << in_vals[i][0] << " r: " << in_vals[i][1] + << "\n cpu l: " << cpu_res[0] << " r: " << cpu_res[1] + << "\n gpu l: " << gpu_res[0] << " r: " << gpu_res[1]); + CHECK(cpu_res[0] == gpu_res[0]); + CHECK(cpu_res[1] == gpu_res[1]); + } + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + } + } +}
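+
+// A minimal sketch of the host-side round-trip pattern used throughout this test
+// case; the same __amd_cvt_* calls appear in the sections above:
+//   auto q  = __amd_cvt_float_to_fp8_sr_scale(x, __AMD_OCP_E4M3, seed, scale);
+//   float y = __amd_cvt_fp8_to_float_scale(q, __AMD_OCP_E4M3, scale);
+// Each kernel performs the identical conversion pair on device, and the
+// assertions require host and device results to match exactly.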