diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_tf32.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_tf32.cpp new file mode 100644 index 0000000000..758e2e579a --- /dev/null +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_tf32.cpp @@ -0,0 +1,27 @@ +//==----------- element_wise_all_ops_tf32.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-xmx8 + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// XFAIL:* + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 8 + +#include "../element_wise_all_ops_tf32_impl.hpp" diff --git a/SYCL/Matrix/XMX8/joint_matrix_tf32.cpp b/SYCL/Matrix/XMX8/joint_matrix_tf32.cpp new file mode 100644 index 0000000000..91c31a1a94 --- /dev/null +++ b/SYCL/Matrix/XMX8/joint_matrix_tf32.cpp @@ -0,0 +1,27 @@ +//==---------------- joint_matrix_tf32.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-xmx8 + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// XFAIL:* + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 8 + +#include "../joint_matrix_tf32_impl.hpp" diff --git a/SYCL/Matrix/element_wise_all_ops_tf32.cpp b/SYCL/Matrix/element_wise_all_ops_tf32.cpp new file mode 100644 index 0000000000..2b8e390846 --- /dev/null +++ b/SYCL/Matrix/element_wise_all_ops_tf32.cpp @@ -0,0 +1,27 @@ +//==----------- element_wise_all_ops_tf32.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// XFAIL:* + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 16 + +#include "element_wise_all_ops_tf32_impl.hpp" diff --git a/SYCL/Matrix/element_wise_all_ops_tf32_impl.hpp b/SYCL/Matrix/element_wise_all_ops_tf32_impl.hpp new file mode 100644 index 0000000000..77a84c6533 --- /dev/null +++ b/SYCL/Matrix/element_wise_all_ops_tf32_impl.hpp @@ -0,0 +1,239 @@ + +#define TM 8 +#define TN SG_SZ +#define TK 16 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void assert_ops_ref(host_accessor C, + const float ref) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + auto diff = C[i][j] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} +template +void matrix_verify_add(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] + 2; + } + + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +template +void matrix_verify_sub(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] - round_to_tf32(2); + } + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +template +void matrix_verify_mul(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] * round_to_tf32(3.0); + } + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +template +void matrix_verify_div(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, round_to_tf32(4.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] / round_to_tf32(2.0); + } + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +template +void matrix_verify_logic(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + if (wi_slice_a[i]) { + if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 2.0 || + wi_slice_a[i] < 2.0 || wi_slice_a[i] <= 2.0) { + Ts val = (wi_slice_a[i] != 2.0) ? wi_slice_a[i] : 2.0; + val = val - static_cast(1); + val = val + static_cast(1); + if (wi_slice_a[i] == 2.0) { + val = val - static_cast(2); + val = val * static_cast(3); + val = val / static_cast(2); + + } else { + val = val + static_cast(2); + } + wi_slice_a[i] = val; + } + } + } + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +float A[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +int main() { + + big_matrix MD((float *)&D); + big_matrix MA((float *)&A); + + size_t NDRangeM = MATRIX_M / TM; + size_t NDRangeN = MATRIX_N / TN; + queue q; + nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}); + + matrix_verify_add(q, MA, r, 7.0); + matrix_verify_sub(q, MA, r, 3.0); + matrix_verify_mul(q, MA, r, 15.0); + matrix_verify_div(q, MA, r, 2.0); + matrix_verify_logic(q, MA, r, + 7.0); + + return 0; +} diff --git a/SYCL/Matrix/joint_matrix_tf32.cpp b/SYCL/Matrix/joint_matrix_tf32.cpp new file mode 100644 index 0000000000..57425b9334 --- /dev/null +++ b/SYCL/Matrix/joint_matrix_tf32.cpp @@ -0,0 +1,27 @@ +//==---------------- joint_matrix_tf32.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// XFAIL:* + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 16 + +#include "joint_matrix_tf32_impl.hpp" diff --git a/SYCL/Matrix/joint_matrix_tf32_impl.hpp b/SYCL/Matrix/joint_matrix_tf32_impl.hpp new file mode 100644 index 0000000000..0c6787ecd0 --- /dev/null +++ b/SYCL/Matrix/joint_matrix_tf32_impl.hpp @@ -0,0 +1,147 @@ +#define TM 8 +#define TN SG_SZ +#define TK 16 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + // The matrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + joint_matrix_fill(sg, sub_a, 42); + for (int k = 0; k < K; k += TK) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k, K); + joint_matrix_load( + sg, sub_b, + accB.get_pointer() + (k) * (N) + sg_starty / SG_SZ * TN, N); + // If no rounding to tf32 function is called, joint_matrix_mad + // function will work on truncated floats. + joint_matrix_apply(sg, sub_a, + [=](float x) { x = round_to_tf32(x); }); + auto wi_data_b = + sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b); + for (int i = 0; i < wi_data_b.length(); i++) { + wi_data_b[i] = round_to_tf32(wi_data_b[i]); + } + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + auto wi_slice_a = + sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + joint_matrix_apply(sg, sub_a, [=](float x) { x *= 2; }); + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +float A[MATRIX_M][MATRIX_K]; +float B[MATRIX_K][MATRIX_N]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(float *A_mem, float *B_mem, float *C_mem, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + float va = A_mem[m * K + k]; + float vb = B_mem[k * N + n]; + C_mem[m * N + n] += va * vb; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = 1.0f * (i + j); + } + } + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B[i][j] = 2.0f * i + 3.0f * j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1.0; + D[i][j] = 1.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((float *)&A); + big_matrix MB((float *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((float *)A, (float *)B, (float *)D, MATRIX_M, MATRIX_N, + MATRIX_K); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + std::cout << (res ? "passed" : "failed") << std::endl; + return !res; +}