Skip to content
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/common/sycl_cute_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,22 @@ zero_fill(InTensor &X)
X(i) = T(0);
}

template <typename T>
void
random_fill(std::vector<T> &X) {

for (int i = 0; i < X.size(); i++)
X[i] = random_value<T>();
}

template <typename T>
void
zero_fill(std::vector<T> &X) {
for (int i = 0; i < X.size(); i++)
X[i] = T(0);
}


// Pack sub-byte types in a gmem tensor.
// On input, the backing array holds one sub-byte value per byte.
// On exit, the backing array contains packed values.
Expand Down
5 changes: 5 additions & 0 deletions examples/cute/tutorial/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ if (CUTLASS_ENABLE_SYCL)
tiled_copy_sycl.cpp
)

cutlass_example_add_executable(
cute_tutorial_tiled_transpose
transpose/main.cpp
)

cutlass_example_add_executable(
cute_tutorial_tiled_copy_if
tiled_copy_if_sycl.cpp
Expand Down
148 changes: 148 additions & 0 deletions examples/cute/tutorial/transpose/block_2d_transposed_copy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#pragma once
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <cute/tensor.hpp>
#include <cute/util/compat.hpp>
#include <sycl/ext/intel/experimental/grf_size_properties.hpp>
#include <sycl/sycl.hpp>

#include "cutlass/util/print_error.hpp"
#include "util.h"

template <class TensorS, class TensorD, class BlockShape, class BlockShapeTrans,
class ThreadLayout>
void block2DTransposedLoadKernel(TensorS const S, TensorD const DT,
BlockShape const block_shape,
BlockShapeTrans const block_shape_transposed,
ThreadLayout const thread_layout) {
using namespace cute;
using Element = typename TensorS::value_type;

/* get workgroup and local ids */
auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>();
auto wg_m = int(item.get_group(0));
auto wg_n = int(item.get_group(1));
auto local_id = int(item.get_local_id(0));

/* proxy coordinate tensor */
Tensor cS = make_identity_tensor(S.shape()); // (M,N)
Tensor cDT = make_identity_tensor(DT.shape()); // (N,M)

auto wg_coord = make_coord(wg_m, wg_n);
auto wg_coord_transposed = make_coord(wg_n, wg_m);

// Tensor data = ... // ( M, N) Tensor cta_data = local_tile(data,
// Shape<16, 16>{}, make_coord(blockIdx.x,blockIdx.y)); // (_32,_64)
Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N)
Tensor gDT = local_tile(cDT, block_shape_transposed,
wg_coord_transposed); // (BLK_N,BLK_M);

constexpr int CopyBits = sizeof_bits_v<Element>;
auto transposed_load_op = XE_LOAD_2D_TRANSPOSE<CopyBits, 8, 8>{};
auto store_op = XE_STORE_2D<CopyBits, 8, 8>{};

/* Slice TiledCopy operations to thread (work-item) level */
auto transpose_S = make_block_2d_copy(transposed_load_op, S);
auto thr_transpose_S = transpose_S.get_slice(local_id);

auto store_DT = make_block_2d_copy(store_op, DT);
auto thr_copy_DT = store_DT.get_slice(local_id);

/* Register fragments for transposed copy */
auto tSrS = thr_transpose_S.partition_sg_fragment_D(gS);
auto tDrD = thr_copy_DT.partition_sg_fragment_D(gDT);

/* Partition global tensor (proxies) for copies */
Tensor tSgS = thr_transpose_S.partition_S(gS);
Tensor tDgD = thr_copy_DT.partition_D(gDT);

// if ( cute::thread(0, 0)){
// print(tSgS);print("\n");
// print(tSrS);print("\n");
// print(tDgD);print("\n");
// }

copy(transpose_S, tSgS, tSrS);
// copy(tSrS, tDrD);
copy(store_DT, tSrS, tDgD);
}

class TransposeCuteName;
template <typename Element>
void block_2d_transposed_copy(TransposeParams<Element> params) {

using namespace cute;
//
// Make Tensors
//
auto tensor_shape = make_shape(params.M, params.N);
auto tensor_shape_trans = make_shape(params.N, params.M);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

// Make a transposed view of the output
// auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{});
// Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutDT);

sycl::queue Q;

//
// Tile tensors
//

using bM = Int<32>;
using bN = Int<8>;

auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)
auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)

sycl::range<2> local = {bM{}, 1};
sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}),
local[1] * ceil_div(shape<1>(tensor_S), bN{})};

auto threadLayout = make_layout(make_shape(bM{}, Int<1>{}), LayoutRight{});

namespace syclex = sycl::ext::oneapi::experimental;
namespace intelex = sycl::ext::intel::experimental;

syclex::properties kernel_props{syclex::sub_group_size<16>,
intelex::grf_size<256>};

auto event = Q.parallel_for<TransposeCuteName>(
sycl::nd_range<2>(global, local), kernel_props, [=](auto) {
block2DTransposedLoadKernel(tensor_S, tensor_DT, block_shape,
block_shape_trans, threadLayout);
});
};
157 changes: 157 additions & 0 deletions examples/cute/tutorial/transpose/copy_direct.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#pragma once

/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

// copy kernel adapted from
// https://github.com/NVIDIA/cutlass/blob/main/examples/cute/tutorial/tiled_copy.cu

#include <cute/util/compat.hpp>
#include <sycl/sycl.hpp>

#include <cute/tensor.hpp>

#include "cutlass/util/print_error.hpp"
#include "util.h"

#include <iomanip>

template <class TensorS, class TensorD, class ThreadLayout>
void copy_kernel(TensorS S, TensorD D, ThreadLayout) {
using namespace cute;

// Slice the tiled tensors
// This line slices the tiled tensor S to get the tile for the current work
// group. S is a 3D tensor with layout ((M, N), m', n') where:
// - (M, N) is the block/tile shape (first mode)
// - m' is the number of tiles in the M dimension (second mode)
// - n' is the number of tiles in the N dimension (third mode)
//
// The indexing S(make_coord(_, _), x, y) selects:
// - make_coord(_, _): Takes all elements from the first mode (M, N), i.e.,
// the entire tile
// - compat::work_group_id::x(): Selects the x-th tile along the m'
// dimension
// - compat::work_group_id::y(): Selects the y-th tile along the n'
// dimension
//
// Result: A 2D tensor of shape (BlockShape_M, BlockShape_N) corresponding to
// the tile assigned to the current work group.
Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(),
compat::work_group_id::y()); // (BlockShape_M, BlockShape_N)
Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(),
compat::work_group_id::y()); // (BlockShape_M, BlockShape_N)

// Construct a partitioning of the tile among threads with the given thread
// arrangement.

// Concept: Tensor ThrLayout ThrIndex
Tensor thr_tile_S = local_partition(
tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN)
Tensor thr_tile_D = local_partition(
tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN)

// Construct a register-backed Tensor with the same shape as each thread's
// partition Use make_tensor to try to match the layout of thr_tile_S
Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN)

// Copy from GMEM to RMEM and from RMEM to GMEM
copy(thr_tile_S, fragment);
copy(fragment, thr_tile_D);
}

template <typename Element>
void copy_direct(TransposeParams<Element> params) {
//
// Given a 2D shape, perform an efficient copy
//

using namespace cute;

//
// Make tensors
//
auto tensor_shape = make_shape(params.M, params.N);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

//
// Tile tensors
//

// Define a statically sized block (M, N).
// Note, by convention, capital letters are used to represent static modes.
auto block_shape = make_shape(Int<1>{}, Int<16384>{});

if ((size<0>(tensor_shape) % size<0>(block_shape)) ||
(size<1>(tensor_shape) % size<1>(block_shape))) {
std::cerr << "The tensor shape must be divisible by the block shape."
<< std::endl;
}
// Equivalent check to the above
if (not evenly_divides(tensor_shape, block_shape)) {
std::cerr << "Expected the block_shape to evenly divide the tensor shape."
<< std::endl;
}

// Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static
// tile shape, and modes (m', n') correspond to the number of tiles.
//
// These will be used to determine the CUDA kernel grid dimensions.
Tensor tiled_tensor_S =
tiled_divide(tensor_S, block_shape); // ((M, N), m', n')
Tensor tiled_tensor_D =
tiled_divide(tensor_D, block_shape); // ((M, N), m', n')

// Thread arrangement
Layout thr_layout =
make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});

//
// Determine grid and block dimensions
//

auto gridDim = compat::dim3(
size<1>(tiled_tensor_S),
size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
auto blockDim = compat::dim3(size(thr_layout));

//
// Launch the kernel
//
compat::launch<copy_kernel<decltype(tiled_tensor_S),
decltype(tiled_tensor_D), decltype(thr_layout)>>(
gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout);
}
Loading
Loading