+ #include <ATen/core/Tensor.h>
+ #include <ATen/native/transformers/attention.h>
+ #include <ATen/native/transformers/sdp_utils_cpp.h>
+
+ #ifndef AT_PER_OPERATOR_HEADERS
+ #include <ATen/Functions.h>
+ #include <ATen/NativeFunctions.h>
+ #else
+ #include <ATen/ops/empty_like.h>
+ #include <ATen/ops/linear.h>
+ #include <ATen/ops/matmul.h>
+ #include <ATen/ops/scaled_dot_product_attention.h>
+ #include <ATen/ops/softmax.h>
+ #endif
+
+ #include <ATen/native/cutlass/Attention.h>
+ #include <ATen/native/cutlass/sycl/AttentionKernels.h>
+
+ #include <comm/SYCLContext.h>
+
+ #include <cmath>
+ #include <iostream>
+
+ namespace at {
+ namespace native {
+ namespace cutlass_sycl {
+
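+ // Backward pass for scaled dot-product attention: recomputes the attention
+ // probabilities with ATen ops and hands the raw pointers to the CUTLASS SYCL
+ // kernel, which produces the query, key, and value gradients.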
+ void sdpa_backward(
+     int batch_size,
+     int num_head_q,
+     int num_head_kv,
+     int seq_len_q,
+     int seq_len_kv,
+     int head_dim_qk,
+     int head_dim_v,
+     const Tensor& grad_out,
+     const Tensor& query,
+     const Tensor& key,
+     const Tensor& value,
+     const Tensor& out,
+     const Tensor& logsumexp,
+     std::optional<at::Tensor> attn_mask,
+     bool is_causal,
+     double scale,
+     Tensor& grad_query,
+     Tensor& grad_key,
+     Tensor& grad_value) {
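+   // out, logsumexp, attn_mask, and is_causal are part of the interface but
+   // are not used by this implementation.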
+
+   std::cout << " lfq: entering cutlass sdpa_backward" << std::endl;
+
+   // Recompute the attention probabilities P = softmax(Q K^T / sqrt(scale)).
+   auto ps = at::matmul(query, key.transpose(-2, -1));
+   ps = ps / std::sqrt(scale);
+   ps = at::softmax(ps, -1).to(query.dtype());
+   // Buffer for the gradient of the attention probabilities.
+   auto dps = at::empty_like(ps);
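+   // Launch the CUTLASS SYCL backward kernel; grad_query, grad_key,
+   // grad_value, and dps are written through the raw data pointers.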
+   cutlass_sdpa_backward(
+       batch_size, num_head_q, num_head_kv, seq_len_q, seq_len_kv,
+       head_dim_qk, head_dim_v,
+       grad_out.data_ptr(),
+       query.data_ptr(),
+       key.data_ptr(),
+       value.data_ptr(),
+       ps.data_ptr(),
+       nullptr,
+       grad_query.data_ptr(),
+       grad_key.data_ptr(),
+       grad_value.data_ptr(),
+       dps.data_ptr());
+ }
+ } // namespace cutlass_sycl
+ } // namespace native
+ } // namespace at