intel
diff --git a/‎.github/workflows/_linux_build.yml
+62-73 b/‎.github/workflows/_linux_build.yml
+62-73
diff --git a/‎.github/workflows/nightly_ondemand.yml
+1 b/‎.github/workflows/nightly_ondemand.yml
+1
diff --git a/‎.github/workflows/nightly_ondemand_rolling.yml
+1 b/‎.github/workflows/nightly_ondemand_rolling.yml
+1
diff --git a/‎cmake/BuildFlags.cmake
+1-4 b/‎cmake/BuildFlags.cmake
+1-4
diff --git a/‎cmake/Modules/FindSYCLToolkit.cmake
-7 b/‎cmake/Modules/FindSYCLToolkit.cmake
-7
diff --git a/‎src/ATen/native/transformers/Attention.cpp
-10 b/‎src/ATen/native/transformers/Attention.cpp
-10
diff --git a/‎src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
+9-2 b/‎src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
+9-2
diff --git a/‎src/ATen/native/xpu/sycl/AdaptiveAveragePooling3dKernels.cpp
+7 b/‎src/ATen/native/xpu/sycl/AdaptiveAveragePooling3dKernels.cpp
+7
@@ -33,10 +33,12 @@ on:
         type: string
         default: 'lts'
         description: Driver lts/rolling
+      update_lkg:  # opt-in: the LKG commit in the issue is only rewritten when this is 'true'
+        required: false
+        type: string
+        default: 'false'
+        description: 'Whether to update the LKG torch version in issue #1280'
     outputs:
-      whl_name:
-        description: The name of the wheel file
-        value: ${{ jobs.build.outputs.whl_name }}
       torch_commit_id:
         description: The commit id of the torch build
         value: ${{ jobs.build.outputs.TORCH_COMMIT_ID }}
@@ -46,7 +48,6 @@ permissions:
 
 jobs:
   build:
-    if: ${{ inputs.pytorch }} != 'nightly_wheel'
     runs-on: ${{ inputs.runner }}
     outputs:
       TORCH_COMMIT_ID: ${{ steps.build_version.outputs.TORCH_COMMIT_ID }}
@@ -65,24 +66,22 @@ jobs:
           which conda && conda clean -ay
           conda remove --all -y -n xpu_build || \
                 rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build
-          conda create -n xpu_build python=${{ inputs.python }} cmake ninja -y
+          conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y
           source activate xpu_build
           cd ../ && rm -rf pytorch
           pip install requests
           git clone https://github.com/pytorch/pytorch pytorch
-          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
-            cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
-            # apply PRs for stock pytorch
-            python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
-            git status && git show -s
-            git submodule sync && git submodule update --init --recursive
-            if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
-              echo "Don't replace torch-xpu-ops!"
-            else
-              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
-              # Workaround for torch-xpu-ops ci test
-              sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
-            fi
+          cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
+          # apply PRs for stock pytorch
+          python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
+          git status && git show -s
+          git submodule sync && git submodule update --init --recursive
+          if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
+            echo "Don't replace torch-xpu-ops!"
+          else
+            rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
+            # Workaround for torch-xpu-ops ci test
+            sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
           fi
       - name: Build Pytorch XPU
         run: |
@@ -100,74 +99,64 @@ jobs:
           else
             export _GLIBCXX_USE_CXX11_ABI=1
           fi
-          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
-            build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-            repo="${{ github.repository }}"
-            last_commit=$(gh --repo $repo issue view $commit_issue --json body -q .body | grep ${{ inputs.pytorch }} | cut -d'[' -f 2 | cut -d']' -f 1)
-            cd ../pytorch
-            current_commit=$(git rev-parse HEAD)
-            is_fork_pr=false
-            if [ -n "${{ github.event.pull_request }}" ] && [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then
-              is_fork_pr=true
-            fi
-            echo ">>>>>>>>>>>>Fork PR: ${is_fork_pr}, pytorch branch: ${{ inputs.pytorch }}, last commit: ${last_commit}, current commit: ${current_commit}"
+          build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          repo="${{ github.repository }}"
+          last_commit=$(gh --repo $repo issue view $commit_issue --json body -q .body | grep ${{ inputs.pytorch }} | cut -d'[' -f 2 | cut -d']' -f 1)
+          cd ../pytorch
+          current_commit=$(git rev-parse HEAD)
+          is_fork_pr=false
+          if [ -n "${{ github.event.pull_request }}" ] && [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then
+            is_fork_pr=true
+          fi
+          echo ">>>>>>>>>>>>Fork PR: ${is_fork_pr}, pytorch branch: ${{ inputs.pytorch }}, last commit: ${last_commit}, current commit: ${current_commit}"
 
-            export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
-            pip install -r requirements.txt
-            WERROR=1 python setup.py bdist_wheel 2>&1 | tee pytorch_${current_commit}_build.log
+          export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
+          pip install -r requirements.txt
+          WERROR=1 python setup.py bdist_wheel 2>&1 | tee pytorch_${current_commit}_build.log
 
-            if [[ "${is_fork_pr}" == "false" ]]; then
-              if [ -f dist/torch*.whl ] && [ "${last_commit}" != "${current_commit}" ] && [[ "${{ inputs.pytorch }}" == "main" || "${{ inputs.pytorch }}" == "release/"* ]]; then
-                echo "Wheel build successful, update last commit in the issue https://github.com/intel/torch-xpu-ops/issues/1280"
-                gh --repo $repo issue view $commit_issue --json body -q .body | sed "s;${last_commit};${current_commit};g" | sed '/^$/d' > new_body.txt
-                gh --repo $repo issue edit $commit_issue --body-file new_body.txt
-              fi
-              if [ ! -f dist/torch*.whl ]; then
-                echo "Wheel build failed, use last commit in the issue https://github.com/intel/torch-xpu-ops/issues/1280"
-                gh --repo $repo issue comment $commit_issue -b "Wheel build failed with commit [${current_commit}](https://github.com/pytorch/pytorch/tree/${current_commit}), refer ${build_url}. CC @intel/torch-xpu-ops-maintain @EikanWang @riverliuintel @fengyuan14 @xytintel @etaf @chuanqi129 @mengfei25"
-                python setup.py clean
-                git clean -df .
-                git reset --hard
-                git checkout $last_commit
-                # apply PRs for stock pytorch
-                python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
-                git status && git show -s
-                git submodule sync && git submodule update --init --recursive
-                if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
-                  echo "Don't replace torch-xpu-ops!"
-                else
-                  rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
-                  # Workaround for torch-xpu-ops ci test
-                  sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
-                fi
-                WERROR=1 python setup.py bdist_wheel
+          if [[ "${is_fork_pr}" == "false" ]]; then
+            if [ -f dist/torch*.whl ] && \
+                [ "${{ inputs.update_lkg }}" == "true" ] && \
+                [ "${last_commit}" != "${current_commit}" ] && \
+                [[ "${{ inputs.pytorch }}" == "main" || "${{ inputs.pytorch }}" == "release/"* ]]; then
+              echo "Wheel build successful, update last commit in the issue https://github.com/intel/torch-xpu-ops/issues/1280"
+              gh --repo $repo issue view $commit_issue --json body -q .body | sed "s;${last_commit};${current_commit};g" | sed '/^$/d' > new_body.txt
+              gh --repo $repo issue edit $commit_issue --body-file new_body.txt
+              gh --repo $repo issue comment $commit_issue -b "Update LKG torch, refer ${build_url}"
+            fi
+            if [ ! -f dist/torch*.whl ]; then
+              echo "Wheel build failed, use last commit in the issue https://github.com/intel/torch-xpu-ops/issues/1280"
+              gh --repo $repo issue comment $commit_issue -b "Wheel build failed with commit [${current_commit}](https://github.com/pytorch/pytorch/tree/${current_commit}), refer ${build_url}. CC @intel/torch-xpu-ops-maintain @EikanWang @riverliuintel @fengyuan14 @xytintel @etaf @chuanqi129 @mengfei25"
+              python setup.py clean
+              git clean -df .
+              git reset --hard
+              git checkout $last_commit
+              # apply PRs for stock pytorch
+              python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
+              git status && git show -s
+              git submodule sync && git submodule update --init --recursive
+              if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
+                echo "Don't replace torch-xpu-ops!"
+              else
+                rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
+                # Workaround for torch-xpu-ops ci test
+                sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
               fi
-            else
-              echo "Forked PR, don't update the issue"
+              WERROR=1 python setup.py bdist_wheel
             fi
-            pip install --force-reinstall dist/*.whl
-            cp dist/*.whl ${{ github.workspace }}/
-            cp pytorch_${current_commit}_build.log ${{ github.workspace }}/
           else
-            pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
-            TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
-            cd ../pytorch
-            git reset --hard && git checkout ${TORCH_COMMIT_ID}
-            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
-            rm -rf third_party/torch-xpu-ops
-            git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
-            cd third_party/torch-xpu-ops
-            git checkout ${TORCH_XPU_OPS_COMMIT}
-            cd ../..
+            echo "Forked PR, don't update the issue"
           fi
+          pip install --force-reinstall dist/*.whl
+          cp dist/*.whl ${{ github.workspace }}/
+          cp pytorch_${current_commit}_build.log ${{ github.workspace }}/
       - name: Torch Config
         run: |
           source activate xpu_build
           source .github/scripts/env.sh ${{ inputs.pytorch }}
           python -c "import torch; print(torch.__config__.show())"
           python -c "import torch; print(torch.__config__.parallel_info())"
           python -c "import torch; print(torch.__config__.torch.xpu.device_count())"
-
           cd ..
           python pytorch/torch/utils/collect_env.py
       - name: Identify Build version
 
@@ -79,6 +79,7 @@ jobs:
       abi: 1
       python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
       runner: pvc_e2e
+      update_lkg: 'true'
 
   Linux-Nightly-Ondemand-UT-Tests:
     if: ${{ github.event_name == 'schedule' || inputs.ut != '' }}
 
@@ -81,6 +81,7 @@ jobs:
       python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
       driver: rolling
       runner: pvc_rolling
+      update_lkg: 'true'
 
   Linux-Nightly-Ondemand-UT-Tests-Rolling:
     if: ${{ github.event_name == 'schedule' || inputs.ut != '' }}
 
@@ -83,9 +83,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -fno-approx-func)
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-absolute-value)
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -no-ftz)
-    # Equivalent to build option -fpreview-breaking-changes for SYCL compiler.
-    set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_PREVIEW_BREAKING_CHANGES)
-    set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI})
   endif()
   set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_LLVM_COMPILER_VERSION=${__INTEL_LLVM_COMPILER})
 
@@ -113,7 +110,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_DEVICE_LINK_FLAGS ${SYCL_DEVICE_LINK_FLAGS} --offload-compress)
 
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-poison-unsupported-fp64-kernels")
-  set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-intel-128-GRF-per-thread")
+  set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-intel-enable-auto-large-GRF-mode")
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-fp32-correctly-rounded-divide-sqrt")
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-intel-greater-than-4GB-buffer-required")
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")
 
@@ -171,13 +171,6 @@ set(SYCL_FLAGS "")
 set(SYCL_LINK_FLAGS "")
 list(APPEND SYCL_FLAGS "-fsycl")
 list(APPEND SYCL_LINK_FLAGS "-fsycl")
-if(LINUX)
-  string(REGEX MATCH "libsycl-preview.so" is_abi_neutral ${SYCL_LIBRARY})
-  if(is_abi_neutral)
-    list(APPEND SYCL_FLAGS "-fpreview-breaking-changes")
-    list(APPEND SYCL_LINK_FLAGS "-fpreview-breaking-changes")
-  endif()
-endif()
 
 set(SYCL_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}")
 
 
@@ -38,16 +38,6 @@ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_xpu(
   auto T = qkv.is_nested() ? native::NestedTensor_get_max_size(
                                  *native::get_nested_tensor_impl(qkv))[0]
                            : qkv.size(1);
-  if (qkv.is_nested()) {
-    // Don't mess with non-nested case for now since it's not set up to fiddle
-    // with mask size.
-
-    // Round T up to next multiple of 8 so as to be able to utilize Tensor
-    // cores. Otherwise, sometimes with padding, *no* row will have the maximum
-    // sequence length and so we'll have a non-divisible-by-8 dimension even if
-    // the model author chose a multiple of 8.
-    T = T + (8 - (T % 8)) % 8;
-  }
   auto _3D = qkv_bias.size(0);
   auto D = _3D / 3;
   TORCH_CHECK(D % num_head == 0);
 
@@ -5,10 +5,10 @@
 
 #include <comm/xpu_aten.h>
 
-#include <ATen/ops/mean.h>
-#include <ATen/ops/zeros_like.h>
 #include <ATen/ops/_adaptive_avg_pool2d_backward_native.h>
 #include <ATen/ops/_adaptive_avg_pool2d_native.h>
+#include <ATen/ops/mean.h>
+#include <ATen/ops/zeros_like.h>
 
 #include <ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h>
 
@@ -22,6 +22,13 @@ Tensor adaptive_avg_pool2d_backward_xpu(
 
   native::adaptive_pool_empty_output_check(
       grad_output, "adaptive_avg_pool2d_backward");
+  TORCH_CHECK(
+      input.dim() == grad_output.dim(),
+      __func__,
+      ": Expected dimensions ",
+      input.dim(),
+      " for `gradOutput_` but got dimensions ",
+      grad_output.dim());
 
   checkAllSameGPU(__func__, {grad_output_arg, input_arg});
 
 
@@ -563,6 +563,13 @@ void adaptive_avg_pool3d_backward_kernel(
   TensorArg input_arg{input, "input", 3};
 
   adaptive_pool_empty_output_check(gradOutput_, "adaptive_avg_pool3d_backward");
+  TORCH_CHECK(
+      input.dim() == gradOutput_.dim(),
+      __func__,
+      ": Expected dimensions ",
+      input.dim(),
+      " for `gradOutput_` but got dimensions ",
+      gradOutput_.dim());
 
   checkAllSameGPU(
       "adaptive_avg_pool3d_backward_xpu",