diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml index f1c97bd9a9..36fc6c6bfc 100644 --- a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -16,8 +16,6 @@ outputs: pytest_extra_args: value: ${{ steps.runner.outputs.pytest_extra_args }} -permissions: read-all - runs: using: composite steps: @@ -59,10 +57,17 @@ runs: if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1} }' |wc -l)" cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')" - pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v cx="${cpus_per_xpu}" '{ + pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{ if (x > 0) { + split(z, xpu_list, ","); for (i=0;i ${log_dir}/op_regression_test_error.log |tee ${log_dir}/op_regression_test.log + echo -e "File Path: cd pytorch/third_party/torch-xpu-ops/test/regressions" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_regression.log + echo -e "Reproduce Command: pytest -sv failed_case" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_regression.log - name: op_regression_dev1 shell: timeout 300 bash -xe {0} if: ${{ inputs.ut_name == 'op_regression_dev1' || inputs.ut_name == 'basic' }} @@ -30,6 +35,8 @@ runs: timeout 180 pytest test_operation_on_device_1.py \ --junit-xml=${{ github.workspace }}/ut_log/op_regression_dev1.xml \ 2> ${log_dir}/op_regression_dev1_test_error.log |tee ${log_dir}/op_regression_dev1_test.log + echo -e "File Path: cd pytorch/third_party/torch-xpu-ops/test/regressions" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_regression_dev1.log + echo -e "Reproduce Command: pytest -sv failed_case" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_regression_dev1.log - name: op_transformers shell: timeout 3600 bash -xe {0} if: ${{ inputs.ut_name == 'op_transformers' || inputs.ut_name == 'basic' }} @@ -41,6 +48,8 @@ runs: pytest test/test_transformers.py -k xpu \ --junit-xml=${{ github.workspace }}/ut_log/op_transformers.xml \ 2> ${log_dir}/op_transformers_test_error.log |tee ${log_dir}/op_transformers_test.log + echo -e "File Path: cd pytorch" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_transformers.log + echo -e "Reproduce Command: pytest -sv test/failed_case -k xpu" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_transformers.log - name: op_extended shell: timeout 3600 bash -xe {0} if: ${{ inputs.ut_name == 'op_extended' || inputs.ut_name == 'basic' }} @@ -53,6 +62,8 @@ runs: 2> ${log_dir}/op_extended_test_error.log |tee ${log_dir}/op_extended_test.log ls -al cp *.xml ${{ github.workspace }}/ut_log + echo -e "File Path: cd pytorch/third_party/torch-xpu-ops/test/xpu/extended" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_extended.log + echo -e "Reproduce Command: pytest -sv failed_case" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_extended.log - name: op_ut shell: timeout 18000 bash -xe {0} if: ${{ inputs.ut_name == 'op_ut' }} @@ -89,6 +100,8 @@ runs: tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_only_test.log ls -al cp *.xml ${{ github.workspace }}/ut_log + echo -e "File Path: cd pytorch/third_party/torch-xpu-ops/test/xpu" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_ut.log + echo -e "Reproduce Command: pytest -sv failed_case" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_ut.log - name: torch_xpu shell: timeout 3600 bash -xe {0} if: ${{ inputs.ut_name == 'torch_xpu' }} @@ -129,7 +142,6 @@ runs: python test/profiling/llama.py | 
\ tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv - bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv # All xpu ut under test/profiler cd ../../test/profiler @@ -147,10 +159,6 @@ runs: if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m - sudo rm -rf ${{ github.workspace }}/ptrace_scope.bk - sudo cp /proc/sys/kernel/yama/ptrace_scope ${{ github.workspace }}/ptrace_scope.bk - cat ${{ github.workspace }}/ptrace_scope.bk - echo "0" |sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/xpu_distributed cd pytorch/third_party/torch-xpu-ops/test/xpu XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") @@ -161,3 +169,39 @@ runs: python run_distributed.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + + # Summary + - name: UT Test Results Summary + shell: timeout 180 bash -xe {0} + run: | + pip install junitparser + python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + # Check the failure logs + if ls ${{ github.workspace }}/failures*.log 1> /dev/null 2>&1; then + echo -e "Exist Failure logs" + echo "Found Failure logs as below: " + for file in ${{ github.workspace }}/failures*.log; do + echo " - $file" + cp "$file" ${{ github.workspace }}/ut_log + done + echo -e "Failure logs Copied" + else + echo -e "No Failure logs" + fi + # Copied the passed logs + if ls passed*.log 1> /dev/null 2>&1; then + cp passed*.log ${{ github.workspace }}/ut_log + echo -e "Passed logs Copied" + else + echo -e "No Passed logs" + fi + # Copied the Summary logs + if ls category*.log 1> /dev/null 2>&1; then + cp category*.log ${{ github.workspace }}/ut_log + echo -e "Category logs Copied" + else + echo -e "No Category logs" + fi + if [ -e ut_failure_list.csv ];then + cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true + fi diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index bb0ec2a55e..8704ed0fc6 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -20,17 +20,20 @@ inputs: runs: using: composite steps: + - name: Check Python + shell: bash -xe {0} + run: | + which python && python -V + which pip && pip list - name: Prepare dataset shell: bash -xe {0} run: | # dataset - if [ ! -d ${HOME}/datasets/imagenet ];then - rm -rf ${HOME}/datasets/imagenet - mkdir -p ${HOME}/datasets/imagenet - cd ${HOME}/datasets/imagenet - wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar - tar -xf ILSVRC2012_img_val.tar + if [ ! 
-d ${dataset_dir} ];then + rm -rf ${dataset_dir} && mkdir -p ${dataset_dir} && cd ${dataset_dir} wget -O valprep.sh https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh + wget -q https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar + tar -xf ILSVRC2012_img_val.tar bash valprep.sh fi - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) @@ -41,7 +44,7 @@ runs: echo "Mode,Model,Dtype,Result" |tee ${pt2e_logs_dir}/summary.csv if [[ "${{ inputs.scenario }}" == *"accuracy"* ]];then models="alexnet,mnasnet1_0,mobilenet_v2,mobilenet_v3_large,resnet152,resnet18,resnet50,resnext50_32x4d,shufflenet_v2_x1_0,squeezenet1_1,vgg16" - cmd_line=" python pt2e-accuracy/scripts/modelbench/quant/inductor_quant_acc.py --device xpu --dataset_dir ${HOME}/datasets/imagenet " + cmd_line=" python pt2e-accuracy/scripts/modelbench/quant/inductor_quant_acc.py --device xpu --dataset_dir ${dataset_dir} " for model_name in $(echo $models |sed 's/,/ /g') do if [[ "${{ inputs.dt }}" == *"float32"* ]];then diff --git a/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv b/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv index 4a60aecac6..0359c4024e 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv @@ -52,7 +52,7 @@ sebotnet33ts_256,pass,pass,pass,pass,pass selecsls42b,pass,pass,pass,pass,pass spnasnet_100,pass,pass,pass,pass,pass # https://github.com/intel/torch-xpu-ops/issues/1768 -swin_base_patch4_window7_224,pass,pass,pass,pass,pass +swin_base_patch4_window7_224,pass,fail_accuracy,fail_accuracy,pass,pass swsl_resnext101_32x16d,pass,pass,pass,pass,pass tf_efficientnet_b0,pass,pass,pass,pass,pass tf_mixnet_l,pass,pass,pass,pass,pass diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index c9afb73eb8..c21f9ceda1 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -3,6 +3,7 @@ import os import re from junitparser import JUnitXml, Error, Failure, Skipped +from collections import defaultdict parser = argparse.ArgumentParser(description='Test results analyzer') parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files') @@ -10,6 +11,16 @@ failures = [] summaries = [] +failures_by_category = defaultdict(list) +passed_cases = [] +passed_by_category = defaultdict(list) +category_totals = defaultdict(lambda: { + 'Test cases': 0, + 'Passed': 0, + 'Skipped': 0, + 'Failures': 0, + 'Errors': 0 +}) error_types = [ "RuntimeError", @@ -38,6 +49,14 @@ def get_name(case): return case.get('name', '') return ' '.join(case.name.split()) +def get_category_from_case(case): + if isinstance(case, dict): + return case.get('category', 'unknown') + else: + if hasattr(case, '_file_category'): + return case._file_category + return 'unknown' + def get_result(case): if isinstance(case, dict): return case.get('status', 'failed') @@ -108,6 +127,7 @@ def print_failures(failure_list=None): print_header = True for case in failures: print_md_row({ + 'Category': get_category_from_case(case), 'Class name': get_classname(case), 'Test name': get_name(case), 'Status': get_result(case), @@ -116,13 +136,33 @@ def print_failures(failure_list=None): }, print_header, failure_list=failure_list) print_header = False +def generate_failures_log(): + if not failures: + return + + for case in failures: + category = get_category_from_case(case) + failures_by_category[category].append(case) + + for category, 
category_failures in failures_by_category.items(): + if not category_failures: + continue + + log_filename = f"failures_{category}.log" + with open(log_filename, "w", encoding='utf-8') as log_file: + for case in category_failures: + class_name = get_classname(case) + test_name = get_name(case) + log_file.write(f"{category},{class_name},{test_name}\n") + def parse_log_file(log_file): with open(log_file, encoding='utf-8') as f: content = f.read() ut_name = os.path.splitext(os.path.basename(log_file))[0] + category = determine_category(ut_name) summary = { - 'Category': determine_category(ut_name), + 'Category': category, 'UT': ut_name, 'Test cases': 0, 'Passed': 0, @@ -170,19 +210,29 @@ def parse_log_file(log_file): for match in error_matches: error_msg.append(match.group(0).strip()) - failures.append({ + failure_case = { 'classname': ut_name, 'name': f"{case_match.group(2)}:{test_name}", 'error': " ".join(error_msg), 'status': 'failed', - 'source': 'Log' - }) + 'source': 'Log', + 'category': category + } + failures.append(failure_case) + failures_by_category[category].append(failure_case) failures_number += 1 if failures_number > summary['Failures']: summary['Failures'] = failures_number summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped'] + # Update category totals + category_totals[category]['Test cases'] += summary['Test cases'] + category_totals[category]['Passed'] += summary['Passed'] + category_totals[category]['Skipped'] += summary['Skipped'] + category_totals[category]['Failures'] += summary['Failures'] + category_totals[category]['Errors'] += summary['Errors'] + return summary def determine_category(ut): @@ -192,6 +242,10 @@ def determine_category(ut): return 'op_regression_dev1' elif ut == 'op_extended': return 'op_extended' + elif ut == 'op_transformers': + return 'op_transformers' + elif ut == 'test_xpu': + return 'test_xpu' elif 'op_ut' in ut: return 'op_ut' else: @@ -223,12 +277,55 @@ def process_xml_file(xml_file): } summaries.append(suite_summary) + # Update category totals + category_totals[category]['Test cases'] += suite_summary['Test cases'] + category_totals[category]['Passed'] += suite_summary['Passed'] + category_totals[category]['Skipped'] += suite_summary['Skipped'] + category_totals[category]['Failures'] += suite_summary['Failures'] + category_totals[category]['Errors'] += suite_summary['Errors'] + for case in suite: if get_result(case) not in ["passed", "skipped"]: + case._file_category = category failures.append(case) + elif get_result(case) == "passed": + case._file_category = category + passed_cases.append(case) + passed_by_category[category].append(case) except Exception as e: print(f"Error processing {xml_file}: {e}", file=sys.stderr) +def generate_passed_log(): + if not passed_cases: + return + + for category, category_passed in passed_by_category.items(): + if not category_passed: + continue + + log_filename = f"passed_{category}.log" + with open(log_filename, "w", encoding='utf-8') as log_file: + for case in category_passed: + class_name = get_classname(case) + test_name = get_name(case) + status = get_result(case) + log_file.write(f"{category},{class_name},{test_name}\n") + +def generate_category_totals_log(): + """Generate log files with category totals""" + for category, totals in category_totals.items(): + if totals['Test cases'] == 0: + continue + + log_filename = f"category_{category}.log" + with open(log_filename, "w", encoding='utf-8') as log_file: + log_file.write(f"Category: {category}\n") + log_file.write(f"Test 
cases: {totals['Test cases']}\n") + log_file.write(f"Passed: {totals['Passed']}\n") + log_file.write(f"Skipped: {totals['Skipped']}\n") + log_file.write(f"Failures: {totals['Failures']}\n") + log_file.write(f"Errors: {totals['Errors']}\n") + def print_summary(): print("### Results Summary") print_header = True @@ -276,6 +373,10 @@ def main(): with open("ut_failure_list.csv", "w") as failure_list: print_failures(failure_list=failure_list) + + generate_failures_log() + generate_passed_log() + generate_category_totals_log() print_summary() diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index cc3131ccef..0e6b95ec45 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -2,17 +2,13 @@ ut_suite="${1:-op_regression}" # op_regression / op_extended / op_ut / torch_xpu # usage -# compare_and_filter_logs [output.log] +# check_new_failed [output.log] +all_pass="" -compare_and_filter_logs() { +check_new_failed() { local file_UT="$1" local file_known_issue="$2" local output_file="${3:-${file_UT%.*}_filtered.log}" - local filtered_content="${file_UT%.*}_removed.log" - local temp_file="temp_parts.log" - local temp_file_labeled="temp_parts_labeled.log" - local temp_output="${3:-${file_UT%.*}_filtered_temp.log}" - local temp_final="${file_UT%.*}_final_temp.log" if [[ $# -lt 2 ]]; then echo "[ERROR] Need 2 files to compare" @@ -29,36 +25,12 @@ compare_and_filter_logs() { # Filter the same content from file_UT as file_known_issue echo "Filtering $file_known_issue for $file_UT" + if grep -q $'\r' "$file_UT"; then + echo "Detected log from windows" + sed -i 's/\r$//' "$file_UT" + fi grep -vFxf "$file_known_issue" "$file_UT" > "$output_file" - # Keep the filtered UT cases - grep -noFf "$file_known_issue" "$file_UT" > "$filtered_content" - echo "Filtered cases file: $filtered_content" - true > "$temp_file" - true > "$temp_file_labeled" - true > "$temp_output" - true > "$temp_final" - grep -E '\.py$|,' "$output_file" > "$temp_output" - while IFS= read -r line; do - IFS=',' read -ra parts <<< "$line" - for part in "${parts[@]}"; do - part_trimmed=$(echo "$part" | xargs) - if [[ -n "$part_trimmed" ]] && ! 
grep -qF "$part_trimmed" "$file_known_issue"; then - echo "$part_trimmed" >> "$temp_file" - echo -e "\n\033[1;33m[Check the failed cases in summary line]\033[0m" - echo -e "\033[1;33mCase not found in ${file_known_issue}: '${part_trimmed}' (from line: '${line}')\033[0m" - else - echo -e "\n\033[1;33m[Check the failed cases in summary line]\033[0m" - echo -e "\n\033[1;32m${part_trimmed} found in ${file_known_issue} (from line: '${line}')\033[0m" - fi - done - done < "$temp_output" - - awk '{print $0 " [in summary line]"}' "$temp_file" > "$temp_file_labeled" - grep -vE '\.py$|,' "$output_file" > "$temp_final" - cat "$temp_file_labeled" >> "$temp_final" - mv "$temp_final" "$output_file" - echo -e "\n\033[1;31m[New failed cases Summary]\033[0m" if [[ -z "$(tr -d ' \t\n\r\f' < "$output_file" 2>/dev/null)" ]]; then echo -e "\033[1;32mNo new failed cases found\033[0m" @@ -66,20 +38,15 @@ compare_and_filter_logs() { echo -e "\n\033[1;31mNew failed cases, not in known issues\033[0m" cat "$output_file" fi +} - if [[ -s "$filtered_content" ]]; then - echo -e "\n\033[1;31m[These failed cases are in skip list, will filter]\033[0m" - awk -F':' '{ - line_number = $1 - $1 = "" - gsub(/^ /, "", $0) - printf "\033[33m%3d\033[0m: %s\n", line_number, $0 - }' "$filtered_content" - else - echo -e "\n\033[1;32mNo Skipped Cases\033[0m" - fi - - rm -f ${temp_output} ${temp_file} ${temp_final} +check_filtered_logs() { + local file_UT="$1" + local file_known_issue="$2" + local filtered_content="${file_UT%.*}_removed.log" + # Keep the filtered UT cases + grep -noFf "$file_known_issue" "$file_UT" > "$filtered_content" + echo "Filtered cases file: $filtered_content" } check_passed_known_issues() { @@ -91,6 +58,10 @@ check_passed_known_issues() { return 1 fi echo "Checking for known issues that are now passing in $file_passed_UT" + if grep -q $'\r' "$file_passed_UT"; then + echo "Detected log from windows" + sed -i 's/\r$//' "$file_passed_UT" + fi grep -Fxf "$file_passed_UT" "$file_known_issue" > "$output_file" echo -e "\n\033[1;32m[New passed cases Summary]\033[0m" if [[ -s "$output_file" ]]; then @@ -99,131 +70,133 @@ check_passed_known_issues() { else echo -e "\033[1;33mNo known issues are now passing\033[0m" fi + + rm -f ${output_file} } -get_pass_fail_log() { - local p_row="$1" - local p_col="$2" - local ut_log="$3" - grep -E "${p_row}" "${ut_log}" | awk -v p="${p_col}" '{ - for (i=1;i<=NF;i++) { - if ($i ~ p) { - print $i; - } - } - }' +check_test_cases() { + local log_file="$1" + declare -A expected_cases=( + ["op_extended"]=5349 + ["op_regression"]=244 + ["op_regression_dev1"]=1 + ["op_transformers"]=237 + ["op_ut"]=120408 + ["test_xpu"]=69 + ) + + if [[ ! 
-f "$log_file" ]]; then + echo "False" + echo "[ERROR] Need test file $log_file" >&2 + return 1 + fi + + all_pass="true" + local current_category="" + + while IFS= read -r line; do + if [[ $line =~ ^Category:\ ([^[:space:]]+) ]]; then + current_category="${BASH_REMATCH[1]}" + elif [[ $line =~ Test\ cases:\ ([0-9]+) ]] && [[ -n "$current_category" ]]; then + actual_cases="${BASH_REMATCH[1]}" + expected_cases_value="${expected_cases[$current_category]}" + + if [[ -n "$expected_cases_value" ]]; then + threshold=$(echo "$expected_cases_value * 0.95" | bc -l | awk '{print int($1+0.5)}') + + echo "Category: $current_category" + echo "Expected number: $expected_cases_value" + echo "Current number: $actual_cases" + echo "Threshold(95%): $threshold" + + if [[ "$actual_cases" -lt "$threshold" ]]; then + echo " Status: ❌ Abnormal (reduction exceeds 5%)" + all_pass="false" + else + reduction=$(echo "scale=2; ($actual_cases/$expected_cases_value - 1) * 100" | bc -l) + echo " Status: ✅ Normal (reduction ${reduction}%)" + fi + echo "----------------------------------------" + fi + current_category="" + fi + done < "$log_file" } -if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' ]]; then - get_pass_fail_log ".FAILED" "::.*::" "${ut_suite}"_test.log > ./"${ut_suite}"_failed.log - grep -E "Timeout" "${ut_suite}"_test.log | grep "test" >> ./"${ut_suite}"_failed.log - get_pass_fail_log "PASSED" "::.*::" "${ut_suite}"_test.log > ./"${ut_suite}"_passed.log + +if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' || "${ut_suite}" == 'op_ut' || "${ut_suite}" == 'test_xpu' ]]; then echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" - cat "./${ut_suite}_failed.log" - echo -e "=========================================================================" - echo -e "Checking Failed cases in ${ut_suite}" - echo -e "=========================================================================" - compare_and_filter_logs "${ut_suite}"_failed.log Known_issue.log - echo -e "=========================================================================" - echo -e "Checking New passed cases in Known issue list for ${ut_suite}" - echo -e "=========================================================================" - check_passed_known_issues "${ut_suite}"_passed.log Known_issue.log - if [[ -f "${ut_suite}_failed_filtered.log" ]]; then - num_failed=$(wc -l < "./${ut_suite}_failed_filtered.log") - else - num_failed=$(wc -l < "./${ut_suite}_failed.log") - fi - num_passed=$(wc -l < "./${ut_suite}_passed.log") - if [[ $num_failed -gt 0 ]] || [[ $num_passed -le 0 ]]; then - echo -e "[ERROR] UT ${ut_suite} test Fail" - exit 1 + if [[ -f "failures_${ut_suite}.log" ]]; then + cat "./failures_${ut_suite}.log" else - echo -e "[PASS] UT ${ut_suite} test Pass" + echo -e "\033[1;32mNo failed cases\033[0m" fi -fi -if [[ "${ut_suite}" == 'op_ut' ]]; then - get_pass_fail_log ".FAILED" "::.*::" op_ut_with_skip_test.log > ./"${ut_suite}"_with_skip_test_failed.log - grep -E "Timeout" op_ut_with_skip_test.log | grep "test" >> ./"${ut_suite}"_with_skip_test_failed.log - get_pass_fail_log "PASSED" "::.*::" op_ut_with_skip_test.log > ./"${ut_suite}"_with_skip_test_passed.log - get_pass_fail_log ".FAILED" 
"::.*::" op_ut_with_only_test.log > ./"${ut_suite}"_with_only_test_failed.log - grep -E "Timeout" op_ut_with_only_test.log | grep "test" >> ./"${ut_suite}"_with_only_test_failed.log - get_pass_fail_log "PASSED" "::.*::" op_ut_with_only_test.log > ./"${ut_suite}"_with_only_test_passed.log - echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} with skip" - echo -e "=========================================================================" - cat "./${ut_suite}_with_skip_test_failed.log" echo -e "=========================================================================" - echo -e "Checking Failed cases in ${ut_suite} with skip" - echo -e "=========================================================================" - compare_and_filter_logs "${ut_suite}"_with_skip_test_failed.log Known_issue.log - echo -e "=========================================================================" - echo -e "Checking New passed cases in Known issue list for ${ut_suite} with skip" - echo -e "=========================================================================" - check_passed_known_issues "${ut_suite}"_with_skip_test_passed.log Known_issue.log - if [[ -f "${ut_suite}_with_skip_test_failed_filtered.log" ]]; then - num_failed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_failed_filtered.log") + echo -e "Checking Test case number for ${ut_suite}" + echo -e "=========================================================================" + check_test_cases category_${ut_suite}.log + echo -e "=========================================================================" + echo -e "Checking Filtered cases for ${ut_suite}" + echo -e "=========================================================================" + if [[ -f "failures_${ut_suite}.log" ]]; then + check_filtered_logs failures_${ut_suite}.log Known_issue.log + num_filtered=$(wc -l < "./failures_${ut_suite}_removed.log") + if [[ $num_filtered -gt 0 ]]; then + echo -e "\n\033[1;31m[These failed cases are in skip list, will filter]\033[0m" + awk -F':' '{ + line_number = $1 + $1 = "" + gsub(/^ /, "", $0) + printf "\033[33m%3d\033[0m: %s\n", line_number, $0 + }' "failures_${ut_suite}_removed.log" + else + echo -e "\n\033[1;32mNo Skipped Cases\033[0m" + fi else - num_failed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_failed.log") + echo -e "\033[1;32mNo need to check filtered cases\033[0m" fi echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} with only" - echo -e "=========================================================================" - cat "./${ut_suite}_with_only_test_failed.log" - echo -e "=========================================================================" - echo -e "Checking Failed cases in ${ut_suite} with only" + echo -e "Checking New passed cases in Known issue list for ${ut_suite}" echo -e "=========================================================================" - compare_and_filter_logs "${ut_suite}"_with_only_test_failed.log Known_issue.log + check_passed_known_issues passed_${ut_suite}.log Known_issue.log echo -e "=========================================================================" - echo -e "Checking New passed cases in Known issue list for ${ut_suite} with only" + echo -e "Checking New Failed cases in ${ut_suite}" echo -e "=========================================================================" - check_passed_known_issues "${ut_suite}"_with_only_test_passed.log Known_issue.log - if [[ 
-f "${ut_suite}_with_only_test_failed_filtered.log" ]]; then - num_failed_with_only=$(wc -l < "./${ut_suite}_with_only_test_failed_filtered.log") + if [[ -f "failures_${ut_suite}.log" ]]; then + check_new_failed failures_${ut_suite}.log Known_issue.log else - num_failed_with_only=$(wc -l < "./${ut_suite}_with_only_test_failed.log") + echo -e "\033[1;32mNo need to check failed cases\033[0m" fi - ((num_failed=num_failed_with_skip+num_failed_with_only)) - num_passed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_passed.log") - num_passed_with_only=$(wc -l < "./${ut_suite}_with_only_test_passed.log") - ((num_passed=num_passed_with_skip+num_passed_with_only)) - if [[ $num_failed -gt 0 ]] || [[ $num_passed -le 0 ]]; then - echo -e "[ERROR] UT ${ut_suite} test Fail" - exit 1 + + if [[ -f "failures_${ut_suite}_filtered.log" ]]; then + num_failed=$(wc -l < "./failures_${ut_suite}_filtered.log") + elif [[ -f "failures_${ut_suite}.log" ]]; then + num_failed=$(wc -l < "./failures_${ut_suite}.log") else - echo -e "[PASS] UT ${ut_suite} test Pass" + num_failed=0 fi -fi -if [[ "${ut_suite}" == 'torch_xpu' ]]; then - echo "Pytorch XPU binary UT checking" - cd ../../pytorch || exit - for xpu_case in build/bin/*{xpu,sycl}*; do - if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then - case_name=$(basename "$xpu_case") - cd ../ut_log/torch_xpu || exit - grep -E "FAILED" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_failed.log - wc -l < "./binary_ut_${ut_suite}_${case_name}_failed.log" | tee -a ./binary_ut_"${ut_suite}"_failed_summary.log - grep -E "PASSED|Pass" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_passed.log - wc -l < "./binary_ut_${ut_suite}_${case_name}_passed.log" | tee -a ./binary_ut_"${ut_suite}"_passed_summary.log - cd - || exit - fi - done + num_passed=$(wc -l < "./passed_${ut_suite}.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite}" + echo -e "Provide the reproduce command for ${ut_suite}" echo -e "=========================================================================" - cd ../ut_log/torch_xpu || exit - cat "./binary_ut_${ut_suite}_${case_name}_failed.log" - num_failed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_failed_summary.log) - num_passed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_passed_summary.log) - ((num_failed=num_failed_binary_ut)) - if [[ $num_failed -gt 0 ]] || [[ $num_passed_binary_ut -le 0 ]]; then + if [[ $num_failed -gt 0 ]]; then + echo -e "Need reproduce command" + if [[ -f "reproduce_${ut_suite}.log" ]]; then + cat "./reproduce_${ut_suite}.log" + fi + else + echo -e "Not need reproduce command" + fi + if [[ $num_failed -gt 0 ]] || [[ $num_passed -le 0 ]] || [[ "$all_pass" == 'false' ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 else echo -e "[PASS] UT ${ut_suite} test Pass" fi fi + if [[ "${ut_suite}" == 'xpu_distributed' ]]; then grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log @@ -232,13 +205,29 @@ if [[ "${ut_suite}" == 'xpu_distributed' ]]; then echo -e "=========================================================================" cat "./${ut_suite}_xpu_distributed_test_failed.log" echo -e 
"=========================================================================" - echo -e "Checking Failed cases in ${ut_suite} xpu distributed" + echo -e "Checking Filtered cases for ${ut_suite} xpu distributed" echo -e "=========================================================================" - compare_and_filter_logs "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log + check_filtered_logs "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log + num_filtered_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed_removed.log") + if [[ $num_filtered_xpu_distributed -gt 0 ]]; then + echo -e "\n\033[1;31m[These failed cases are in skip list, will filter]\033[0m" + awk -F':' '{ + line_number = $1 + $1 = "" + gsub(/^ /, "", $0) + printf "\033[33m%3d\033[0m: %s\n", line_number, $0 + }' "${ut_suite}_xpu_distributed_test_failed_removed.log" + else + echo -e "\n\033[1;32mNo Skipped Cases\033[0m" + fi echo -e "=========================================================================" echo -e "Checking New passed cases in Known issue list for ${ut_suite}" echo -e "=========================================================================" check_passed_known_issues "${ut_suite}"_xpu_distributed_test_passed.log Known_issue.log + echo -e "=========================================================================" + echo -e "Checking Failed cases in ${ut_suite} xpu distributed" + echo -e "=========================================================================" + check_new_failed "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log if [[ -f "${ut_suite}_xpu_distributed_test_failed_filtered.log" ]]; then num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed_filtered.log") else diff --git a/.github/workflows/_linux_accelerate.yml b/.github/workflows/_linux_accelerate.yml index a40601d625..decf4e612f 100644 --- a/.github/workflows/_linux_accelerate.yml +++ b/.github/workflows/_linux_accelerate.yml @@ -26,7 +26,7 @@ on: runner: required: true type: string - default: 'linux.idc.xpu' + default: 'pvc_rolling' description: Runner label accelerate: required: false @@ -45,11 +45,15 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash {0} + jobs: conditions-filter: name: conditions-filter if: ${{ github.event.pull_request.draft == false }} - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 timeout-minutes: 10 env: GH_TOKEN: ${{ github.token }} @@ -66,22 +70,46 @@ jobs: disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)" echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" - Torch-XPU-Accelerate-Tests: - runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + prepare: + runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }} needs: conditions-filter if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_accelerate')) }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + tests: + runs-on: ${{ 
needs.prepare.outputs.runner_id }} + needs: prepare + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }} + --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.prepare.outputs.user_id }} + -e ZE_AFFINITY_MASK + env: + WORK_DIR: 'accelerate' + PYTORCH_DEBUG_XPU_FALLBACK: 1 + HF_HUB_ETAG_TIMEOUT: 120 + HF_HUB_DOWNLOAD_TIMEOUT: 120 + PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }} env: - WORK_DIR: 'accelerate' - NEOReadDebugKeys: 0 - DisableScratchPages: 0 accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.6.0' }} transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }} python: ${{ inputs.python != '' && inputs.python || '3.10' }} - PYTORCH_DEBUG_XPU_FALLBACK: 1 - ZE_AFFINITY_MASK: 0 - PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py - HF_HUB_ETAG_TIMEOUT: 120 - HF_HUB_DOWNLOAD_TIMEOUT: 120 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -93,25 +121,22 @@ jobs: repository: huggingface/accelerate ref: ${{ env.accelerate }} path: accelerate - - name: Create unique Conda ENV name + - name: Setup python-${{ env.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python }} + - name: Check python run: | - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=hf_accelerate_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV - - name: Prepare Conda ENV + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools + - name: Install pytorch and deps run: | - echo "Using Conda ENV name: $CONDA_ENV_NAME" - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -y -n $CONDA_ENV_NAME python=${{ env.python }} - source activate $CONDA_ENV_NAME - pip install junitparser pytest-timeout + pip install junitparser pip install transformers==${{ env.transformers }} - - name: Prepare Stock XPU Pytorch - run: | - source activate $CONDA_ENV_NAME pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - name: Prepare Accelerate run: | - source activate $CONDA_ENV_NAME cd $WORK_DIR pip install -e . pip install -e ".[testing]" @@ -120,7 +145,6 @@ jobs: cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./ - name: Report installed versions run: | - source activate $CONDA_ENV_NAME echo "pip installed packages:" pip list | tee ${{ github.workspace }}/$WORK_DIR/tests_log/pip_list.txt echo "lspci gpu devices:" @@ -131,16 +155,17 @@ jobs: xpu-smi discovery -y --json --dump -1 - name: Sanity check installed packages run: | - source activate $CONDA_ENV_NAME + # Use latest pytest + pip install -U pytest pytest-timeout pytest-xdist # These checks are to exit earlier if for any reason torch # packages were reinstalled back to CUDA versions (not expected). 
pip show torch | grep Version | grep xpu pip show torchaudio | grep Version | grep xpu pip show torchvision | grep Version | grep xpu python -c 'import torch; exit(not torch.xpu.is_available())' - - name: Run tests + printenv + - name: Run tests on ${{ needs.prepare.outputs.hostname }} run: | - source activate $CONDA_ENV_NAME cd $WORK_DIR && rm -rf reports && mkdir -p reports # Excluding tests due to: # * tests/test_examples.py::FeatureExamplesTests::test_profiler fails on @@ -150,8 +175,7 @@ jobs: # * tests/test_big_modeling.py::test_dispatch_model_tied_weights_memory_with_nested_offload_cpu fails # with OOM. That's a new test added by https://github.com/huggingface/accelerate/pull/3445 pattern="not test_profiler and not test_gated and not test_dispatch_model_tied_weights_memory_with_nested_offload_cpu" - cmd=(python3 -m pytest --timeout 600 -rsf --junitxml=reports/accelerate.xml -k "$pattern" \ - tests/) + cmd=(python -m pytest --junitxml=reports/accelerate.xml -k "$pattern" tests/) { echo "### Running" echo "\`\`\`" @@ -162,28 +186,20 @@ jobs: - name: Print result tables if: ${{ ! cancelled() }} run: | - source activate $CONDA_ENV_NAME cd $WORK_DIR { echo "### Results" - python3 $PARSE_JUNIT reports/accelerate.xml --stats + python $PARSE_JUNIT reports/accelerate.xml --stats echo "### Failed" - python3 $PARSE_JUNIT reports/accelerate.xml --errors --failed + python $PARSE_JUNIT reports/accelerate.xml --errors --failed echo "### Skipped" - python3 $PARSE_JUNIT reports/accelerate.xml --skipped + python $PARSE_JUNIT reports/accelerate.xml --skipped } >> $GITHUB_STEP_SUMMARY - name: Print environment if: ${{ ! cancelled() }} uses: ./torch-xpu-ops/.github/actions/print-environment with: - conda: $CONDA_ENV_NAME pip_packages: 'accelerate transformers' - - name: Clean up - if: ${{ always() }} - run: | - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - fi - name: Upload Test log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index d7257a4e77..ab49d28490 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -46,6 +46,10 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} jobs: runner: @@ -79,14 +83,11 @@ jobs: options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: - AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - MODEL_ONLY_NAME: ${{ inputs.model }} xpu_num: ${{ needs.runner.outputs.xpu_num }} cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }} - env: - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + MODEL_ONLY_NAME: ${{ inputs.model }} + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + dataset_dir: ${{ runner.temp }}/../_datasets/imagenet steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -189,6 +190,7 @@ jobs: # On-demand launch - name: Get test data types + if: ${{ contains(inputs.test_type, 'ondemand') }} id: dtypes run: | e2e_dtypes="$(echo "${{ inputs.dt }}" |sed 's/,int8//;s/int8,//')" diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 5231a29419..b65d985008 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -22,6 +22,11 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + reference_issue: 1689 jobs: runner: @@ -56,19 +61,15 @@ jobs: -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /opt/xpu-tool - REFERENCE_ISSUE: 1689 - env: - GH_TOKEN: ${{ github.token }} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare test env on ${{ needs.runner.outputs.hostname }} + - name: Prepare test env uses: ./.github/actions/linux-testenv with: pytorch: ${{ inputs.pytorch }} python: ${{ inputs.python }} - - - name: Run Torch XPU Op Benchmark + - name: Run Torch XPU Op Benchmark on ${{ needs.runner.outputs.hostname }} run: | mkdir -p ${{ github.workspace }}/op_benchmark cd test/microbench @@ -92,9 +93,6 @@ jobs: op_benchmark_test_results_check: needs: op_benchmark runs-on: ubuntu-24.04 - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1689 steps: - name: Install gh-cli run: | @@ -155,6 +153,6 @@ jobs: path: ${{ github.workspace }}/op_benchmark - name: Upload Reference Run ID run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE} --json body -q .body | \ + gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ sed "s/Inductor-XPU-OP-Benchmark-Data:.*/Inductor-XPU-OP-Benchmark-Data: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE} --body-file new_body.txt + gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index 2d8bfd4af2..fc3deaf7a3 100644 --- 
a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -21,7 +21,7 @@ on: runner: required: true type: string - default: 'linux.idc.xpu' + default: 'pvc_rolling' description: Runner label driver: required: false @@ -58,8 +58,6 @@ env: HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HF_HUB_ETAG_TIMEOUT: 120 HF_HUB_DOWNLOAD_TIMEOUT: 120 - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} python: ${{ inputs.python != '' && inputs.python || '3.10' }} accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.7.0'}} datasets: ${{ inputs.datasets != '' && inputs.datasets || 'v3.6.0'}} @@ -76,8 +74,12 @@ env: libswresample-dev libswscale-dev pciutils - PYTEST_TIMEOUT: 600 TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu' + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + +defaults: + run: + shell: bash {0} jobs: conditions-filter: @@ -101,7 +103,7 @@ jobs: echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" prepare: - runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }} needs: conditions-filter if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_transformers')) }} outputs: @@ -109,6 +111,11 @@ jobs: torchvision: ${{ steps.getver.outputs.torchvision }} torchaudio: ${{ steps.getver.outputs.torchaudio }} triton: ${{ steps.getver.outputs.triton }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }} steps: - id: getver run: | @@ -127,10 +134,28 @@ jobs: echo "torchvision=$torchvision" | tee -a "$GITHUB_OUTPUT" echo "torchaudio=$torchaudio" | tee -a "$GITHUB_OUTPUT" echo "triton=$triton" | tee -a "$GITHUB_OUTPUT" + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner tests: needs: prepare - runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + runs-on: ${{ needs.prepare.outputs.runner_id }} + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }} + --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.prepare.outputs.user_id }} + -e ZE_AFFINITY_MASK + env: + PYTORCH_DEBUG_XPU_FALLBACK: '1' + TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' + # enable pytest parallel run, and continue others if meets crash case such as segmentation fault + PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }} strategy: fail-fast: false max-parallel: 1 @@ -152,52 +177,16 @@ jobs: # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals) # * https://github.com/huggingface/transformers/issues/36267 (marian tests) - test_case: 'tests_models_0' - cmd: 'tests/models --num-shards 16 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py' + cmd: 'tests/models --num-shards 4 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py' filter: 
'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - test_case: 'tests_models_1' - cmd: 'tests/models --num-shards 16 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py' + cmd: 'tests/models --num-shards 4 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py' filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - test_case: 'tests_models_2' - cmd: 'tests/models --num-shards 16 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py' + cmd: 'tests/models --num-shards 4 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py' filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - test_case: 'tests_models_3' - cmd: 'tests/models --num-shards 16 --shard-id 3 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_4' - cmd: 'tests/models --num-shards 16 --shard-id 4 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_5' - cmd: 'tests/models --num-shards 16 --shard-id 5 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_6' - cmd: 'tests/models --num-shards 16 --shard-id 6 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_7' - cmd: 'tests/models --num-shards 16 --shard-id 7 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_8' - cmd: 'tests/models --num-shards 16 --shard-id 8 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_9' - cmd: 'tests/models --num-shards 16 --shard-id 9 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_10' - cmd: 'tests/models --num-shards 16 --shard-id 10 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_11' - cmd: 'tests/models --num-shards 16 --shard-id 11 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_12' - cmd: 'tests/models --num-shards 16 --shard-id 12 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_13' - cmd: 'tests/models --num-shards 16 --shard-id 13 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_14' - cmd: 'tests/models --num-shards 16 --shard-id 14 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_15' - cmd: 'tests/models --num-shards 16 --shard-id 15 --ignore=tests/models/marian/test_modeling_marian.py' + cmd: 'tests/models --num-shards 4 --shard-id 3 
--ignore=tests/models/marian/test_modeling_marian.py' filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' # Excluding tests due to: # * Some ray tests hang, reason unknown @@ -212,9 +201,6 @@ jobs: - test_case: 'tests_utils' cmd: '--ignore=tests/utils/test_import_utils.py tests/utils' filter: 'not test_load_img_url_timeout' - env: - PYTORCH_DEBUG_XPU_FALLBACK: '1' - TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -253,20 +239,18 @@ jobs: sleep 1; if (( $SECONDS - start_time > 60 )); then false; fi done - - name: Create unique Conda ENV name - run: | - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=hf_transformers_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV - - name: Prepare Conda ENV + - name: Setup python-${{ env.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python }} + - name: Check python run: | - echo "Using Conda ENV name: $CONDA_ENV_NAME" - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -y -n $CONDA_ENV_NAME python=${{ env.python }} - source activate $CONDA_ENV_NAME - pip install junitparser pytest-shard pytest-timeout - - name: Prepare Stock XPU Pytorch + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools + - name: Prepare pytorch and deps run: | - source activate $CONDA_ENV_NAME + pip install junitparser pip install $TORCH_INDEX \ torch==${{ needs.prepare.outputs.torch }} \ torchvision==${{ needs.prepare.outputs.torchvision }} \ @@ -275,7 +259,6 @@ jobs: - name: Prepare Transformers run: | pwd - source activate $CONDA_ENV_NAME cd transformers pip install \ accelerate==${{ env.accelerate }} \ @@ -287,7 +270,6 @@ jobs: cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./ - name: Report installed versions run: | - source activate $CONDA_ENV_NAME LOGS_DIR="${{ github.workspace }}/transformers/logs" echo "pip installed packages:" pip list | tee "$LOGS_DIR/pip_list-$TEST_CASE.txt" @@ -299,36 +281,35 @@ jobs: xpu-smi discovery -y --json --dump -1 - name: Sanity check installed packages run: | - source activate $CONDA_ENV_NAME + # Use latest pytest + pip install -U pytest pytest-timeout pytest-xdist pytest-shard # These checks are to exit earlier if for any reason Transformers # reinstalled torch packages back to CUDA versions (not expected). pip show torch | grep Version | grep xpu pip show torchaudio | grep Version | grep xpu pip show torchvision | grep Version | grep xpu python -c 'import torch; exit(not torch.xpu.is_available())' - - name: Run tests + - name: Run tests on ${{ needs.prepare.outputs.hostname }} run: | - source activate $CONDA_ENV_NAME cd transformers - python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml \ - -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true + python -m pytest --make-reports=${TEST_CASE} --junit-xml=reports/${TEST_CASE}.xml \ + -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true - name: Check for errors in tests run: | - source activate $CONDA_ENV_NAME - python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml + python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml - name: Print environment if: ${{ ! 
cancelled() }} uses: ./torch-xpu-ops/.github/actions/print-environment with: - conda: $CONDA_ENV_NAME pip_packages: 'accelerate datasets transformers' to: 'transformers/logs/environment-$TEST_CASE.md' - name: Clean up if: ${{ always() }} run: | - du -sh ${{ env.HF_HOME }} || true - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME + if [ -d "$HF_HOME" ]; then + ls -al ${{ env.HF_HOME }} + du -sh ${{ env.HF_HOME }} + rm -rf ${{ env.HF_HOME }} fi - name: Upload reports if: ${{ ! cancelled() }} @@ -346,7 +327,7 @@ jobs: report: needs: tests if: ${{ success() || failure() }} - runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + runs-on: ubuntu-24.04 steps: - name: Download reports uses: actions/download-artifact@v4 @@ -366,16 +347,12 @@ jobs: uses: actions/checkout@v4 with: path: torch-xpu-ops - - name: Create unique Conda ENV name - run: | - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=hf_transformers_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV - - name: Prepare Conda ENV + - name: Setup python-${{ env.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python }} + - name: Install pip deps run: | - echo "Using Conda ENV name: $CONDA_ENV_NAME" - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -y -n $CONDA_ENV_NAME python=${{ env.python }} - source activate $CONDA_ENV_NAME pip install junitparser - name: Print results table if: ${{ ! cancelled() }} @@ -409,8 +386,7 @@ jobs: - name: Print baseline difference if: ${{ ! cancelled() }} run: | - source activate $CONDA_ENV_NAME - python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true + python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true - name: Print failure lines if: ${{ ! cancelled() }} run: | @@ -477,9 +453,3 @@ jobs: for f in $(find transformers/logs -name "environment-*.md"); do diff $f $first_md done - - name: Clean up - if: ${{ always() }} - run: | - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - fi diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 146db6c72e..1751d9d972 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -29,6 +29,11 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + UT_SKIP_ISSUE: 1624 jobs: runner: @@ -50,10 +55,10 @@ jobs: id: runner-info uses: ./.github/actions/get-runner - docker: + test-in-container: needs: runner - runs-on: ${{ needs.runner.outputs.runner_id }} if: ${{ ! 
contains(inputs.ut, 'distributed') }} + runs-on: ${{ needs.runner.outputs.runner_id }} container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: @@ -64,30 +69,19 @@ jobs: env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.runner.outputs.pytest_extra_args }} - env: - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare test env on ${{ needs.runner.outputs.hostname }} + - name: Prepare test env uses: ./.github/actions/linux-testenv with: pytorch: ${{ inputs.pytorch }} torch_xpu_ops: ${{ inputs.torch_xpu_ops }} python: ${{ inputs.python }} - - name: Run XPU UT Test + - name: Run XPU UT Test on ${{ needs.runner.outputs.hostname }} uses: ./.github/actions/linux-uttest with: ut_name: ${{ inputs.ut }} - - name: UT Test Results Summary - run: | - pip install junitparser - python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true - if [ -e ut_failure_list.csv ];then - cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true - fi - name: Upload Inductor XPU UT Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 @@ -102,47 +96,27 @@ jobs: name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ inputs.ut }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv if-no-files-found: ignore - - name: Reset Ptrace_scope - if: ${{ always() }} - run: | - if [ -f "${{ github.workspace }}/ptrace_scope.bk" ]; then - sudo cp ${{ github.workspace }}/ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - fi - host: + test-in-baremetal: needs: runner - runs-on: ${{ needs.runner.outputs.runner_id }} if: ${{ contains(inputs.ut, 'distributed') }} + runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1 steps: - - name: Init test - run: | - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf ~/.triton /tmp/*inductor* /tmp/tmp* - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare test env on ${{ needs.runner.outputs.runner_id }} + - name: Prepare test env uses: ./.github/actions/linux-testenv with: pytorch: ${{ inputs.pytorch }} torch_xpu_ops: ${{ inputs.torch_xpu_ops }} python: ${{ inputs.python }} - - name: Run XPU UT Test + - name: Run XPU UT Test on ${{ needs.runner.outputs.hostname }} uses: ./.github/actions/linux-uttest with: ut_name: ${{ inputs.ut }} - - name: UT Test Results Summary - run: | - pip install junitparser - python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true - if [ -e ut_failure_list.csv ];then - cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true - fi - name: Upload Inductor XPU UT Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 @@ -157,21 +131,12 @@ jobs: name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ inputs.ut }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv if-no-files-found: ignore - - name: Reset Ptrace_scope - if: ${{ always() }} - run: | - if [ -f "${{ github.workspace }}/ptrace_scope.bk" ]; then - sudo cp ${{ github.workspace }}/ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - fi summary: - needs: [docker, host] - if: ${{ ! cancelled() && ! (endsWith(needs.docker.result, 'ed') && endsWith(needs.host.result, 'ed')) }} + needs: [test-in-container, test-in-baremetal] + if: ${{ ! cancelled() && ! (endsWith(needs.test-in-container.result, 'ed') && endsWith(needs.test-in-baremetal.result, 'ed')) }} runs-on: ubuntu-24.04 timeout-minutes: 30 - env: - GH_TOKEN: ${{ github.token }} - UT_SKIP_ISSUE: 1624 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -181,15 +146,22 @@ jobs: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.ut }} path: ${{ github.workspace }}/ut_log - name: Check UT Results - shell: bash run: | ls -al ${{ github.workspace }}/ut_log cd ${{ github.workspace }}/ut_log/${{ inputs.ut }} + + for log_file in "${{ github.workspace }}/ut_log"/{failures,passed,category,reproduce}_*.log; do + [[ -f "$log_file" ]] && cp "$log_file" ./ + done # get distributed known issues gh --repo intel/torch-xpu-ops issue view $UT_SKIP_ISSUE --json body -q .body |sed -E '/^(#|$)/d' > Known_issue.log.tmp # get skipped known issues - gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + count=$(gh api "repos/${{ github.repository }}/issues?labels=skipped" --jq 'length') + if [ "$count" -gt 0 ]; then + echo -e "$count issues with skipped label found" + gh api "repos/${{ github.repository }}/issues?labels=skipped" \ + --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + fi if [ "${{ inputs.ut }}" == "basic" ];then ut_list="op_regression op_transformers op_extended op_regression_dev1" else @@ -201,7 +173,7 @@ jobs: cp Known_issue.log.tmp Known_issue.log awk -v r="${ut_name}" 'BEGIN{ print_row = 0 }{ if ( ! 
( $0 ~ /[a-zA-Z0-9]/ ) ) { print_row = 0 }; - if ( print_row == 1 && $NF ~ r ) { print $1 }; + if ( print_row == 1 && $1 ~ r ) { print $0 }; if ( $0 ~ /Cases:/ ) { print_row = 1 }; }' issues.log >> Known_issue.log bash ut_result_check.sh ${ut_name} diff --git a/.github/workflows/_windows_ut.yml b/.github/workflows/_windows_ut.yml index 14129446e2..e650e32d14 100644 --- a/.github/workflows/_windows_ut.yml +++ b/.github/workflows/_windows_ut.yml @@ -112,7 +112,7 @@ jobs: call conda activate windows_ci cd ../pytorch pip install -r requirements.txt - pip install cmake setuptools==72.1.0 clang-format + pip install cmake setuptools clang-format pip install mkl-static mkl-include set USE_STATIC_MKL=1 copy "%CONDA_PREFIX%\Library\bin\libiomp*5md.dll" .\torch\lib @@ -123,7 +123,8 @@ jobs: set CMAKE_PREFIX_PATH="%CONDA_PREFIX%\Library" ) python setup.py clean - set MAX_JOBS=4 + set MAX_JOBS=32 + set TORCH_XPU_ARCH_LIST=mtl-h,bmg,lnl-m python setup.py bdist_wheel > build_torch_wheel_log.log echo "[INFO] begin to install torch whls" for /r C:\actions-runner\_work\torch-xpu-ops\pytorch\dist %%i in (torch*.whl) do ( @@ -141,24 +142,22 @@ jobs: python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - - name: Upload Windows build log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Windows-Log-${{ github.event.pull_request.number || github.sha }} path: 'C:\actions-runner\_work\torch-xpu-ops\pytorch\build_torch_wheel_log.log' - - name: Upload Windows binary if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Windows-Binary-${{ github.event.pull_request.number || github.sha }} path: 'C:\actions-runner\_work\torch-xpu-ops\pytorch\dist' - - name: Run XPU OP Extended UT if: contains(inputs.ut, 'op_extended') || github.event_name == 'schedule' shell: cmd + continue-on-error: true run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat" @@ -168,9 +167,12 @@ jobs: cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ python run_test_with_skip_mtl.py + if not exist "%GITHUB_WORKSPACE%\ut_log" mkdir "%GITHUB_WORKSPACE%\ut_log" + copy op_extended.xml %GITHUB_WORKSPACE%\ut_log /Y - name: Run Test XPU UT - if: contains(inputs.ut, 'torch_xpu') || github.event_name == 'schedule' + if: contains(inputs.ut, 'test_xpu') || github.event_name == 'schedule' shell: cmd + continue-on-error: true run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat" @@ -178,3 +180,103 @@ jobs: call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" cd ../pytorch/third_party/torch-xpu-ops/test/xpu/ python run_test_win_with_skip_mtl.py + + if not exist "%GITHUB_WORKSPACE%\ut_log" mkdir "%GITHUB_WORKSPACE%\ut_log" + copy test_xpu.xml %GITHUB_WORKSPACE%\ut_log /Y + - name: UT Test Results Summary + shell: cmd + run: | + call conda activate windows_ci + pip install junitparser + echo "GITHUB_WORKSPACE: %GITHUB_WORKSPACE%" + for %%i in ("%GITHUB_WORKSPACE%\ut_log\*.xml") do ( + python .\.github\scripts\check-ut.py "%%i" >> "%GITHUB_STEP_SUMMARY%" + ) + @echo off + + REM Check the failure logs + if exist "%GITHUB_WORKSPACE%\failures*.log" ( + echo Exist Failure logs + echo Found Failure logs as below: + for %%f in 
("%GITHUB_WORKSPACE%\failures*.log") do ( + echo - %%f + copy "%%f" "%GITHUB_WORKSPACE%\ut_log\" + ) + echo Failure logs Copied + ) else ( + echo No Failure logs + ) + + REM Copied the passed logs + if exist "passed*.log" ( + copy "passed*.log" "%GITHUB_WORKSPACE%\ut_log\" + echo Passed logs Copied + ) else ( + echo No Passed logs + ) + + REM Copied the Summary logs + if exist "category*.log" ( + copy "category*.log" "%GITHUB_WORKSPACE%\ut_log\" + echo Category logs Copied + ) else ( + echo No Category logs + ) + - name: Upload Inductor XPU UT Log + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-Windows + path: "${{ github.workspace }}/ut_log" + if-no-files-found: ignore + + summary: + needs: [ut_test] + runs-on: ubuntu-24.04 + timeout-minutes: 30 + env: + GH_TOKEN: ${{ github.token }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Download XPU UT Logs + uses: actions/download-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-Windows + path: ${{ github.workspace }}/ut_log + - name: Check UT Results + shell: bash + run: | + ls -al ${{ github.workspace }}/ut_log + cd ${{ github.workspace }}/ut_log + + # get skipped known issues + count=$(gh api "repos/${{ github.repository }}/issues?labels=skipped_windows" --jq 'length') + if [ "$count" -gt 0 ]; then + echo -e "$count issues with skipped label found" + gh api "repos/${{ github.repository }}/issues?labels=skipped_windows" \ + --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + fi + + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + for ut_name in $(echo ${{ inputs.ut }} |sed 's/,/ /g') + do + touch Known_issue.log + if [ -f "issues.log" ]; then + awk -v r="${ut_name}" 'BEGIN{ print_row = 0 }{ + if ( ! ( $0 ~ /[a-zA-Z0-9]/ ) ) { print_row = 0 }; + if ( print_row == 1 && $1 ~ r ) { print $0 }; + if ( $0 ~ /Cases:/ ) { print_row = 1 }; + }' issues.log > Known_issue.log + else + echo "Info: issues.log not found or empty, using empty Known_issue.log" + fi + bash ut_result_check.sh ${ut_name} + done + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-Windows + path: ${{ github.workspace }}/ut_log + overwrite: true diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml index b4e657b847..4adba0df6c 100644 --- a/.github/workflows/bisect_search.yml +++ b/.github/workflows/bisect_search.yml @@ -37,6 +37,10 @@ on: permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: get_runner: runs-on: ${{ inputs.runner }} @@ -87,12 +91,9 @@ jobs: USE_XCCL: 0 USE_KINETO: 0 env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - GH_TOKEN: ${{ github.token }} - defaults: - run: - shell: bash -xe {0} + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Check runner run: | diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 51ef597765..d408622efe 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -95,7 +95,7 @@ jobs: echo "No such scheduler: ${{ github.event.schedule }}" exit 1 fi - ut='["basic","op_ut","xpu_profiling","torch_xpu","xpu_distributed"]' + ut='["basic","op_ut","xpu_profiling","xpu_distributed"]' suite='["huggingface","timm_models","torchbench","pt2e"]' triton='' python='3.10' @@ -144,6 +144,7 @@ jobs: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'p') }} name: linux-ut needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + secrets: inherit strategy: fail-fast: false matrix: @@ -160,6 +161,7 @@ jobs: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} name: linux-e2e needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + secrets: inherit strategy: fail-fast: false matrix: @@ -189,6 +191,7 @@ jobs: name: linux-microbench permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + secrets: inherit strategy: fail-fast: false matrix: @@ -203,9 +206,10 @@ jobs: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'windows') }} name: windows needs: [Conditions-Filter] + secrets: inherit uses: ./.github/workflows/_windows_ut.yml with: - ut: 'op_extended,torch_xpu' + ut: 'op_extended,test_xpu' python: ${{ needs.Conditions-Filter.outputs.python }} src_changed: false has_label: true diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bfb2d913a8..be6b260734 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -60,6 +60,7 @@ jobs: src_changed: ${{ steps.check-files.outputs.src_changed }} has_label: ${{ steps.check-label.outputs.has_label }} disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }} + pytorch: ${{ contains(steps.check-pr-desc.outputs.disabled_tests, 'disable_build') && 'nightly_wheel' || github.base_ref }} steps: - uses: dorny/paths-filter@v2 id: check-files @@ -90,7 +91,7 @@ jobs: echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" linux-build: - if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') }} needs: [conditions-filter, preci-lint-check] secrets: inherit strategy: @@ -100,11 +101,12 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: runner: pvc_rolling - pytorch: main + pytorch: ${{ needs.conditions-filter.outputs.pytorch }} 
linux-ut: needs: [conditions-filter, linux-build] - if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_ut')}} + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_ut') }} + secrets: inherit strategy: fail-fast: false matrix: @@ -112,12 +114,13 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: pvc_rolling - pytorch: main + pytorch: ${{ needs.conditions-filter.outputs.pytorch }} ut: ${{ matrix.ut_name }} linux-distributed: needs: [conditions-filter, linux-build] - if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_distribute')}} + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_distribute') }} + secrets: inherit strategy: fail-fast: false matrix: @@ -125,12 +128,13 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: pvc_rolling - pytorch: main + pytorch: ${{ needs.conditions-filter.outputs.pytorch }} ut: ${{ matrix.ut_name }} linux-e2e: name: linux-e2e if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} + secrets: inherit needs: [conditions-filter, linux-build] strategy: fail-fast: false @@ -139,7 +143,7 @@ jobs: uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling - pytorch: main + pytorch: ${{ needs.conditions-filter.outputs.pytorch }} suite: ${{ matrix.suite }} linux-e2e-summary: name: linux-e2e-summary @@ -152,9 +156,10 @@ jobs: name: windows if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} needs: [conditions-filter, preci-lint-check] + secrets: inherit uses: ./.github/workflows/_windows_ut.yml with: - ut: op_extended,torch_xpu + ut: op_extended,test_xpu runner: Windows_CI src_changed: ${{ needs.conditions-filter.outputs.src_changed }} has_label: ${{ needs.conditions-filter.outputs.has_label }} diff --git a/src/ATen/native/xpu/Copy.cpp b/src/ATen/native/xpu/Copy.cpp index b011baa80b..0fe0ba2636 100644 --- a/src/ATen/native/xpu/Copy.cpp +++ b/src/ATen/native/xpu/Copy.cpp @@ -71,7 +71,12 @@ void memcpyAsync( Device dst_device = iter.device(0); Device src_device = iter.device(1); if (dst_device == src_device) { - copy_kernel(iter); + // copy_kernel(iter); + auto dst = (char*)iter.data_ptr(0); + auto src = (char*)iter.data_ptr(1); + size_t size = iter.numel() * iter.element_size(0); + auto q = copy_stream.queue(); + q.copy(src, dst, size); } else { TORCH_INTERNAL_ASSERT(p2p_enabled == true); auto dst = (char*)iter.data_ptr(0); diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index 011d1ab451..01016494cf 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -627,6 +628,220 @@ struct AdaptiveAvgPool2dKernelFunctor { PackedTensorAccessor64 output_; }; +template +struct AdaptiveAvgPool2dKernelFunctor_cl { + void operator()(sycl::nd_item<1> item) const { + int64_t index = item.get_global_linear_id(); + if (index < numel_) { + int _ow, _oh, _oc, _ob; + int oc_vec_ = oc_ / vec_size; + + _oc = index % oc_vec_; + _ow = index / oc_vec_ % ow_; + _oh = index / oc_vec_ / ow_ % oh_; + _ob = index / oc_vec_ / ow_ / oh_; + + int64_t _ih0 = native::start_index(_oh, oh_, ih_); + int64_t _ih1 = native::end_index(_oh, oh_, ih_); + int64_t _iw0 = native::start_index(_ow, ow_, iw_); + 
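      // Assuming the usual at::native adaptive-pooling helpers:
      //   start_index(o, osize, isize) == floor(o * isize / osize)
      //   end_index(o, osize, isize)   == ceil((o + 1) * isize / osize)
      // so each output element (_oh, _ow) averages a kh x kw input window, and
      // neighbouring windows can differ in size by one when isize is not a
      // multiple of osize.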
int64_t _iw1 = native::end_index(_ow, ow_, iw_); + int64_t kh = _ih1 - _ih0; + int64_t kw = _iw1 - _iw0; + int64_t _ib = _ob; + int64_t _ic = _oc; + + opmath_t sum[vec_size] = {static_cast(0)}; + for (int _ih = _ih0; _ih < _ih1; _ih++) { + for (int _iw = _iw0; _iw < _iw1; _iw++) { + auto read = input_ + [_ic + _iw * oc_vec_ + _ih * oc_vec_ * iw_ + + _ib * ih_ * iw_ * oc_vec_]; +#pragma unroll + for (int v = 0; v < vec_size; v++) { + sum[v] += opmath_t(read[v]); + } + } + } +#pragma unroll + for (int v = 0; v < vec_size; v++) { + sum[v] /= kh * kw; + } + vec_t output_value; +#pragma unroll + for (int v = 0; v < vec_size; v++) { + output_value[v] = static_cast(sum[v]); + } + output_[index] = output_value; + } + } + AdaptiveAvgPool2dKernelFunctor_cl( + vec_t* output, + const vec_t* input, + int ih, + int iw, + int ob, + int oc, + int oh, + int ow, + int64_t numel) + : output_(output), + input_(input), + ih_(ih), + iw_(iw), + ob_(ob), + oc_(oc), + oh_(oh), + ow_(ow), + numel_(numel) {} + + private: + int ih_; + int iw_; + int ob_; + int oc_; + int oh_; + int ow_; + int64_t numel_; + const vec_t* input_; + vec_t* output_; +}; + +#define LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( \ + scalar_t, \ + opmath_t, \ + vec_size, \ + num_wg, \ + wg_size, \ + queue, \ + output, \ + input, \ + ih, \ + iw, \ + ob, \ + oc, \ + oh, \ + ow, \ + numel) \ + { \ + using vec_t = memory::aligned_vector; \ + vec_t* output_vec = \ + reinterpret_cast(output.mutable_data_ptr()); \ + const vec_t* input_vec = \ + reinterpret_cast(input.const_data_ptr()); \ + auto kfn = AdaptiveAvgPool2dKernelFunctor_cl< \ + scalar_t, \ + opmath_t, \ + vec_t, \ + vec_size>(output_vec, input_vec, ih, iw, ob, oc, oh, ow, numel); \ + sycl_kernel_submit(num_wg* wg_size, wg_size, queue, kfn); \ + } + +template +void launch_adaptive_avg_pool2d_kernel_cl(const Tensor& input, Tensor& output) { + int ih = input.size(2); + int iw = input.size(3); + int ob = output.size(0); + int oc = output.size(1); + int oh = output.size(2); + int ow = output.size(3); + + int64_t numel = ob * oc * oh * ow; + int vec_size = 1; + for (vec_size = std::min( + 8, + memory::can_vectorize_up_to( + (char*)output.mutable_data_ptr())); + vec_size > 1; + vec_size /= 2) { + if (oc % vec_size != 0) + continue; + if (2 * numel / vec_size > syclMaxWorkItemsPerTile()) { + numel /= vec_size; + break; + } + } + + auto wg_size = syclDeviceMaxWorkGroupSize(); + int64_t num_wg = (numel + wg_size - 1) / wg_size; + switch (vec_size) { + case 8: + LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( + scalar_t, + opmath_t, + 8, + num_wg, + wg_size, + at::xpu::getCurrentSYCLQueue(), + output, + input, + ih, + iw, + ob, + oc, + oh, + ow, + numel); + return; + case 4: + LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( + scalar_t, + opmath_t, + 4, + num_wg, + wg_size, + at::xpu::getCurrentSYCLQueue(), + output, + input, + ih, + iw, + ob, + oc, + oh, + ow, + numel); + return; + case 2: + LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( + scalar_t, + opmath_t, + 2, + num_wg, + wg_size, + at::xpu::getCurrentSYCLQueue(), + output, + input, + ih, + iw, + ob, + oc, + oh, + ow, + numel); + return; + case 1: + LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( + scalar_t, + opmath_t, + 1, + num_wg, + wg_size, + at::xpu::getCurrentSYCLQueue(), + output, + input, + ih, + iw, + ob, + oc, + oh, + ow, + numel); + return; + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size"); + } +} +#undef LAUNCH_AVGPOOL_CHANNEL_LAST_VEC + template void launch_adaptive_avg_pool2d_kernel( PackedTensorAccessor64 input, @@ -724,8 +939,13 @@ void adaptive_avg_pool2d_kernel( auto 
iacc = input_.packed_accessor64(); auto oacc = output.packed_accessor64(); if (is_smf_channels_last(output)) { - launch_adaptive_avg_pool2d_kernel( - iacc, oacc); + if (input_.is_contiguous(at::MemoryFormat::ChannelsLast)) { + launch_adaptive_avg_pool2d_kernel_cl( + input_, output); + } else { + launch_adaptive_avg_pool2d_kernel( + iacc, oacc); + } } else { launch_adaptive_avg_pool2d_kernel( iacc, oacc); diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 2cccaba92b..f8e1b6906e 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -423,7 +423,7 @@ template < typename index_t> struct BatchNormCollectStatisticsKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { int plane = item.get_group(1); int tid = item.get_local_linear_id(); @@ -1874,7 +1874,7 @@ template < typename index_t> struct BatchNormBackwardReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); @@ -4162,7 +4162,7 @@ template < typename stat_accscalar_t, typename index_t> struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); index_t N = grad_output_.size(0) * grad_output_.size(2); @@ -4370,7 +4370,7 @@ template < typename index_t> struct BatchNormBackwardVectorizedKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); index_t N = grad_output_.size(0) * grad_output_.size(2); diff --git a/src/ATen/native/xpu/sycl/Dequant_int4.cpp b/src/ATen/native/xpu/sycl/Dequant_int4.cpp index b217e18ad1..8a52dbfb32 100644 --- a/src/ATen/native/xpu/sycl/Dequant_int4.cpp +++ b/src/ATen/native/xpu/sycl/Dequant_int4.cpp @@ -22,7 +22,7 @@ struct DequantInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { weight_dequant(weight_dequant) {} void sycl_ker_config_convention(sycl::handler& cgh) {} - [[intel::reqd_sub_group_size(SgSize)]] void operator()( + [[sycl::reqd_sub_group_size(SgSize)]] void operator()( sycl::nd_item<1> it) const { int constexpr GroupN = TileN; int constexpr GroupK = SgSize * TileK; diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 454da74825..f4b3424a54 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -545,7 +545,7 @@ void launch_max_pool2d_kernel( if constexpr (is_channels_last) { for (vec_size = std::min(8, memory::can_vectorize_up_to((char*)input)); - vec_size >= 1; + vec_size > 1; vec_size /= 2) { if (numPlane % vec_size != 0) { continue; diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 899ffab56c..0a2b5cdbc7 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -25,8 +25,7 @@ template < int r_args_depth = 1, int res_arg_index = 0> struct 
LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - template - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + template void operator()( const int64_t chunk_size, TLA tlAddress, TLW tlWGMeta, @@ -117,7 +116,7 @@ struct LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { template struct lpnormChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { auto lid = item_id.get_local_linear_id(); auto group_id = item_id.get_group(0); @@ -481,8 +480,7 @@ std::vector foreach_norm_kernel( template struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - template - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + template void operator()( int64_t chunk_size, TLA tlAddressMeta, TLW tlWGMeta, @@ -555,7 +553,7 @@ struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { template struct LpmaxChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { auto local_range = item_id.get_local_range(0); auto lid = item_id.get_local_linear_id(); diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 935ab99f74..261c9a0627 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -66,7 +66,7 @@ struct GNRowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using WelfordOp = WelfordOpsXPU>; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { const int64_t i = item.get_group(0); WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item}; @@ -114,7 +114,7 @@ struct GNRowwiseMomentsVectorizedFunctor WelfordOpsXPU>; using vec_t = memory::aligned_vector; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { WelfordType val[VEC_SIZE]; WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item}; @@ -476,7 +476,7 @@ void group_norm_kernel( template struct Compute1dBackwardFusedParamsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t G = group_; const int64_t D = C_ / G; @@ -630,7 +630,7 @@ template struct GammaBeta1dBackwardLargeKernel : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t c = item.get_group(1) * item.get_local_range(1) + item.get_local_id(1); @@ -890,7 +890,7 @@ template struct ComputeInternalGradientsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { const int64_t nc = item.get_group(0); T_ACC sum1 = 0; @@ -941,7 +941,7 @@ struct ComputeInternalGradientsVectorizedFunctor using vec_t = memory::aligned_vector; using acc_vec_t = memory::aligned_vector; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( 
sycl::nd_item<1> item) const { acc_vec_t sum1_vec; acc_vec_t sum2_vec; @@ -1038,7 +1038,7 @@ struct ComputeBackwardFusedParamsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t G = group_; const int64_t D = C_ / G; @@ -1176,7 +1176,7 @@ template struct GammaBetaBackwardFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { auto group_x = item.get_group(1); auto group_size_x = item.get_local_range(1); diff --git a/src/ATen/native/xpu/sycl/IndexKernelUtils.h b/src/ATen/native/xpu/sycl/IndexKernelUtils.h index 1b1cdc1fdb..4be8b335be 100644 --- a/src/ATen/native/xpu/sycl/IndexKernelUtils.h +++ b/src/ATen/native/xpu/sycl/IndexKernelUtils.h @@ -42,7 +42,7 @@ inline bool fast_gather_kernel_eligible( template struct VectorizedGatherKernel { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { int64_t ind = idx_[item.get_group(1)]; if (allow_neg_indices_) { diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index cedda6a6b7..92a4195d01 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -186,7 +186,7 @@ struct RowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using WelfordType = WelfordData; using WelfordOp = WelfordOps>; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { const int64_t i = item_id.get_group(0); WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false}; @@ -435,7 +435,7 @@ WelfordDataLN compute_stats( template struct VectorizedLayerNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item_id) const { auto i1 = item_id.get_group(1); const T* block_row = X_ + i1 * N_; diff --git a/src/ATen/native/xpu/sycl/LinearInt4.cpp b/src/ATen/native/xpu/sycl/LinearInt4.cpp index 25665b639b..31ed632baf 100644 --- a/src/ATen/native/xpu/sycl/LinearInt4.cpp +++ b/src/ATen/native/xpu/sycl/LinearInt4.cpp @@ -41,7 +41,7 @@ struct LinearInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { ldc(ldc) {} void sycl_ker_config_convention(sycl::handler& cgh) {} - [[intel::reqd_sub_group_size(16)]] void operator()( + [[sycl::reqd_sub_group_size(16)]] void operator()( sycl::nd_item<1> it) const { int constexpr Unroll = 2; int constexpr SgSize = 16; diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index 8b018de6b8..6b67e081d1 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -79,7 +79,7 @@ struct NllLoss2dForwardNoReduceKernelFunctor { template struct NllLoss2dForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { scalar_t cur_weight; accscalar_t input_sum = 0; diff --git a/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp 
b/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp index 28047972fd..3c5d3271bc 100644 --- a/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp +++ b/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp @@ -51,7 +51,7 @@ void multilabel_margin_loss_shape_check( template struct MultilabelMarginLossForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void + [[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void operator()(sycl::nd_item<1> item) const { int k = item.get_group(0); const scalar_t* input_k = input_ + k * dim_; @@ -148,7 +148,7 @@ struct MultilabelMarginLossForwardKernelFunctor template struct MultilabelMarginLossBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void + [[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void operator()(sycl::nd_item<1> item) const { int k = item.get_group(0); const scalar_t* input_k = input_ + k * dim_; diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 6dd893100e..6117b6d261 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -600,7 +600,7 @@ template < class Norm, bool one_moment = false> struct FusedNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<3> item_id) const { accscalar_t sum1 = 0; accscalar_t sum2 = 0; @@ -747,7 +747,7 @@ template < class Norm, bool one_moment = false> struct RowwiseMomentsKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<3> item_id) const { index_t local_id = item_id.get_local_id(2); diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 2a4f749e15..fdedf5fb09 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -216,7 +216,7 @@ template < bool is_same_dtype> struct DispatchSoftmaxForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { if (local_size_ == 1 && item.get_global_id(0) >= outer_size_) return; @@ -933,7 +933,7 @@ template < bool is_same_dtype = false> struct DispatchSoftmaxBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { if (local_size_ == 1 && item.get_global_id(0) >= outer_size_) return; diff --git a/src/ATen/native/xpu/sycl/SortingKernels.h b/src/ATen/native/xpu/sycl/SortingKernels.h index cce01ba4b5..aad93d9eb6 100644 --- a/src/ATen/native/xpu/sycl/SortingKernels.h +++ b/src/ATen/native/xpu/sycl/SortingKernels.h @@ -15,7 +15,7 @@ namespace xpu { template struct SegmentedGroupRadixSortPairsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int seg_idx = item.get_group(0); int seg_offset = seg_idx * num_elements_; @@ -96,7 +96,7 @@ void segmented_group_radix_sort_pairs_kernel( template struct 
SegmentedRadixSortPairsUpsweepFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) / method_t::PROCESSING_LENGTH; @@ -179,7 +179,7 @@ void segmented_radix_sort_pairs_upsweep_kernel( template struct SegmentedRadixSortPairsScanFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { constexpr int RADIX_BUCKETS = 16; int seg_idx = item.get_group(0); @@ -218,7 +218,7 @@ void segmented_radix_sort_pairs_scan_kernel( template struct SegmentedRadixSortPairsDownsweepFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) / method_t::PROCESSING_LENGTH; @@ -448,7 +448,7 @@ struct SegmentedGroupRadixSelectPairsFunctor MAX_KV_BYTES = std::max(sizeof(key_t), sizeof(value_t)), }; - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int seg_idx = item.get_group(0); int seg_offset = seg_idx * nelements_; diff --git a/src/ATen/native/xpu/sycl/TensorModeKernel.cpp b/src/ATen/native/xpu/sycl/TensorModeKernel.cpp index 7ae95e36b8..dcecf27ab0 100644 --- a/src/ATen/native/xpu/sycl/TensorModeKernel.cpp +++ b/src/ATen/native/xpu/sycl/TensorModeKernel.cpp @@ -231,7 +231,7 @@ inline T reduceGroupWithNThreadLocalReductions( template struct ComputeModeKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(32)]] void operator()( + [[sycl::reqd_sub_group_size(32)]] void operator()( sycl::nd_item<3> item) const { int tidx = item.get_local_id(2); int stidx = item.get_local_range(2) + diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index af353985ae..ee0d285eaf 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -16,8 +16,13 @@ static int64_t syclMaxWorkGroupSize( auto dev = q.get_device(); auto kid = ::sycl::get_kernel_id(); - auto kbundle = - ::sycl::get_kernel_bundle<::sycl::bundle_state::executable>(ctx, {kid}); + // The kernel won't be built for devices except for the first device. + // Launching kernel on devices except for the first device will raise + // runtime error. Here is an alternative as a temporary solution to + // provide an extra hint to SYCL runtime. 
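  // The three-argument overload get_kernel_bundle(ctx, {dev}, {kid}) builds the
  // executable bundle only for the given device, so the work-group-size query
  // below targets the device the kernel will actually launch on. The
  // two-argument form is meant to cover every device in the context but, per
  // the issue linked below, currently only gets built for the first one.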
+ // https://github.com/intel/llvm/issues/15127 + auto kbundle = ::sycl::get_kernel_bundle<::sycl::bundle_state::executable>( + ctx, {dev}, {kid}); ::sycl::kernel k = kbundle.get_kernel(kid); return k.get_info<::sycl::info::kernel_device_specific::work_group_size>(dev); diff --git a/src/xccl/IpcExchange.hpp b/src/xccl/IpcExchange.hpp new file mode 100644 index 0000000000..e515cd6ce0 --- /dev/null +++ b/src/xccl/IpcExchange.hpp @@ -0,0 +1,400 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xccl/ze_symbol.hpp" + +#include + +#include +#include +#include +#include +#include + +struct exchange_contents { + // first 4-byte is file descriptor for drmbuf or gem object + union { + ze_ipc_mem_handle_t ipc_handle; + int fd = -1; + }; + size_t offset = 0; + int pid = -1; +}; + +#define sysCheck(x) \ + if (x == -1) { \ + throw std::system_error(std::make_error_code(std::errc(errno))); \ + } + +// We can't inherit it from cmsghdr because flexible array member +struct exchange_fd { + char obscure[CMSG_LEN(sizeof(int)) - sizeof(int)]; + int fd; + + exchange_fd(int cmsg_level, int cmsg_type, int fd) : fd(fd) { + auto* cmsg = reinterpret_cast(obscure); + cmsg->cmsg_len = sizeof(exchange_fd); + cmsg->cmsg_level = cmsg_level; + cmsg->cmsg_type = cmsg_type; + } + + exchange_fd() : fd(-1) { + memset(obscure, 0, sizeof(obscure)); + }; +}; + +void un_send_fd(int sock, int fd, int rank, size_t offset) { + iovec iov[1]; + msghdr msg; + auto rank_offset = std::make_pair(rank, offset); + + iov[0].iov_base = &rank_offset; + iov[0].iov_len = sizeof(rank_offset); + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_name = nullptr; + msg.msg_namelen = 0; + + exchange_fd cmsg(SOL_SOCKET, SCM_RIGHTS, fd); + + msg.msg_control = &cmsg; + msg.msg_controllen = sizeof(exchange_fd); + sysCheck(sendmsg(sock, &msg, 0)); +} + +std::tuple un_recv_fd(int sock) { + iovec iov[1]; + msghdr msg; + std::pair rank_offset; + + iov[0].iov_base = &rank_offset; + iov[0].iov_len = sizeof(rank_offset); + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_name = nullptr; + msg.msg_namelen = 0; + + exchange_fd cmsg; + msg.msg_control = &cmsg; + msg.msg_controllen = sizeof(exchange_fd); + int n_recv = recvmsg(sock, &msg, 0); + sysCheck(n_recv); + // assert(n_recv == sizeof(int)); + + return std::make_tuple(cmsg.fd, rank_offset.first, rank_offset.second); +} + +int prepare_socket(const char* sockname) { + sockaddr_un un; + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + strcpy(un.sun_path, sockname); + + auto sock = socket(AF_UNIX, SOCK_STREAM, 0); + sysCheck(sock); + + int on = 1; + sysCheck(ioctl(sock, FIONBIO, &on)); + + auto size = offsetof(sockaddr_un, sun_path) + strlen(un.sun_path); + sysCheck(bind(sock, (sockaddr*)&un, size)); + + return sock; +} + +int server_listen(const char* sockname) { + unlink(sockname); + auto sock = prepare_socket(sockname); + sysCheck(listen(sock, 10)); + + return sock; +} + +int serv_accept(int listen_sock) { + sockaddr_un un; + + socklen_t len = sizeof(un); + auto accept_sock = accept(listen_sock, (sockaddr*)&un, &len); + sysCheck(accept_sock); + + return accept_sock; +} + +bool wait_for_socket_file(const char* path, int max_seconds = 10) { + struct stat buffer; + for (int i = 0; i < max_seconds * 10; ++i) { + if (stat(path, &buffer) == 0) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + return false; +} + +int client_connect(const char* server, const char* client) { + if 
(!wait_for_socket_file(server, 10)) { + std::cerr << "Error: timeout waiting for server socket file: " << server + << std::endl; + exit(EXIT_FAILURE); + } + auto sock = prepare_socket(client); + sockaddr_un sun; + memset(&sun, 0, sizeof(sun)); + sun.sun_family = AF_UNIX; + strcpy(sun.sun_path, server); + auto len = offsetof(sockaddr_un, sun_path) + strlen(server); + const int max_retries = 50; + int retry = 0; + int ret = -1; + while (retry < max_retries) { + ret = connect(sock, (sockaddr*)&sun, len); + if (ret == 0) + break; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + retry++; + } + if (ret != 0) { + perror("connect failed"); + exit(EXIT_FAILURE); + } + + // sysCheck(connect(sock, (sockaddr*)&sun, len)); + return sock; +} + +void un_allgather( + exchange_contents* send_buf, + exchange_contents recv_buf[], + int rank, + int world) { + const char* servername_prefix = "/tmp/open-peer-ipc-mem-server-rank_"; + const char* clientname_prefix = "/tmp/open-peer-ipc-mem-client-rank_"; + char server_name[64]; + /* get username to make server_name unique */ + auto uid = getuid(); + auto pwd = getpwuid(uid); + snprintf( + server_name, + sizeof(server_name), + "%s%d_%s", + servername_prefix, + rank, + pwd->pw_name); + unlink(server_name); + auto s_listen = server_listen(server_name); + + pollfd fdarray[world]; + int recv_socks[world - 1]; + + for (auto& pollfd : fdarray) + pollfd.fd = -1; + std::fill(recv_socks, recv_socks + world - 1, -1); + + auto fd_guard = [&]() { + for (int i = 0, j = 0; i < world; ++i) { + if (i != rank && recv_socks[j] != -1) + sysCheck(close(recv_socks[j++])); + if (fdarray[i].fd != -1) + sysCheck(close(fdarray[i].fd)); + } + }; + + struct guard__ { + using F = decltype(fd_guard); + F f; + guard__(const F& f) : f(f) {} + ~guard__() { + f(); + } + } free_fd(fd_guard); + + // connect to all ranks + for (int i = 0; i < world; ++i) { + if (rank == i) { + fdarray[i].fd = s_listen; + fdarray[i].events = POLLIN; + fdarray[i].revents = 0; + } else { + char peer_name[64]; + char client_name[64]; + + snprintf( + client_name, + sizeof(client_name), + "%s%d-%d_%s", + clientname_prefix, + rank, + i, + pwd->pw_name); + unlink(client_name); + + snprintf( + peer_name, + sizeof(peer_name), + "%s%d_%s", + servername_prefix, + i, + pwd->pw_name); + fdarray[i].fd = client_connect(peer_name, client_name); + fdarray[i].events = POLLOUT; + fdarray[i].revents = 0; + } + } + + // std::future> future_fds[world -1]; + int slot = 0; + uint32_t send_progress = 1 << rank; + + while (slot < world - 1 || send_progress != (1 << world) - 1) { + sysCheck(ppoll(fdarray, world, nullptr, nullptr)); + + for (int i = 0; i < world; ++i) { + if (i == rank && (fdarray[i].revents & POLLIN)) { + // auto accept_sock = serv_accept(fdarray[i].fd); + // future_fds[slot ++] = std::async( + // std::launch::async, [=]() { + // struct sock_guard{ + // int sock; + // sock_guard(int sock) : sock(sock) {} + // ~guard_sock() {sysCheck(close(sock));} + // } release(accept_sock); + // auto ret = un_recv_fd(accept_sock); + // return ret;}); + recv_socks[slot++] = serv_accept(fdarray[i].fd); + } else if ( + (send_progress & (1 << i)) == 0 && fdarray[i].revents & POLLOUT) { + un_send_fd(fdarray[i].fd, send_buf->fd, rank, send_buf->offset); + send_progress |= 1 << i; + } + } + } + + for (int i = 0; i < world - 1; ++i) { + // future_fds[i].wait(); + // auto [fd, peer, offset] = future_fds[i].get(); + auto [fd, peer, offset] = un_recv_fd(recv_socks[i]); + recv_buf[peer].fd = fd; + recv_buf[peer].offset = offset; + } + + 
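  // At this point the poll loop above has both sent this rank's {fd, offset}
  // pair to every peer and accepted every peer's connection, and un_recv_fd()
  // has pulled each peer's descriptor out of the SCM_RIGHTS ancillary data, so
  // the kernel has already installed a local duplicate of every remote handle.
  // The final step fills in our own slot so recv_buf[] is a complete allgather
  // result.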
recv_buf[rank] = *send_buf; +} + +class IpcChannel { + public: + IpcChannel() { + initialized = false; + } + void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) { + if (initialized) + return; + + if (!load_level_zero_library()) { + throw std::runtime_error("Failed to initialize Level Zero"); + } + + zeCheck_dynamic(zeInit_dynamic(0)); + int tmp_rank, tmp_world; + + tmp_world = world_in; + tmp_rank = rank_in; + + rank = tmp_rank; + world = tmp_world; + initialized = true; + } + void release(sycl::queue& queue) { + if (!initialized) + return; + try { + auto l0_ctx = sycl::get_native( + queue.get_context()); + for (int i = 0; i < world; i++) { + if (i != rank) { + zeCheck_dynamic(zeMemCloseIpcHandle_dynamic( + l0_ctx, (char*)buffers[i] - offsets[i])); + } + } + } catch (const std::exception& e) { + std::cerr << "Warning: Level Zero cleanup failed: " << e.what() + << std::endl; + } + sycl::free(buffers[rank], queue); + initialized = false; + } + + // buffer_size as element size + void exchange_peer_ipc_mem( + sycl::queue& queue, + void* ptr, + uint32_t rank_in, + uint32_t world_in) { + if (!initialized) + init(queue, rank_in, world_in); + if (!load_level_zero_library()) { + throw std::runtime_error("Level Zero not available"); + } + + // Step 1: Get base address of the pointer + sycl::context ctx = queue.get_context(); + auto l0_ctx = sycl::get_native(ctx); + + void* base_addr; + size_t base_size; + zeCheck_dynamic( + zeMemGetAddressRange_dynamic(l0_ctx, ptr, &base_addr, &base_size)); + + // Step 2: Get IPC mem handle from base address + alignas(64) exchange_contents send_buf; + alignas(64) exchange_contents recv_buf[world]; + + // fill in the exchange info + zeCheck_dynamic( + zeMemGetIpcHandle_dynamic(l0_ctx, base_addr, &send_buf.ipc_handle)); + send_buf.offset = (char*)ptr - (char*)base_addr; + + send_buf.pid = getpid(); + + // Step 3: Exchange the handles and offsets + memset(recv_buf, 0, sizeof(recv_buf)); + // Overkill if we don't really needs all peer's handles + un_allgather(&send_buf, recv_buf, rank, world); + for (uint32_t i = 0; i < world; i++) { + // Step 4: Prepare pid file descriptor of next process + auto* peer = recv_buf + i; + // Step 6: Open IPC handle of remote peer + auto l0_device = sycl::get_native( + queue.get_device()); + void* peer_base; + + zeCheck_dynamic(zeMemOpenIpcHandle_dynamic( + l0_ctx, + l0_device, + peer->ipc_handle, + ZE_IPC_MEMORY_FLAG_BIAS_CACHED, + &peer_base)); + + buffers[i] = (char*)peer_base + peer->offset; + offsets[i] = peer->offset; + ipc_handle[i] = send_buf.ipc_handle; + } + } + + bool initialized; + static constexpr uint32_t max_rank = 16; + void* buffers[max_rank]; + void* sync_buffer[max_rank]; + size_t offsets[max_rank]; + ze_ipc_mem_handle_t ipc_handle[max_rank]; + int rank, world; + int size_per_buffer; + int data_size_per_buffer; + int buffer_index; +}; diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 9f72cd1bd3..8a8be26659 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -283,6 +283,11 @@ bool ProcessGroupXCCL::WorkXCCL::isCompleted() { void ProcessGroupXCCL::WorkXCCL::synchronize() { synchronizeStream(); + if (c10d::allow_inflight_collective_as_graph_input()) { + c10d::unregister_work( + c10::intrusive_ptr< + ProcessGroupXCCL::WorkXCCL>::unsafe_reclaim_from_nonowning(this)); + } } void ProcessGroupXCCL::WorkXCCL::synchronizeStream() { @@ -427,17 +432,6 @@ void ProcessGroupXCCL::setEnqueuedPgStatus( pgStatus_->lastEnqueuedNumelOut = work->numelOut_; } -void 
ProcessGroupXCCL::setCompletedPgStatus( - c10::intrusive_ptr work) { - pgStatus_->lastCompletedSeq = static_cast(work->getSequencenumber()); - pgStatus_->lastCompletedWorkName = opTypeToString(work->opType_); - pgStatus_->lastCompletedNumelIn = work->numelIn_; - pgStatus_->lastCompletedNumelOut = work->numelOut_; - // To avoid complexity, we're not computing duration. - FlightRecorderXCCL::get()->retire_id( - work->trace_id_, /*compute_duration*/ false); -} - void ProcessGroupXCCL::setSequenceNumberForGroup() {} uint64_t ProcessGroupXCCL::getSequenceNumberForGroup() { @@ -767,8 +761,12 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + auto id = work->trace_id_; work->future_->addCallback( - [this, work](at::ivalue::Future&) { this->setCompletedPgStatus(work); }); + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->blockingWait_ = blockingWait_; work->numelIn_ = 0; @@ -879,9 +877,12 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); - work->future_->addCallback([this, work](at::ivalue::Future&) { - this->setCompletedPgStatus(work); - }); + auto id = work->trace_id_; + work->future_->addCallback( + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->numelIn_ = work->numelOut_ = tensor.numel(); setEnqueuedPgStatus(work); diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index e7aa39c82d..1d3241e4d4 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -415,8 +415,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::vector& groupRanks() const; void setEnqueuedPgStatus(c10::intrusive_ptr work); - void setCompletedPgStatus( - c10::intrusive_ptr work); bool dumpDebuggingInfo(bool includeStackTrace = true); protected: diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp new file mode 100644 index 0000000000..d49d126122 --- /dev/null +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -0,0 +1,460 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace c10d { +namespace symmetric_memory { + +static StoreExchange storeExchange = StoreExchange("XPUSymmetricMemory"); + +AllocationRef::AllocationRef( + void* ptr, + HandleType handle, + size_t block_size, + int device_idx, + bool local_allocation) + : ptr(ptr), + handle(handle), + block_size(block_size), + device_idx(device_idx), + local_allocation(local_allocation){} + +AllocationRef::~AllocationRef() { + if (is_finalizing()) { + return; + } + // Currently, we cannot free virtual memory exchanged from other device. 
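  // Buffers imported from peers are IPC views opened with zeMemOpenIpcHandle
  // (see IpcChannel::exchange_peer_ipc_mem); they are released by closing the
  // IPC handle, as IpcChannel::release does, not by sycl::free. Only memory
  // this rank allocated itself is freed below.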
+ if (!local_allocation) { + return; + } + c10::Device local_device(c10::DeviceType::XPU, device_idx); + c10::DeviceGuard guard(local_device); + c10::xpu::syncStreamsOnDevice(); + auto stream = at::xpu::getCurrentXPUStream(); + sycl::free(ptr, stream); +} + +XPUSymmetricMemory::XPUSymmetricMemory( + std::vector> alloc_refs, + std::vector buffers, + std::vector signal_pads, + HandleType mc_handle, + void* mc_addr, + size_t buffer_size, + int local_device_idx, + int rank, + int world_size) + : alloc_refs_(std::move(alloc_refs)), + buffers_(std::move(buffers)), + signal_pads_(std::move(signal_pads)), + mc_handle_(mc_handle), + mc_addr_(mc_addr), + buffer_size_(buffer_size), + local_device_idx_(local_device_idx), + rank_(rank), + world_size_(world_size) { + const size_t arr_size = sizeof(void*) * world_size_; + buffers_dev_ = reinterpret_cast( + c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); + signal_pads_dev_ = reinterpret_cast( + c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); + + c10::Device local_device(c10::DeviceType::XPU, local_device_idx); + c10::DeviceGuard guard(local_device); + + at::xpu::getCurrentXPUStream().queue().memcpy( + buffers_dev_, buffers_.data(), arr_size); + at::xpu::getCurrentXPUStream().queue().memcpy( + signal_pads_dev_, signal_pads_.data(), arr_size); +} + +std::vector XPUSymmetricMemory::get_buffer_ptrs() { + return buffers_; +} + +std::vector XPUSymmetricMemory::get_signal_pad_ptrs() { + return signal_pads_; +} + +void** XPUSymmetricMemory::get_buffer_ptrs_dev() { + return buffers_dev_; +} + +void** XPUSymmetricMemory::get_signal_pad_ptrs_dev() { + return signal_pads_dev_; +} + +size_t XPUSymmetricMemory::get_buffer_size() { + return buffer_size_; +} + +size_t XPUSymmetricMemory::get_signal_pad_size() { + return signal_pad_size; +} + +bool XPUSymmetricMemory::has_multicast_support() { + return false; +} + +void* XPUSymmetricMemory::get_multicast_ptr() { + return nullptr; +} + +at::Tensor XPUSymmetricMemory::get_buffer( + int rank, + c10::IntArrayRef sizes, + c10::ScalarType dtype, + int64_t storage_offset) { + const size_t numel = std::accumulate( + sizes.begin(), + sizes.end(), + static_cast(1), + std::multiplies()); + const auto element_size = c10::elementSize(dtype); + const auto req_size = (numel + storage_offset) * element_size; + TORCH_CHECK( + req_size <= buffer_size_, + "XPUSymmetricMemory::get_buffer: the requested size (", + req_size, + " bytes) exceeds the allocated size (", + buffer_size_, + " bytes)"); + auto data_ptr = reinterpret_cast(buffers_[rank]) + + storage_offset * element_size; + // check the device of this device buffer + auto ptr_to_device_id = c10::xpu::get_device_idx_from_pointer(data_ptr); + auto device = c10::Device(c10::DeviceType::XPU, ptr_to_device_id); + auto options = at::TensorOptions().dtype(dtype).device(device); + + return at::for_blob(data_ptr, sizes) + .options(options) + .target_device(device) + .make_tensor(); +} + +void check_channel(int channel, int world_size) { + TORCH_CHECK( + channel >= 0, + "channel for barrier(), put_signal() and wait_signal() ", + "must be greater than 0 (got ", + channel, + ")"); + const size_t num_channels = signal_pad_size / sizeof(uint32_t) * world_size; + TORCH_CHECK( + static_cast(channel) < num_channels, + "The maximum supported channel for barrier(), put_signal() and wait_signal() is ", + num_channels - 1, + " (got ", + channel, + ")"); +} + +void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { + check_channel(channel, world_size_); + + // Currently, we leverage 
oneCCL for barrier. Later, we may move to SYCL + // implementation. + auto group = c10d::resolve_process_group(group_name_); + if (group == nullptr) { + TORCH_WARN( + "Process group '", + group_name_, + "' not found, please init process group first before calling SymmetricMemory"); + throw std::runtime_error("Process group not found"); + } + auto* xcclPg = dynamic_cast( + group->getBackend(c10::DeviceType::XPU).get()); + + c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); + c10::DeviceGuard guard(local_device); + + static thread_local at::Tensor barrier_tensor; + if (!barrier_tensor.defined() || barrier_tensor.device() != local_device) { + barrier_tensor = at::zeros( + {1}, at::TensorOptions().device(local_device).dtype(at::kFloat)); + } else { + barrier_tensor.zero_(); + } + + c10d::AllreduceOptions arOpts; + arOpts.asyncOp = false; + auto work = + xcclPg->allreduce_impl(barrier_tensor, "xccl:symm_mem_barrier", arOpts); + + if (work) { + bool success = work->wait(std::chrono::milliseconds(timeout_ms)); + TORCH_CHECK( + success, + "Barrier timeout after ", + timeout_ms, + " ms for group '", + group_name_, + "'"); + } +} + +void XPUSymmetricMemory::put_signal( + int dst_rank, + int channel, + size_t timeout_ms) { + LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; +} + +void XPUSymmetricMemory::wait_signal( + int src_rank, + int channel, + size_t timeout_ms) { + LOG(ERROR) << "XPUSymmetricMemory::wait_signal not supported"; +} + +int XPUSymmetricMemory::get_rank() { + return rank_; +} + +int XPUSymmetricMemory::get_world_size() { + return world_size_; +} + +c10::Device XPUSymmetricMemory::get_device() { + return c10::Device(c10::DeviceType::XPU, local_device_idx_); +} + +Block::Block( + c10::intrusive_ptr alloc_ref, + int device_idx, + size_t block_size, + size_t buffer_size, + size_t signal_pad_offset, + const std::optional& group_name) + : alloc_ref(std::move(alloc_ref)), + device_idx(device_idx), + block_size(block_size), + buffer_size(buffer_size), + signal_pad_offset(signal_pad_offset), + default_group_name(std::move(group_name)) {} + +void* XPUSymmetricMemoryAllocator::alloc( + size_t size, + int device_idx, + const std::optional& group_name) { + size_t signal_pad_offset = at::round_up(size, 16UL); + size_t block_size = signal_pad_offset + signal_pad_size; + + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + void* ptr = sycl::malloc_device(block_size, current_queue); + current_queue.memset(ptr, 0, block_size); + auto alloc_ref = + c10::make_intrusive(ptr, ptr, block_size, device_idx, true); + auto block = c10::make_intrusive( + std::move(alloc_ref), + device_idx, + block_size, + size, + signal_pad_offset, + group_name); + + { + std::unique_lock lock(mutex_); + ptr_to_block_.emplace(ptr, std::move(block)); + } + return ptr; +} + +void XPUSymmetricMemoryAllocator::free(void* ptr) { + std::unique_lock lock(mutex_); + ptr_to_block_.erase(ptr); +} + +size_t XPUSymmetricMemoryAllocator::get_alloc_size(void* ptr) { + auto block = find_block(ptr); + TORCH_CHECK( + block != nullptr, + "XPUSymmetricMemoryAllocator::get_alloc_size: input must be allocated ", + "via XPUSymmetricMemoryAllocator::alloc"); + return block->buffer_size; +} + +struct RendezvousRequest { + int device_idx; + int pid; + size_t block_size; + size_t buffer_size; + size_t signal_pad_offset; + bool has_multicast_support; +}; + +void validate_rendezvous_requests( + const std::vector& reqs, + int world_size) { + TORCH_CHECK(reqs.size() == (size_t)world_size); + + 
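  // All ranks must describe the same symmetric block layout: block size, usable
  // buffer size and signal-pad offset are compared against rank 0 below. The
  // device-index set that follows is gathered but not otherwise checked here.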
std::unordered_set device_indices; + device_indices.reserve(world_size); + for (auto req : reqs) { + device_indices.insert(req.device_idx); + } + + for (int r = 1; r < world_size; ++r) { + TORCH_CHECK(reqs[r].block_size == reqs[0].block_size); + TORCH_CHECK(reqs[r].buffer_size == reqs[0].buffer_size); + TORCH_CHECK(reqs[r].signal_pad_offset == reqs[0].signal_pad_offset); + } +} + +c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( + void* ptr, + const std::optional& group_name) { + auto block = find_block(ptr); + if (block == nullptr) { + return nullptr; + } + + // The group_name passed to rendezvous() takes precedence over + // the default group_name specified during allocation. + std::string group_name_; + // Treat empty string and std::nullopt the same as empty string seems to be + // implicitly used that way + if (group_name.has_value() && group_name != "") { + group_name_ = *group_name; + } else { + if (!block->default_group_name.has_value()) { + TORCH_CHECK( + false, + "XPUSymmetricMemory::rendezvous: `group_name` is neither " + "specified during allocation nor passed to rendezvous()."); + } + group_name_ = *block->default_group_name; + } + + auto it = block->symm_mems.find(group_name_); + if (it != block->symm_mems.end()) { + return it->second; + } + + c10::Device local_device(c10::DeviceType::XPU, block->device_idx); + c10::DeviceGuard guard(local_device); + + // IpcChannel is used to do inter-process communication + IpcChannel ipc_channel; + auto group_info = get_group_info(group_name_); + auto store = group_info.store; + int rank = group_info.rank; + int world_size = group_info.world_size; + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + + auto local_req = RendezvousRequest{ + .device_idx = block->device_idx, + .pid = getpid(), + .block_size = block->block_size, + .buffer_size = block->buffer_size, + .signal_pad_offset = block->signal_pad_offset, + .has_multicast_support = false}; + auto reqs = storeExchange.all_gather(store, rank, world_size, local_req); + validate_rendezvous_requests(reqs, world_size); + + std::vector pids(world_size); + for (int r = 0; r < world_size; ++r) { + pids[r] = reqs[r].pid; + } + + // do IPC exchange for all peer ranks + ipc_channel.exchange_peer_ipc_mem(current_queue, ptr, rank, world_size); + + // no physical memory handle, so handles and buffers are both for virtual + // address + std::vector handles(world_size); + std::vector buffers(world_size, nullptr); + std::vector signal_pads(world_size, nullptr); + + for (int r = 0; r < world_size; ++r) { + if (r == rank) { + handles[r] = block->alloc_ref->handle; + buffers[r] = ptr; + signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); + continue; + } else { + buffers[r] = ipc_channel.buffers[r]; + handles[r] = ipc_channel.buffers[r]; + signal_pads[r] = + (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); + } + } + storeExchange.barrier(store, rank, world_size); + + HandleType mc_handle{}; + void* mc_addr = nullptr; + + std::vector> alloc_refs; + for (int r = 0; r < world_size; ++r) { + if (r == rank) { + alloc_refs.emplace_back(block->alloc_ref); + continue; + } + alloc_refs.push_back(c10::make_intrusive( + buffers[r], handles[r], block->block_size, block->device_idx, false)); + } + + auto symm_mem = c10::make_intrusive( + std::move(alloc_refs), + std::move(buffers), + std::move(signal_pads), + mc_handle, + mc_addr, + block->buffer_size, + block->device_idx, + group_info.rank, + group_info.world_size); + symm_mem->set_group_name(group_name_); + 
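  // Rough usage sketch (hypothetical caller; the get_allocator() lookup is an
  // assumption, everything else is defined in this file):
  //
  //   auto allocator = get_allocator(c10::DeviceType::XPU);  // assumed lookup helper
  //   void* ptr = allocator->alloc(nbytes, device_idx, group_name);
  //   auto symm = allocator->rendezvous(ptr, group_name);
  //   at::Tensor peer = symm->get_buffer(peer_rank, sizes, dtype, /*storage_offset=*/0);
  //   symm->barrier(/*channel=*/0, /*timeout_ms=*/10000);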
+  block->symm_mems[group_name_] = symm_mem;
+  return symm_mem;
+}
+
+bool XPUSymmetricMemoryAllocator::has_multicast_support(int device_idx) {
+  return false;
+}
+
+c10::DeviceType XPUSymmetricMemoryAllocator::supported_device_type() {
+  return c10::DeviceType::XPU;
+}
+
+std::string XPUSymmetricMemoryAllocator::name() {
+  return "XPU";
+}
+
+c10::intrusive_ptr<Block> XPUSymmetricMemoryAllocator::find_block(void* ptr) {
+  std::shared_lock lock(mutex_);
+  auto it = ptr_to_block_.find(ptr);
+  if (it == ptr_to_block_.end()) {
+    return nullptr;
+  }
+  return it->second;
+}
+
+struct RegisterXPUSymmetricMemoryAllocator {
+  RegisterXPUSymmetricMemoryAllocator() {
+    auto allocator = c10::make_intrusive<XPUSymmetricMemoryAllocator>();
+    // Query backend used for XPU
+    if (getSymmMemBackendXPU() == "XPU") {
+      // Direct set (static registration)
+      register_allocator(c10::DeviceType::XPU, allocator);
+    } else {
+      // Register availability in case `set_backend` is called dynamically
+      register_availability("XPU", allocator);
+    }
+  }
+};
+static RegisterXPUSymmetricMemoryAllocator register_allocator_;
+
+} // namespace symmetric_memory
+} // namespace c10d
diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp
new file mode 100644
index 0000000000..2daac1114a
--- /dev/null
+++ b/src/xccl/XPUSymmetricMemory.hpp
@@ -0,0 +1,130 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace c10d::symmetric_memory {
+
+// Resource wrapper that owns a (vaddr, allocation handle) pair. Upon
+// destruction, it unmaps the vaddr and releases the allocation handle.
+struct AllocationRef : public c10::intrusive_ptr_target {
+  void* ptr;
+  HandleType handle;
+  size_t block_size;
+  int device_idx;
+  bool local_allocation;
+
+  AllocationRef(
+      void* ptr,
+      HandleType handle,
+      size_t block_size,
+      int device_idx,
+      bool local_allocation);
+
+  ~AllocationRef();
+};
+
+class XPUSymmetricMemory : public SymmetricMemory {
+ public:
+  XPUSymmetricMemory(
+      std::vector<c10::intrusive_ptr<AllocationRef>> alloc_refs,
+      std::vector<void*> buffers,
+      std::vector<void*> signal_pads,
+      HandleType mc_handle,
+      void* mc_addr,
+      size_t buffer_size,
+      int local_device_idx,
+      int rank,
+      int world_size);
+
+  ~XPUSymmetricMemory() override {}
+
+  std::vector<void*> get_buffer_ptrs() override;
+  std::vector<void*> get_signal_pad_ptrs() override;
+  void** get_buffer_ptrs_dev() override;
+  void** get_signal_pad_ptrs_dev() override;
+  size_t get_buffer_size() override;
+  size_t get_signal_pad_size() override;
+
+  bool has_multicast_support() override;
+  void* get_multicast_ptr() override;
+
+  at::Tensor get_buffer(
+      int rank,
+      c10::IntArrayRef sizes,
+      c10::ScalarType dtype,
+      int64_t storage_offset);
+
+  void barrier(int channel, size_t timeout_ms) override;
+  void put_signal(int dst_rank, int channel, size_t timeout_ms) override;
+  void wait_signal(int src_rank, int channel, size_t timeout_ms) override;
+
+  int get_rank() override;
+  int get_world_size() override;
+  c10::Device get_device() override;
+
+  void set_group_name(const std::string& group_name) {
+    group_name_ = group_name;
+  }
+
+ private:
+  std::vector<c10::intrusive_ptr<AllocationRef>> alloc_refs_;
+  std::vector<void*> buffers_;
+  std::vector<void*> signal_pads_;
+  HandleType mc_handle_;
+  void* mc_addr_;
+  size_t buffer_size_;
+  int local_device_idx_;
+  int rank_;
+  int world_size_;
+  void** buffers_dev_;
+  void** signal_pads_dev_;
+  std::string group_name_;
+};
+
+struct Block : public c10::intrusive_ptr_target {
+  c10::intrusive_ptr<AllocationRef> alloc_ref;
+  int device_idx;
+  size_t block_size;
+  size_t buffer_size;
+  size_t signal_pad_offset;
+  std::optional<std::string> default_group_name;
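+  // One rendezvoused SymmetricMemory instance per group name for this block.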
+  std::map<std::string, c10::intrusive_ptr<SymmetricMemory>> symm_mems;
+
+  Block(
+      c10::intrusive_ptr<AllocationRef> alloc_ref,
+      int device_idx,
+      size_t block_size,
+      size_t buffer_size,
+      size_t signal_pad_offset,
+      const std::optional<std::string>& group_name);
+};
+
+class XPUSymmetricMemoryAllocator : public SymmetricMemoryAllocator {
+ public:
+  void* alloc(
+      size_t size,
+      int device_idx,
+      const std::optional<std::string>& group_name) override;
+
+  void free(void* ptr) override;
+  size_t get_alloc_size(void* ptr) override;
+  c10::intrusive_ptr<SymmetricMemory> rendezvous(
+      void* ptr,
+      const std::optional<std::string>& group_name) override;
+  bool has_multicast_support(int device_idx) override;
+  // void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr);
+  c10::DeviceType supported_device_type() override;
+  std::string name() override;
+
+ private:
+  c10::intrusive_ptr<Block> find_block(void* ptr);
+
+  std::shared_mutex mutex_;
+  std::unordered_map<void*, c10::intrusive_ptr<Block>> ptr_to_block_;
+};
+
+} // namespace c10d::symmetric_memory
diff --git a/src/xccl/XPUSymmetricMemoryTypes.hpp b/src/xccl/XPUSymmetricMemoryTypes.hpp
new file mode 100644
index 0000000000..4cab3b81f7
--- /dev/null
+++ b/src/xccl/XPUSymmetricMemoryTypes.hpp
@@ -0,0 +1,8 @@
+#pragma once
+
+namespace c10d::symmetric_memory {
+
+constexpr size_t signal_pad_size = 2048;
+using HandleType = void*;
+
+} // namespace c10d::symmetric_memory
diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp
new file mode 100644
index 0000000000..7130fe7b6a
--- /dev/null
+++ b/src/xccl/XPUSymmetricMemoryUtils.cpp
@@ -0,0 +1,76 @@
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace c10d::symmetric_memory {
+
+std::string getSymmMemBackendXPU() {
+  static auto val = c10::utils::get_env("TORCH_SYMMMEM");
+  if (val.has_value()) {
+    TORCH_CHECK(
+        val.value() == "XPU",
+        "TORCH_SYMMMEM environment variable must be 'XPU'.");
+    return val.value();
+  }
+  return "XPU";
+}
+
+bool device_has_multicast_support(int device_idx) {
+  return false;
+}
+
+bool allow_overlapping_devices() {
+  return false;
+}
+
+void map_block(
+    void** ptr,
+    ze_physical_mem_handle_t handle,
+    size_t size,
+    int device_idx) {
+  sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue();
+  sycl::context sycl_ctx = current_queue.get_context();
+  ze_context_handle_t ze_context =
+      sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_ctx);
+  // 1. Reserve virtual address space
+  void* virtual_ptr = nullptr;
+  ze_result_t status = zeVirtualMemReserve(
+      ze_context, // context
+      nullptr, // let L0 pick virtual address
+      size, // size
+      &virtual_ptr // out: reserved address
+  );
+  TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemReserve failed");
+
+  // 2. Map physical memory to virtual address
+  status = zeVirtualMemMap(
+      ze_context,
+      virtual_ptr, // virtual memory to map to
+      size,
+      handle, // physical memory handle
+      0, // offset into the physical allocation
+      ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE // ze_memory_access_attribute_t
+  );
+  TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemMap failed");
+
+  // 3. Set access attributes
+  ze_memory_access_attribute_t access = ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE;
+  status =
+      zeVirtualMemSetAccessAttribute(ze_context, virtual_ptr, size, access);
+  TORCH_CHECK(
+      status == ZE_RESULT_SUCCESS, "zeVirtualMemSetAccessAttribute failed");
+
+  // 4. Return the mapped pointer
+  *ptr = virtual_ptr;
+}
+
+} // namespace c10d::symmetric_memory
diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp
new file mode 100644
index 0000000000..69189f45cf
--- /dev/null
+++ b/src/xccl/XPUSymmetricMemoryUtils.hpp
@@ -0,0 +1,89 @@
+#pragma once
+#include
+#include
+#include
+
+namespace c10d {
+namespace symmetric_memory {
+
+std::string getSymmMemBackendXPU();
+
+bool device_has_multicast_support(int device_idx);
+
+bool allow_overlapping_devices();
+
+// A set of store-based exchange methods with a preset prefix, typically the
+// type of the SymmetricMemory. Mostly used as static instances in the
+// respective SymmetricMemory implementation files.
+class StoreExchange {
+ public:
+  StoreExchange(const std::string& store_prefix)
+      : store_prefix_(store_prefix) {}
+
+  // Defined in the header so the template is visible to every translation
+  // unit that uses it.
+  template <typename T>
+  std::vector<T> all_gather(
+      const c10::intrusive_ptr<c10d::Store>& store,
+      int rank,
+      int world_size,
+      T val) {
+    static_assert(std::is_trivially_copyable_v<T>);
+
+    std::vector<std::string> peer_keys;
+    peer_keys.reserve(world_size);
+    for (int r = 0; r < world_size; ++r) {
+      std::ostringstream oss;
+      oss << store_prefix_ << "/" << seq_id_ << "/" << r;
+      peer_keys.push_back(oss.str());
+    }
+    ++seq_id_;
+
+    {
+      std::vector<uint8_t> payload(
+          reinterpret_cast<uint8_t*>(&val),
+          reinterpret_cast<uint8_t*>(&val) + sizeof(T));
+      store->set(peer_keys[rank], payload);
+    }
+
+    std::vector<T> peer_vals;
+    peer_vals.reserve(world_size);
+    for (int r = 0; r < world_size; ++r) {
+      if (r == rank) {
+        peer_vals.push_back(val);
+        continue;
+      }
+      store->wait({peer_keys[r]});
+      auto payload = store->get(peer_keys[r]);
+      TORCH_CHECK(payload.size() == sizeof(T));
+      T peer_val{};
+      std::memcpy(&peer_val, payload.data(), sizeof(T));
+      peer_vals.push_back(peer_val);
+    }
+    return peer_vals;
+  }
+
+  void barrier(
+      const c10::intrusive_ptr<c10d::Store>& store,
+      int rank,
+      int world_size) {
+    // TODO: implement an efficient one?
+    all_gather(store, rank, world_size, 0);
+  }
+
+ private:
+  const std::string store_prefix_;
+  size_t seq_id_ = 0;
+};
+
+// Returns a pointer to a virtual address that is mapped to the physical
+// memory held by the handle.
+// TODO: switch to exchanging physical memory handles mapped into virtual
+// addresses once L0 provides a physical-handle exchange API and multicast
+// support is available.
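+// The current implementation reserves a virtual address range with
+// zeVirtualMemReserve, maps it with zeVirtualMemMap, and sets access rights
+// with zeVirtualMemSetAccessAttribute (see XPUSymmetricMemoryUtils.cpp).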
+void map_block( + void** ptr, + ze_physical_mem_handle_t handle, + size_t size, + int device_idx); + +} // namespace symmetric_memory +} // namespace c10d diff --git a/src/xccl/ze_symbol.hpp b/src/xccl/ze_symbol.hpp new file mode 100644 index 0000000000..20af666811 --- /dev/null +++ b/src/xccl/ze_symbol.hpp @@ -0,0 +1,254 @@ +#pragma once + +#include +#include +#include +#include + +#define zeVirtualMemMap zeVirtualMemMap_original +#define zeVirtualMemReserve zeVirtualMemReserve_original +#define zeVirtualMemSetAccessAttribute zeVirtualMemSetAccessAttribute_original + +#include + +#undef zeVirtualMemMap +#undef zeVirtualMemReserve +#undef zeVirtualMemSetAccessAttribute + +typedef ze_result_t (*zeInit_t)(ze_init_flags_t flags); +typedef ze_result_t (*zeMemGetAddressRange_t)( + ze_context_handle_t hContext, + const void* ptr, + void** pBase, + size_t* pSize); +typedef ze_result_t (*zeMemGetIpcHandle_t)( + ze_context_handle_t hContext, + const void* ptr, + ze_ipc_mem_handle_t* pIpcHandle); +typedef ze_result_t (*zeMemOpenIpcHandle_t)( + ze_context_handle_t hContext, + ze_device_handle_t hDevice, + ze_ipc_mem_handle_t handle, + ze_ipc_memory_flags_t flags, + void** pptr); +typedef ze_result_t ( + *zeMemCloseIpcHandle_t)(ze_context_handle_t hContext, const void* ptr); +typedef ze_result_t (*zeVirtualMemMap_t)( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_physical_mem_handle_t hPhysicalMemory, + size_t offset, + ze_memory_access_attribute_t access); +typedef ze_result_t (*zeVirtualMemReserve_t)( + ze_context_handle_t hContext, + const void* pStart, + size_t size, + void** pptr); +typedef ze_result_t (*zeVirtualMemSetAccessAttribute_t)( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_memory_access_attribute_t access); + +bool load_level_zero_library(); +void unload_level_zero_library(); + +#define zeCheck_dynamic(x) \ + do { \ + if (!load_level_zero_library()) { \ + throw std::runtime_error("Level Zero library not available"); \ + } \ + ze_result_t result = (x); \ + if (result != ZE_RESULT_SUCCESS) { \ + auto e = zeException(result); \ + std::cout << "Throw " << e.what() << std::endl; \ + throw e; \ + } \ + } while (0) + +#define zeInit_dynamic(flags) zeInit_ptr(flags) +#define zeMemGetAddressRange_dynamic(ctx, ptr, base, size) \ + zeMemGetAddressRange_ptr(ctx, ptr, base, size) +#define zeMemGetIpcHandle_dynamic(ctx, ptr, handle) \ + zeMemGetIpcHandle_ptr(ctx, ptr, handle) +#define zeMemOpenIpcHandle_dynamic(ctx, dev, handle, flags, ptr) \ + zeMemOpenIpcHandle_ptr(ctx, dev, handle, flags, ptr) +#define zeMemCloseIpcHandle_dynamic(ctx, ptr) zeMemCloseIpcHandle_ptr(ctx, ptr) +#define zeVirtualMemMap_dynamic(ctx, ptr, size, phys_mem, offset, access) \ + zeVirtualMemMap_ptr(ctx, ptr, size, phys_mem, offset, access) +#define zeVirtualMemReserve_dynamic(ctx, start, size, ptr) \ + zeVirtualMemReserve_ptr(ctx, start, size, ptr) +#define zeVirtualMemSetAccessAttribute_dynamic(ctx, ptr, size, access) \ + zeVirtualMemSetAccessAttribute_ptr(ctx, ptr, size, access) + +// Exception handling class +class zeException : std::exception { + const char* zeResultToString(ze_result_t status) const { + static const std::unordered_map zeResultToStringMap{ + {ZE_RESULT_SUCCESS, "[Core] success"}, + {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, + {ZE_RESULT_ERROR_UNINITIALIZED, + "[Validation] driver is not initialized"}, + {ZE_RESULT_ERROR_INVALID_NULL_POINTER, + "[Validation] pointer argument may not be nullptr"}, + 
{ZE_RESULT_ERROR_INVALID_NULL_HANDLE, + "[Validation] handle argument is not valid"}, + {ZE_RESULT_ERROR_INVALID_ENUMERATION, + "[Validation] enumerator argument is not valid"}, + {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, + {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, + "[Validation] size argument is not supported by the device"}, + {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, + "[Validation] alignment argument is not supported by the device"}, + {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, + "[Validation] generic error code for unsupported features"}, + {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, + "[Validation] native binary is not supported by the device"}, + {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, + "[Core] insufficient host memory to satisfy call"}, + {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, + "[Core] insufficient device memory to satisfy call"}, + {ZE_RESULT_ERROR_DEVICE_LOST, + "[Core] device hung, reset, was removed, or driver update occurred"}, + {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, + "[Core] error occurred when building module, see build log for details"}, + {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, + "[Validation] object pointed to by handle still in-use by device"}, + }; + auto it = zeResultToStringMap.find(status); + if (it != zeResultToStringMap.end()) + return it->second; + else + return "Unknown Reason"; + } + + public: + zeException(ze_result_t ret) : result_(ret) {} + + ze_result_t result_; + + const char* what() const noexcept override { + return zeResultToString(result_); + } +}; + +#define zeCheck(x) \ + if (x != ZE_RESULT_SUCCESS) { \ + auto e = zeException(x); \ + std::cout << "Throw " << e.what() << std::endl; \ + throw e; \ + } + +static zeInit_t zeInit_ptr = nullptr; +static zeMemGetAddressRange_t zeMemGetAddressRange_ptr = nullptr; +static zeMemGetIpcHandle_t zeMemGetIpcHandle_ptr = nullptr; +static zeMemOpenIpcHandle_t zeMemOpenIpcHandle_ptr = nullptr; +static zeMemCloseIpcHandle_t zeMemCloseIpcHandle_ptr = nullptr; +static zeVirtualMemMap_t zeVirtualMemMap_ptr = nullptr; +static zeVirtualMemReserve_t zeVirtualMemReserve_ptr = nullptr; +static zeVirtualMemSetAccessAttribute_t zeVirtualMemSetAccessAttribute_ptr = + nullptr; + +static void* ze_handle = nullptr; + +inline bool load_level_zero_library() { + if (ze_handle != nullptr) { + return true; + } + const char* lib_names[] = {"libze_loader.so"}; + + for (const char* lib_name : lib_names) { + ze_handle = dlopen(lib_name, RTLD_LAZY); + if (ze_handle != nullptr) { + break; + } + } + + if (ze_handle == nullptr) { + std::cerr << "Failed to load Level Zero library: " << dlerror() + << std::endl; + return false; + } + + zeInit_ptr = (zeInit_t)dlsym(ze_handle, "zeInit"); + zeMemGetAddressRange_ptr = + (zeMemGetAddressRange_t)dlsym(ze_handle, "zeMemGetAddressRange"); + zeMemGetIpcHandle_ptr = + (zeMemGetIpcHandle_t)dlsym(ze_handle, "zeMemGetIpcHandle"); + zeMemOpenIpcHandle_ptr = + (zeMemOpenIpcHandle_t)dlsym(ze_handle, "zeMemOpenIpcHandle"); + zeMemCloseIpcHandle_ptr = + (zeMemCloseIpcHandle_t)dlsym(ze_handle, "zeMemCloseIpcHandle"); + zeVirtualMemMap_ptr = (zeVirtualMemMap_t)dlsym(ze_handle, "zeVirtualMemMap"); + zeVirtualMemReserve_ptr = + (zeVirtualMemReserve_t)dlsym(ze_handle, "zeVirtualMemReserve"); + zeVirtualMemSetAccessAttribute_ptr = (zeVirtualMemSetAccessAttribute_t)dlsym( + ze_handle, "zeVirtualMemSetAccessAttribute"); + + if (!zeInit_ptr || !zeMemGetAddressRange_ptr || !zeMemGetIpcHandle_ptr || + !zeMemOpenIpcHandle_ptr || !zeMemCloseIpcHandle_ptr || + !zeVirtualMemMap_ptr || !zeVirtualMemReserve_ptr || + 
!zeVirtualMemSetAccessAttribute_ptr) { + std::cerr << "Failed to load Level Zero API functions" << std::endl; + dlclose(ze_handle); + ze_handle = nullptr; + return false; + } + + return true; +} + +inline void unload_level_zero_library() { + if (ze_handle != nullptr) { + dlclose(ze_handle); + ze_handle = nullptr; + zeInit_ptr = nullptr; + zeMemGetAddressRange_ptr = nullptr; + zeMemGetIpcHandle_ptr = nullptr; + zeMemOpenIpcHandle_ptr = nullptr; + zeMemCloseIpcHandle_ptr = nullptr; + zeVirtualMemMap_ptr = nullptr; + zeVirtualMemReserve_ptr = nullptr; + zeVirtualMemSetAccessAttribute_ptr = nullptr; + } +} + +extern "C" { + +__attribute__((weak)) ze_result_t zeVirtualMemMap( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_physical_mem_handle_t hPhysicalMemory, + size_t offset, + ze_memory_access_attribute_t access) { + if (!load_level_zero_library() || !zeVirtualMemMap_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; + } + return zeVirtualMemMap_ptr( + hContext, ptr, size, hPhysicalMemory, offset, access); +} + +__attribute__((weak)) ze_result_t zeVirtualMemReserve( + ze_context_handle_t hContext, + const void* pStart, + size_t size, + void** pptr) { + if (!load_level_zero_library() || !zeVirtualMemReserve_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; + } + return zeVirtualMemReserve_ptr(hContext, pStart, size, pptr); +} + +__attribute__((weak)) ze_result_t zeVirtualMemSetAccessAttribute( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_memory_access_attribute_t access) { + if (!load_level_zero_library() || !zeVirtualMemSetAccessAttribute_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; + } + return zeVirtualMemSetAccessAttribute_ptr(hContext, ptr, size, access); +} +} diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 916524073c..02b6e4e59d 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -12,6 +12,7 @@ import torch import torch.distributed as c10d +import torch.distributed._functional_collectives as _functional_collectives if not c10d.is_available() or not c10d.is_xccl_available(): print("c10d XCCL not available, skipping tests", file=sys.stderr) @@ -364,6 +365,31 @@ def test_nan_assert(self, type): # reset env os.environ["TORCH_XCCL_NAN_CHECK"] = "0" + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_oom(self): + pg = self._create_process_group_xccl() + dp_ranks = range(0, self.world_size) + dp_group = c10d.new_group(dp_ranks) + device = torch.device(f"xpu:{self.rank}") + torch.xpu.set_device(device) + + shape = (16384 * 2, 16384 * 2) + weight = torch.ones(shape, device=device).half() + gradient = torch.zeros(shape, device=device).half() + ret = torch.randn(shape, device=device).half() + + for iter in range(50): + output = torch.empty_like(ret) + output = ret + weight + gradient + ret = torch.nn.functional.linear(output, weight=ret) + dist.all_reduce(ret, op=dist.ReduceOp.SUM) + torch.xpu.synchronize() + self.assertLess( + torch.xpu.max_memory_allocated(), + torch.xpu.max_memory_reserved() * 2, + ) + class CommTest(MultiProcessTestCase): @property @@ -626,6 +652,86 @@ def test_all_gather_into_tensor(self): tensor.view(torch.float32), ) + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_unwaited(self) -> None: + # Verify that the process can terminate gracefully + # even with unwaited tensors + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", rank=self.rank, world_size=self.world_size, store=store 
+ ) + + # Case 1: Run collectives under context manager, and don't call wait on them. + with _functional_collectives.allow_inflight_collective_as_graph_input_ctx(): + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + input = torch.full( + (10240, 10240), float(self.rank), device=f"xpu:{self.rank}" + ) + dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) + # Non-functional collectives run under the context manager is registered in the work registry. + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) + # Running another collective on the same tensor should still work + dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) + + # Case 2: Run collectives not under context manager, and don't call wait on them. + # NOTE: Here we intentionally test memory-stressed case. + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) + for _ in range(50000): + input = torch.full( + (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" + ) + dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) + # Work registry size is unchanged, since non-functional collectives not run under + # the context manager is not registered in the work registry. + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_wait_tensor(self) -> None: + # Verify that c10d_functional.wait_tensor() can be invoked on + # output tensor of non-functional collective + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", rank=self.rank, world_size=self.world_size, store=store + ) + + # Case 1: under context manager (i.e. work is registered in registry) + with _functional_collectives.allow_inflight_collective_as_graph_input_ctx(): + input1 = torch.full((10, 10), float(self.rank), device=f"xpu:{self.rank}") + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + dist.all_reduce(input1, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) + torch.ops.c10d_functional.wait_tensor(input1) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + + input2 = torch.full((10, 10), float(self.rank), device=f"xpu:{self.rank}") + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + work = dist.all_reduce(input2, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) + work.wait() + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + self.assertEqual(input1, input2) + + # Case 2: not under context manager (i.e. 
work is not registered in registry) + input1 = torch.full((10, 10), float(self.rank), device=f"xpu:{self.rank}") + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + dist.all_reduce(input1, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + # this does not take effect, since the underlying wait_tensor() logic would not + # be able to find the corresponding work object (because it's not registered in registry) + torch.ops.c10d_functional.wait_tensor(input1) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + + input2 = torch.full((10, 10), float(self.rank), device=f"xpu:{self.rank}") + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + work = dist.all_reduce(input2, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + work.wait() + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + self.assertEqual(input1, input2) + instantiate_parametrized_tests(ProcessGroupXCCLTest) diff --git a/test/xpu/distributed/test_symmetric_memory_xccl.py b/test/xpu/distributed/test_symmetric_memory_xccl.py new file mode 100644 index 0000000000..37f5d3e6da --- /dev/null +++ b/test/xpu/distributed/test_symmetric_memory_xccl.py @@ -0,0 +1,85 @@ +import torch +import torch.distributed as dist +from test_c10d_xccl import init_multigpu_helper, requires_xccl +from torch.distributed._symmetric_memory import ( + _fused_all_gather_matmul_fallback, + _fused_matmul_reduce_scatter_fallback, +) + +from torch.testing._internal.common_distributed import MultiProcContinuousTest +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests +) + +@instantiate_parametrized_tests +class AsyncTPTest(MultiProcContinuousTest): + @property + def device(self) -> torch.device: + return torch.device("xpu", self.rank) + + def _init_process(self): + torch.xpu.set_device(self.device) + torch.manual_seed(42 + self.rank) + torch.use_deterministic_algorithms(True) + torch.set_deterministic_debug_mode("warn") + torch.utils.deterministic.fill_uninitialized_memory = True + + @requires_xccl() + @parametrize("gather_dim", [0, 1]) + def test_fused_all_gather_matmul(self, gather_dim: int) -> None: + self._init_process() + BATCH = 8 + M = 64 + N = 16 + K = 32 + group = dist.group.WORLD + rank = self.rank + + torch.manual_seed(42 + rank) + A_shard = torch.rand(BATCH, M // self.world_size, K, device="xpu") + Bs = [torch.rand(K, N, device="xpu") for _ in range(3)] + + ag_output_0, mm_outputs_0 = _fused_all_gather_matmul_fallback( + A_shard, Bs, gather_dim=gather_dim, group_name=group.group_name + ) + ag_output_1, mm_outputs_1 = torch.ops.symm_mem.fused_all_gather_matmul( + A_shard, Bs, gather_dim=gather_dim, group_name=group.group_name + ) + + self.assertEqual(ag_output_0, ag_output_1) + self.assertEqual(ag_output_0.stride(), ag_output_1.stride()) + for mm_output_0, mm_output_1 in zip(mm_outputs_0, mm_outputs_1): + self.assertEqual(mm_output_0, mm_output_1) + self.assertEqual(mm_output_0.stride(), mm_output_1.stride()) + + @requires_xccl() + @parametrize("scatter_dim", [0, 1]) + def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None: + self._init_process() + + BATCH = 8 + M = 64 + N = 16 + K = 32 + group = dist.group.WORLD + rank = self.rank + + torch.manual_seed(42 + rank) + A = torch.rand(BATCH, M, K, device="xpu") + B = torch.rand(K, N, device="xpu") + + output_0 = 
_fused_matmul_reduce_scatter_fallback( + A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name + ) + output_1 = torch.ops.symm_mem.fused_matmul_reduce_scatter( + A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name + ) + + self.assertEqual(output_0, output_1) + self.assertEqual(output_0.stride(), output_1.stride()) + + +if __name__ == "__main__": + run_tests() diff --git a/test/xpu/extended/run_test_with_skip_mtl.py b/test/xpu/extended/run_test_with_skip_mtl.py index e2683b64ce..4b6eaa5960 100644 --- a/test/xpu/extended/run_test_with_skip_mtl.py +++ b/test/xpu/extended/run_test_with_skip_mtl.py @@ -19,5 +19,6 @@ os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +test_command.extend(["--junit-xml", "./op_extended.xml"]) res = pytest.main(test_command) sys.exit(res) diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index ddde5f8c8a..4965406165 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -26,6 +26,10 @@ def run(test_command): test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) +test_command = ["python", "distributed/test_c10d_xccl.py"] +res += run(test_command) +test_command = ["python", "distributed/test_symmetric_memory_xccl.py"] +res += run(test_command) # run pytest with skiplist for key in skip_dict: diff --git a/test/xpu/run_test_win_with_skip_mtl.py b/test/xpu/run_test_win_with_skip_mtl.py index 4ba56dba33..06a4849a6c 100644 --- a/test/xpu/run_test_win_with_skip_mtl.py +++ b/test/xpu/run_test_win_with_skip_mtl.py @@ -18,6 +18,7 @@ sys.stdout = StringIO() test_command = ["-k", skip_options, "../../../../test/test_xpu.py", "-v"] +test_command.extend(["--junit-xml", "./test_xpu.xml"]) res = pytest.main(test_command) output = sys.stdout.getvalue()
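
For readers who want to exercise the new symmetric-memory path outside the pytest harness, the following is a minimal, hypothetical sketch (not part of this patch). It mirrors the call sites in test/xpu/distributed/test_symmetric_memory_xccl.py and assumes an XCCL-enabled build, one XPU per rank, a 2-rank torchrun launch, and that torch.ops.symm_mem.fused_all_gather_matmul keeps the schema used by the test; the script name and main() wrapper are illustrative only.

# demo_fused_all_gather_matmul.py (hypothetical file name)
# Launch with: torchrun --nproc-per-node=2 demo_fused_all_gather_matmul.py
import os

import torch
import torch.distributed as dist


def main() -> None:
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    # torchrun provides MASTER_ADDR/MASTER_PORT, so the default env:// rendezvous works.
    dist.init_process_group(backend="xccl", rank=rank, world_size=world_size)
    device = torch.device(f"xpu:{rank}")
    torch.xpu.set_device(device)

    # Same shapes as the test: A is sharded along M, B is replicated.
    batch, m, n, k = 8, 64, 16, 32
    a_shard = torch.rand(batch, m // world_size, k, device=device)
    b = torch.rand(k, n, device=device)

    group_name = dist.group.WORLD.group_name
    ag_out, mm_outs = torch.ops.symm_mem.fused_all_gather_matmul(
        a_shard, [b], gather_dim=1, group_name=group_name
    )
    # ag_out is the all-gathered A; mm_outs[0] is ag_out @ b.
    print(rank, tuple(ag_out.shape), tuple(mm_outs[0].shape))

    dist.destroy_process_group()


if __name__ == "__main__":
    main()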