diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8cd490bc8..5778b5fd8 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -1,22 +1,47 @@ import argparse import sys import os +import re from junitparser import JUnitXml, Error, Failure, Skipped -parser = argparse.ArgumentParser() -parser.add_argument('junitxml', nargs='+') +parser = argparse.ArgumentParser(description='Test results analyzer') +parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files') args = parser.parse_args() failures = [] -suites = [] +summaries = [] + +error_types = [ + "RuntimeError", + "ValueError", + "TypeError", + "AttributeError", + "KeyError", + "IndexError", + "ImportError", + "AssertionError", + "Exception", + "OSError", + "Failed", + "TimeoutError", + "asyncio.TimeoutError", + "FileNotFoundError", + "PermissionError", + "NotImplementedError", +] def get_classname(case): - return ' '.join(case.classname.split()) + return ' '.join(case.classname.split()) if hasattr(case, 'classname') else case.get('classname', '') def get_name(case): + if isinstance(case, dict): + return case.get('name', '') return ' '.join(case.name.split()) def get_result(case): + if isinstance(case, dict): + return case.get('status', 'failed') + result = "passed" if case.result: if isinstance(case.result[0], Error): @@ -28,88 +53,252 @@ def get_result(case): return result def get_message(case): + if isinstance(case, dict): + return case.get('error', '') + if not case.result: return "" - return f"{case.result[0].message.splitlines()[0]}" -def print_md_row(row, print_header): + full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message + if not full_text: + return "" + + error_messages = [] + capture_next_lines = False + indent_level = 0 + + for line in full_text.splitlines(): + stripped_line = line.strip() + if not stripped_line: + continue + + for error_type in error_types: + if stripped_line.startswith(error_type + ": "): + error_msg = stripped_line[len(error_type)+2:] + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: + error_msg = stripped_line.split(f'{error_type}:')[-1].strip() + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + + return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" + + +def print_md_row(row, print_header=False, fail_list=None): if print_header: - header = " | ".join([f"{key}" for key, _ in row.items()]) + header = " | ".join([f"{key}" for key in row.keys()]) print(f"| {header} |") - header = " | ".join(["-"*len(key) for key, _ in row.items()]) + header = " | ".join(["---"] * len(row)) print(f"| {header} |") - row = " | ".join([f"{value}" for _, value in row.items()]) - print(f"| {row} |") + row_values = " | ".join([f"{value}" for value in row.values()]) + print(f"| {row_values} |") -def print_cases(cases): - print_header = True - for case in cases: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - message = get_message(case) - row = { - 'Class name': classname, - 'Test name': name, - 'Status': result, - 'Message': message, - } - print_md_row(row, print_header) - print_header = False + if fail_list != None: + fail_list.write(f"| {row_values} |\n") + + +def get_similar_issues(classname, name, result, message): + import requests + + 
os.environ["http_proxy"] = "" + os.environ["https_proxy"] = "" + DEFAULT_HOST_IP = "10.112.100.138" + + def QnA(request, host_ip=DEFAULT_HOST_IP): + import json + url = f"http://{host_ip}:8888/v1/chatqna" + + headers = {"Content-Type": "application/json"} + + response = requests.post(url, headers=headers, json=request) + return response + + prompt = f"unit test {name} {result} with {message}, is it a known issue? If yes, what is the issue id? And what is the owner and root cuase?" + + request = { + "messages": prompt, + "stream": False + } + + response = QnA (request) + if response.status_code==200: + result = response.json()["choices"][0]["message"]["content"] + answer = result.split("")[-1].strip() + answer = answer.split("**Answer:**")[-1].strip() + return answer + return "" + +def print_failures(): + if not failures: + return + + with open("ut_failure_list.csv", "w") as fail_list: + fail_list.write("sep=\'|\''.\n") + + print("### Test Failures") + print_header = True + for case in failures: + #issue = get_similar_issues(get_classname(case), get_name(case), get_result(case), get_message(case)) + print_md_row({ + 'Class name': get_classname(case), + 'Test name': get_name(case), + 'Status': get_result(case), + 'Message': get_message(case), + #'Similar issue': issue, + 'Source': case['source'] if isinstance(case, dict) else 'XML' + }, print_header, fail_list) -def print_suite(suite): + print_header = False + +def parse_log_file(log_file): + with open(log_file, encoding='utf-8') as f: + content = f.read() + + ut_name = os.path.splitext(os.path.basename(log_file))[0] + summary = { + 'Category': determine_category(ut_name), + 'UT': ut_name, + 'Test cases': 0, + 'Passed': 0, + 'Skipped': 0, + 'Failures': 0, + 'Errors': 0, + 'Source': 'Log' + } + + # Extract test counts + test_run_match = re.search(r"Ran (\d+) tests in [\d.]+s", content) + if test_run_match: + summary['Test cases'] = int(test_run_match.group(1)) + + # Extract skipped case number + skipped_match = re.search(r"skipped[ =](\d+)", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + else: + skipped_match = re.search(r"skipped (\d+) cases?", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + + # Extract failures + failure_blocks = re.findall(r"(FAIL:.*?)(?:\n\n|\n=+\n|\Z)", content, re.DOTALL) + exist_test_names = set() + failures_number = 0 + + for block in failure_blocks: + case_match = re.match(r"FAIL: (\w+) \(__mp_main__\.(\w+)\)", block) + if not case_match: + continue + + test_name = case_match.group(1) + if test_name in exist_test_names: + continue + exist_test_names.add(test_name) + + error_msg = [] + error_pattern = r"(" + "|".join(error_types) + r"):.*?(?=\n\S|\n\n|\n=+\n|\Z)" + error_matches = re.finditer(error_pattern, block, re.DOTALL) + if not error_matches and "Traceback" in block: + error_msg.append("Unknown error (see traceback)") + else: + for match in error_matches: + error_msg.append(match.group(0).strip()) + + failures.append({ + 'classname': ut_name, + 'name': f"{case_match.group(2)}:{test_name}", + 'error': " ".join(error_msg), + 'status': 'failed', + 'source': 'Log' + }) + failures_number += 1 + + if failures_number > summary['Failures']: + summary['Failures'] = failures_number + summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped'] + + return summary + +def determine_category(ut): + if ut == 'op_regression': + return 'op_regression' + elif ut == 'op_regression_dev1': + return 
'op_regression_dev1' + elif ut == 'op_extended': + return 'op_extended' + elif 'op_ut' in ut: + return 'op_ut' + else: + return 'unknown' + +def process_log_file(log_file): + try: + summary = parse_log_file(log_file) + summaries.append(summary) + except Exception as e: + print(f"Error processing {log_file}: {e}", file=sys.stderr) + +def process_xml_file(xml_file): + try: + xml = JUnitXml.fromfile(xml_file) + ut = os.path.basename(xml_file).split('.')[0] + category = determine_category(ut) + + for suite in xml: + suite_summary = { + 'Category': category, + 'UT': ut, + 'Test cases': suite.tests, + 'Passed': suite.tests - suite.skipped - suite.failures - suite.errors, + 'Skipped': suite.skipped, + 'Failures': suite.failures, + 'Errors': suite.errors, + 'Source': 'XML' + } + summaries.append(suite_summary) + + for case in suite: + if get_result(case) not in ["passed", "skipped"]: + failures.append(case) + except Exception as e: + print(f"Error processing {xml_file}: {e}", file=sys.stderr) + +def print_summary(): + print("### Results Summary") print_header = True - for suite in suites: - ut = args.junitxml[0] - del(args.junitxml[0]) - ut = os.path.basename(ut).split('.')[0] - tests = suite.tests - skipped = suite.skipped - failures = suite.failures - errors = suite.errors - if ut == 'op_regression': - category = 'op_regression' - elif ut == 'op_regression_dev1': - category = 'op_regression_dev1' - elif ut == 'op_extended': - category = 'op_extended' - elif 'op_ut' in ut: - category = 'op_ut' - row = { - 'Category': category, - 'UT': ut, - 'Test cases': tests, - 'Passed': tests-skipped-failures-errors, - 'Skipped': skipped, - 'Failures': failures, - 'Errors': errors, - } - print_md_row(row, print_header) + + + for summary in summaries: + print_md_row({ + 'Category': summary['Category'], + 'UT': summary['UT'], + 'Test cases': summary['Test cases'], + 'Passed': summary['Passed'], + 'Skipped': summary['Skipped'], + 'Failures': summary['Failures'], + 'Errors': summary['Errors'], + 'Source': summary['Source'] + }, print_header) + print_header = False -xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ] -for idx, xml in enumerate(xmls): - for suite in xml: - suites.append(suite) - for case in suite: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - if result not in ["passed", "skipped"]: - failures.append(case) - -printed = False -def print_break(needed): - if needed: - print("") - -if failures: - print_break(printed) - print("### Failures") - print_cases(failures) - printed = True - -print("### Results Summary") -print_suite(suites) - -sys.exit(0) +def main(): + for input_file in args.input_files: + if input_file.endswith('.log'): + process_log_file(input_file) + elif input_file.endswith('.xml'): + process_xml_file(input_file) + else: + print(f"Skipping unknown file type: {input_file}", file=sys.stderr) + + print_failures() + print_summary() + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ 
"${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index b67be9f29..ae6c2064c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -65,7 +65,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py @@ -167,11 +171,11 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index b409d5774..afd8afdf0 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} @@ -95,7 +95,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -182,6 +182,18 @@ jobs: cd ../pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log cp *.xml ${{ github.workspace }}/ut_log + find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"op_ut_with_skip_quantization/core"*) + dir_name="op_ut_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log + cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. @@ -254,6 +266,7 @@ jobs: source activate xpu_op_${ZE_AFFINITY_MASK} pip install junitparser python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + if [ -f "ut_failure_list.csv"];then cp ut_failure_list.csv ${{ github.workspace }}/ut_log/. 
fi - name: UT Test Results Check shell: bash run: | @@ -336,7 +349,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -417,3 +430,284 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
+ else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log + - name: Upload XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log/ut_failure_list.csv diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3dd204e32..0e9ee9f63 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test @@ -112,7 +137,7 @@ jobs: if: ${{ inputs.pytorch }} != 'nightly_wheel' uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..cc40373bc --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,125 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict, skip_dict_python + +res = 0 +res2 = 0 +fail_test = [] +error_log = "" + +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + + +from xpu_test_utils import launch_test + +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + + +for key in skip_dict_python: + skip_list = skip_dict_python[key] if 
skip_dict_python[key] else [] + test_command = ["python", key] + fail = run(test_command) + num_skipped = 0 + num_err = 0 + if fail.returncode: + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + else: + error_log += (line + "\n") + else: + error_log += ("FAIL: " + err) + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + + renamed_key = key.replace("../../../../", "").replace("/", "_") + if num_err > 0: + fail_test.append(key) + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) + else: + import pdb;pdb.set_trace() + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(fail.stdout) + f.write(fail.stderr) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res2) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..9ec4c59e0 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,465 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_ddp_parity_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + 
"test_bf16_hook_has_wrapping_False_sharding_strategy0", + "test_bf16_hook_has_wrapping_False_sharding_strategy1", + "test_bf16_hook_has_wrapping_False_sharding_strategy2", + "test_bf16_hook_has_wrapping_True_sharding_strategy0", + "test_bf16_hook_has_wrapping_True_sharding_strategy1", + "test_bf16_hook_has_wrapping_True_sharding_strategy2", + "test_fp16_hook_has_wrapping_False_sharding_strategy1", + "test_fp16_hook_has_wrapping_False_sharding_strategy2", + "test_fp16_hook_has_wrapping_True_sharding_strategy0", + "test_fp16_hook_has_wrapping_True_sharding_strategy1", + "test_fp16_hook_has_wrapping_True_sharding_strategy2", + ), + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + "test_delayed_optim_step_offload_false_no_shard_xpu", + "test_delayed_optim_step_offload_false_none_xpu", + "test_delayed_optim_step_offload_false_shard_grad_op_xpu", + "test_delayed_optim_step_offload_true_none_xpu", + "test_delayed_optim_step_offload_true_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_false_no_shard_xpu", + "test_delayed_reduce_scatter_offload_false_none_xpu", + "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_true_none_xpu", + "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_offload_false_no_shard_xpu", + "test_mixture_of_experts_offload_false_none_xpu", + "test_mixture_of_experts_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_offload_true_none_xpu", + "test_mixture_of_experts_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_false_no_shard_xpu", + "test_nested_always_wrap_model_offload_false_none_xpu", + "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_true_none_xpu", + "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_false_no_shard_xpu", + "test_nested_wrapped_model_offload_false_none_xpu", + "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_true_none_xpu", + "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", + "test_transformer_offload_false_none_xpu", + "test_transformer_offload_false_shard_grad_op_xpu", + "test_transformer_offload_true_none_xpu", + "test_transformer_offload_true_shard_grad_op_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_hooks_multi_traversal_xpu", + "test_parity_with_ddp_xpu", + "test_parity_with_non_frozen_fsdp_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + ), + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_fsdp_zero2_eval_with_prefetch", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_optimizer_overlap", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_multi_forward_cpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_flatten_sharded_optim_state_dict_nested", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + 
"test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_names", + "test_scatter_full_optim_state_dict_nested_halve_world_size", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_halve_world_size", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + 
"test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_use_orig_params", + ), + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_forward_overlap", + # "test_forward_overlap_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", + ), + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_state_dict_save_load_flow_state_dict_type_local_state_dict", + "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", + "test_state_dict_save_load_flow_state_dict_type_state_dict", + ), + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_diff_hyperparams_sharding_strategy_str_full_shard", + "test_diff_hyperparams_sharding_strategy_str_no_shard", + "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", + "test_no_sync_correctness", + ), + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ # https://github.com/intel/torch-xpu-ops/issues/1509 + "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_stage_backward_weight_multiple_iters_xpu", + "test_stage_backward_weight_xpu", + "test_stage_backward_xpu", + ), + "../../../../test/distributed/pipelining/test_microbatch.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_chunk_spec_xpu", + ), + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' + # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. 
+ # https://github.com/intel/torch-xpu-ops/issues/1551 + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555 + "test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. + # https://github.com/intel/torch-xpu-ops/issues/1556 + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + ), + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": ( + # Passed with updated test code for world_size 8 + "test_auto_implicit_replication", + "test_default_value_sub_mesh", + "test_device_mesh_nd", + "test_dtensor_2d_mesh", + "test_dtensor_api_device_mesh_context_manager", + "test_dtensor_device_mesh_device_conversion", + "test_dtensor_spec_local_shard_offset", + "test_from_local_sub_mesh", + "test_implicit_replication", + "test_metadata_consistency_check", + "test_redistribute_sub_mesh", + "test_split_tensor_1D", + ), + "../../../../test/distributed/tensor/test_dtensor_compile.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_2d_fsdp_tp_compile", + ), + "../../../../test/distributed/tensor/test_experimental_ops.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_bernoulli", + ), + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_mean", + "test_nll_loss_and_cross_entropy", + ), + "../../../../test/distributed/tensor/test_random_ops.py": ( + # Need to update world 
size + "test_hsdp_tp_model_meta_init", + ), + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_redistribute_shard_dim_multi_dim_mesh", + ), + "../../../../test/distributed/tensor/test_tensor_ops.py": None, + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, + "../../../../test/distributed/_shard/test_sharder.py": None, + # FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_clip_grad_norm_2d", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1571 + "test_set_reduce_scatter_divide_factor", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_gradient_scaler", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_fully_shard_training_memory", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1572 + "test_dp_state_dict_cpu_offload", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_post_optim_event", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_train_parity_multi_group_unshard_async_op", + "test_train_parity_with_activation_checkpointing", + ), +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, +}
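
The reworked print_md_row() in check-ut.py emits one Markdown table row at a time for the GitHub step summary, printing the header and the "---" delimiter row only on the first call and optionally mirroring rows into the failure-list file. A compact standalone sketch of that pattern (the sample rows below are invented for illustration):

def print_md_row(row, print_header=False):
    # Print the header and Markdown delimiter row once, then only data rows.
    if print_header:
        print("| " + " | ".join(row.keys()) + " |")
        print("| " + " | ".join(["---"] * len(row)) + " |")
    print("| " + " | ".join(str(v) for v in row.values()) + " |")

# Invented example rows, mimicking the summary fields used by check-ut.py.
rows = [
    {"Category": "op_ut", "UT": "op_ut_with_skip", "Test cases": 100, "Failures": 2},
    {"Category": "op_extended", "UT": "op_extended", "Test cases": 50, "Failures": 0},
]
for i, row in enumerate(rows):
    print_md_row(row, print_header=(i == 0))
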
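
The new get_message() scans a failure's traceback text for lines beginning with a known exception type instead of keeping only the first line of the junit message. A minimal self-contained sketch of that idea; the error type list is abbreviated and the sample traceback is invented for demonstration:

# Abbreviated version of the error_types list from check-ut.py.
ERROR_TYPES = ["RuntimeError", "ValueError", "AssertionError", "TimeoutError"]

def extract_error_messages(traceback_text):
    """Collect 'Type: message' lines that start with a known error type."""
    messages = []
    for line in traceback_text.splitlines():
        line = line.strip()
        for error_type in ERROR_TYPES:
            if line.startswith(error_type + ": "):
                messages.append(line)
                break
    return " ; ".join(messages)

# Invented traceback text, for demonstration only.
sample_traceback = """Traceback (most recent call last):
  File "test_example.py", line 10, in test_add
    self.assertEqual(out, expected)
AssertionError: Tensor-likes are not close!
"""

print(extract_error_messages(sample_traceback))  # AssertionError: Tensor-likes are not close!
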
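
parse_log_file() builds the per-suite summary from plain unittest output using regular expressions: the "Ran N tests in ...s" line, the skipped count, and FAIL: blocks. A rough standalone illustration of the same matching on a fabricated log snippet; the real script additionally de-duplicates FAIL blocks per test name and extracts the error text for each case:

import re

# Fabricated unittest-style output, for illustration only.
log_text = """
FAIL: test_allreduce (__mp_main__.TestXCCL)
RuntimeError: something went wrong

----------------------------------------------------------------------
Ran 42 tests in 12.345s

FAILED (failures=1, skipped=3)
"""

summary = {"Test cases": 0, "Skipped": 0, "Failures": 0}

ran = re.search(r"Ran (\d+) tests in [\d.]+s", log_text)
if ran:
    summary["Test cases"] = int(ran.group(1))

skipped = re.search(r"skipped[ =](\d+)", log_text, re.IGNORECASE)
if skipped:
    summary["Skipped"] = int(skipped.group(1))

# Count distinct FAIL: blocks (the real script also parses the case names).
summary["Failures"] = len(re.findall(r"^FAIL: ", log_text, re.MULTILINE))
summary["Passed"] = summary["Test cases"] - summary["Failures"] - summary["Skipped"]

print(summary)  # {'Test cases': 42, 'Skipped': 3, 'Failures': 1, 'Passed': 38}
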
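
run_distributed_local.py selects the largest XeLink-connected card group from "xpu-smi topology -m" and exports it as ZE_AFFINITY_MASK before launching the suites. A simplified sketch of that selection on a made-up topology matrix; the real xpu-smi output also carries header and CPU-affinity rows that the script skips:

# Made-up "xpu-smi topology -m" style matrix, for illustration only:
# each row lists a GPU and its link type to every other GPU.
topology_lines = [
    "GPU 0/0 S  XL SYS SYS",
    "GPU 1/0 XL S  SYS SYS",
    "GPU 2/0 SYS SYS S  XL",
    "GPU 3/0 SYS SYS XL S",
]

groups = {}
for line in topology_lines:
    items = [x for x in line.split(" ") if x]
    gpu_index = items[1].split("/")[0]
    # Columns 2.. are link types; "S" (self) and "XL" (XeLink) belong to the group,
    # while "SYS" means only a system-level connection.
    affinity = [str(j - 2) for j, item in enumerate(items)
                if j >= 2 and "SYS" not in item and ("XL" in item or "S" in item)]
    groups[gpu_index] = ",".join(affinity)

# Pick the largest connected group, as the script does for ZE_AFFINITY_MASK.
ze_affinity_mask = max(groups.values(), key=len)
print(ze_affinity_mask)  # "0,1" in this toy layout (two groups of two; the first wins the tie)
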
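
Because skip_list_dist_local.py hard-codes relative paths from test/xpu into the stock PyTorch checkout, renamed or removed upstream test files would silently drop coverage. A possible sanity check, not part of this change and purely hypothetical, assuming it is run from test/xpu with the PyTorch checkout in the expected relative location (only skip_dict is checked here, since skip_dict_python mixes path conventions):

import os
import sys

from skip_list_dist_local import skip_dict

# Paths are relative to test/xpu, pointing into ../../../../test/distributed/...
missing = [path for path in skip_dict if not os.path.exists(path)]

if missing:
    print("Skip-list entries that do not resolve to a file:")
    for path in missing:
        print("  " + path)
    sys.exit(1)

print(f"All {len(skip_dict)} skip_dict entries resolve to existing files.")
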