intel · zxd1997066 · Sep 19, 2025 · Sep 25, 2025 · chuanqi129 · Sep 29, 2025
diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml
@@ -1,5 +1,11 @@
 name: Get Runner Infos
 
+inputs:
+  ut_name:  # "xpu_distributed" selects the multi-XPU-per-worker pytest layout
+    required: false
+    type: string
+    description: Which ut to launch
+
 outputs:
   runner_id:
     value: ${{ steps.runner.outputs.runner_id }}
@@ -57,22 +63,32 @@ runs:
           if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1}
         }' |wc -l)"
         cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')"
-        pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
-          if (x > 0) {
-            split(z, xpu_list, ",");
-            for (i=0;i<x;i++) {
-              if (z != "") {
-                  ze = xpu_list[i+1];
-              } else {
-                  ze = i;
-              }
-              printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
-                      ze, cx, i*cx, (i+1)*cx-1);
-            }
-          }else {
-            printf(" -n 1 ");
-          }
-        }')"
+        # Distributed UTs give every pytest-xdist worker a group of 4 XPUs; all other UTs use 1.
+        xpus_per_worker=1
+        if [ "${{ inputs.ut_name }}" == "xpu_distributed" ];then
+          xpus_per_worker=4
+        fi
+        # One --tx spec per worker. Each mask entry is read from ZE_AFFINITY_MASK when it is set
+        # (so non-contiguous masks are kept), and the last group shrinks when xpu_num is not a
+        # multiple of the group size, which keeps the numactl CPU range within cpu_num.
+        pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" -v g="${xpus_per_worker}" '{
+          if (x > 0) {
+            split(z, xpu_list, ",");
+            for (i=0;i<x;i=i+g) {
+              n = (x - i < g) ? x - i : g;
+              mask = "";
+              for (j=0;j<n;j++) {
+                ze = (z != "") ? int(xpu_list[i+j+1]) : i+j;
+                sep = (j > 0) ? "," : "";
+                mask = mask sep ze;
+              }
+              printf(" --tx popen//env:ZE_AFFINITY_MASK=%s//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
+                      mask, n*cx, i*cx, (i+n)*cx-1);
+            }
+          }else {
+            printf(" -n 1 ");
+          }
+        }')"
         echo "xpu_num=${xpu_num}" |tee -a ${GITHUB_OUTPUT}
         echo "cpus_per_xpu=${cpus_per_xpu}" |tee -a ${GITHUB_OUTPUT}
         echo "pytest_extra_args=${pytest_extra_args}" |tee -a ${GITHUB_OUTPUT}

diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
@@ -53,6 +53,8 @@ jobs:
         uses: actions/checkout@v4
       - name: Get runner
         id: runner-info
         uses: ./.github/actions/get-runner
+        with:
+          ut_name: ${{ inputs.ut }}  # lets the action choose the XPU grouping for this UT
 
   test-in-container:
@@ -104,7 +106,7 @@ jobs:
     runs-on: ${{ needs.runner.outputs.runner_id }}
     env:
       AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
-      PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
+      PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread --dist worksteal ${{ needs.runner.outputs.pytest_extra_args }}
     steps:
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4

diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py
@@ -1,4 +1,3 @@
-import os
 import subprocess
 import sys
 
@@ -9,42 +8,6 @@
 res2 = 0
 fail_test = []
 
-# Get the xelink group card affinity
-ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
-if ret == 0:
-    gpu_dict = {}
-    with open("topology.log") as file:
-        lines = file.readlines()
-        for line in lines:
-            if "CPU Affinity" in line:
-                continue
-            line = line.strip()
-            if line.startswith("GPU "):
-                items = line.split(" ")
-                items = [x for x in items if x]
-                gpu_id = items[1]
-                i = gpu_id.split("/")[0]
-                affinity = ""
-                for j, item in enumerate(items):
-                    if "SYS" not in item and ("XL" in item or "S" in item):
-                        if len(affinity) == 0:
-                            affinity = str(j - 2)
-                        else:
-                            affinity = affinity + "," + str(j - 2)
-                gpu_dict[i] = affinity
-
-    max_affinity = ""
-    for key, value in gpu_dict.items():
-        if len(value) > len(max_affinity):
-            max_affinity = value
-
-    os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
-    print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
-
-else:
-    print("xpu-smi topology failed")
-    sys.exit(255)
-
 
 # run python test
 def run(test_command):
@@ -56,8 +19,6 @@ def run(test_command):
     return result.returncode
 
 
-test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
-res += run(test_command)
 test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
 res += run(test_command)
 test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]

diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py
@@ -1,35 +1,82 @@
 skip_dict = {
     "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
+    "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
-        "test_ddp_parity_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_core.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_core.py": (
+        "test_transformer_no_grad_mixed_precision_True_xpu",
+    ),
     "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
-        "test_parity_with_non_frozen_fsdp_xpu",
-        "test_parity_with_ddp_xpu",
+    "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_memory.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_meta.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_overlap.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": (
+        "test_diff_hyperparams_sharding_strategy_str_full_shard",
+        "test_diff_hyperparams_sharding_strategy_str_shard_grad_op",
     ),
+    "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None,
+    "../../../../test/distributed/fsdp/test_shard_utils.py": None,
+    "../../../../test/distributed/fsdp/test_utils.py": None,
+    "../../../../test/distributed/fsdp/test_wrap.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": (
-        "test_transformer_no_grad_mixed_precision_True_xpu",
+    "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": (
+        "test_compiled_autograd_ctx",
+        "test_nested_fully_shard_backend_aot_eager",
+        "test_nested_fully_shard_backend_aot_eager_decomp_partition",
+        "test_nested_fully_shard_backend_inductor_fullgraph_True",
+        "test_nested_fully_shard_backend_inductor_fullgraph_True_graph_partition",
+        "test_simple_mlp_fullgraph_backend_aot_eager",
+        "test_simple_mlp_fullgraph_backend_aot_eager_decomp_partition",
+        "test_simple_mlp_fullgraph_backend_inductor",
+        "test_transformer_backend_aot_eager",
+        "test_transformer_backend_aot_eager_decomp_partition",
     ),
-    "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
-    "../../../../test/distributed/fsdp/test_utils.py": None,
-    "distributed/test_c10d_xccl.py": (
-        # https://github.com/intel/torch-xpu-ops/issues/2046
-        "test_unwaited",
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": (
+        "test_cached_state_dict",
+        "test_dp_state_dict_cpu_offload",
     ),
-    "distributed/test_c10d_ops_xccl.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
-    "../../../../test/distributed/test_functional_api.py": (
-        # depends on https://github.com/pytorch/pytorch/pull/159473
-        "test_tracing_with_fakepg_xpu",
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": (
+        "test_explicit_prefetching",
+        "test_multi_forward_module",
+        "test_train_parity_single_group_shard_dim0",
+        "test_train_parity_single_group_shard_largest_dim",
+        "test_train_parity_shard_placement_fn_shard_largest_dim",
+        "test_3d_mlp_with_nd_mesh",
+    ),
+    "../../../../test/distributed/_composable/test_composability/test_2d_composability.py": (
+        "test_tp_with_fsdp_offloading",
     ),
+    "../../../../test/distributed/_composable/test_replicate_with_compiler.py": None,
+    "../../../../test/distributed/_composable/test_composability/test_pp_composability.py": None,
+    "../../../../test/distributed/_composable/test_checkpoint.py": None,
+    "../../../../test/distributed/_composable/test_contract.py": None,
+    "distributed/test_c10d_xccl.py": None,
+    "distributed/test_c10d_ops_xccl.py": None,
+    "../../../../test/distributed/test_functional_api.py": None,
     "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
     "../../../../test/distributed/_tools/test_mem_tracker.py": None,
     "../../../../test/distributed/_tools/test_memory_tracker.py": None,