Commit 08819c7

Merge branch 'main' into xccl/new_api

2 parents: 1150063 + c967dbe

4 files changed: +144 -6

test/xpu/run_distributed.py (+26 -6)
```diff
@@ -2,22 +2,42 @@
 import subprocess
 import sys
 
+from skip_list_dist import skip_dict
+from xpu_test_utils import launch_test
 
+res = 0
+res2 = 0
+fail_test = []
+
+
+# run python test
 def run(test_command):
     result = subprocess.run(test_command, capture_output=True, text=True)
     print(result.stdout)
     print(result.stderr)
     if "FAILED" in result.stdout or "FAILED" in result.stderr:
-        return 0
-    else:
-        return 1
+        fail_test.append(" ".join(test_command))
+    return result.returncode
 
 
-res = 0
 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
 test_command = ["python", "distributed/test_c10d_xccl.py"]
 res += run(test_command)
 
-exit_code = os.WEXITSTATUS(res)
-sys.exit(exit_code)
+# run pytest with skiplist
+for key in skip_dict:
+    skip_list = skip_dict[key]
+    fail = launch_test(key, skip_list)
+    res2 += fail
+    if fail:
+        fail_test.append(key)
+
+if fail_test:
+    print(",".join(fail_test) + " have failures")
+
+exit_code = os.WEXITSTATUS(res2)
+if exit_code == 0:
+    sys.exit(res)
+else:
+    sys.exit(exit_code)
```
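The pytest leg of the runner delegates to `launch_test`, imported from `xpu_test_utils.py`; its body is not part of this diff. As a rough illustration of the contract the loop above relies on (run one test file, exclude the listed cases, return a per-file exit code), a minimal sketch could look like the following. The `-k` deselection mechanism is an assumption for illustration, not the actual implementation:

```python
import subprocess

def launch_test(test_file, skip_list=None):
    # Hypothetical sketch of the helper run_distributed.py imports.
    # Runs one test file under pytest and deselects every case named
    # in skip_list via a "-k" expression.
    cmd = ["pytest", "-v", test_file]
    if skip_list:
        # "not a and not b and ..." keeps all cases except the skipped ones.
        cmd += ["-k", " and ".join(f"not {case}" for case in skip_list)]
    result = subprocess.run(cmd)
    # Non-zero when pytest reports failures or errors for this file.
    return result.returncode
```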

test/xpu/skip_list_common.py (+18)
```diff
@@ -773,6 +773,13 @@
         "test_scaled_dot_product_attention_3D_input_dim_2D_attn_mask_dropout_p_0_5_xpu",
         "test_scaled_dot_product_attention_3D_input_dim_2D_attn_mask_dropout_p_0_2_xpu",
         "test_scaled_dot_product_attention_3D_input_dim_2D_attn_mask_dropout_p_0_0_xpu",
+        # https://github.com/intel/torch-xpu-ops/issues/1432
+        "test_multiheadattention_fastpath_attn_mask_attn_mask_dim_2_key_padding_mask_dim_2_bool_xpu",
+        "test_multiheadattention_fastpath_attn_mask_attn_mask_dim_3_key_padding_mask_dim_2_bool_xpu",
+        "test_transformerencoder_fastpath_use_torchscript_False_enable_nested_tensor_False_use_autocast_False_d_model_12_xpu",
+        "test_transformerencoder_fastpath_use_torchscript_False_enable_nested_tensor_False_use_autocast_True_d_model_12_xpu",
+        "test_transformerencoder_fastpath_use_torchscript_False_enable_nested_tensor_True_use_autocast_False_d_model_12_xpu",
+        "test_transformerencoder_fastpath_use_torchscript_False_enable_nested_tensor_True_use_autocast_True_d_model_12_xpu",
     ),
     "test_complex_xpu.py": None,
     "test_modules_xpu.py": (
@@ -1027,6 +1034,10 @@
         "test_ctc_loss_cudnn_tensor",  # want "xpu" in function name
         # RuntimeError: reflection_pad2d_backward_xpu does not have a deterministic implementation, but you set 'torch.use_deterministic_algorithms(True)'.
         "test_ReflectionPad2d_large_deterministic_xpu",
+        # Case updated in pytorch commit 97272e4
+        "test_hardswish_grad_corner_xpu_bfloat16",
+        "test_hardswish_grad_corner_xpu_float16",
+        "test_hardswish_grad_corner_xpu_float32",
     ),
     "test_indexing_xpu.py": (
         # XPU implementation doesn't claimn FP8 now
@@ -1466,6 +1477,13 @@
         "test_compile_int4_mm_m_64_k_32_n_64_xpu",
         "test_compile_int4_mm_m_64_k_64_n_48_xpu",
         "test_compile_int4_mm_m_64_k_64_n_64_xpu",
+        # float8 is not supported
+        "test_matmul_scaled_gemm_offline_tunableop_xpu_float8_e4m3fnuz",
+        "test_matmul_scaled_gemm_offline_tunableop_xpu_float8_e5m2fnuz",
+        "test_scaled_gemm_offline_tunableop_xpu_float8_e4m3fnuz",
+        "test_scaled_gemm_offline_tunableop_xpu_float8_e5m2fnuz",
+        # case need to port for xpu
+        "test_gemm_bias_offline_tunableop_xpu_bfloat16",
     ),
     "test_ops_fwd_gradients_xpu.py": (
         # All of the followings are oneDNN issues
```

test/xpu/skip_list_dist.py (+95, new file)
```diff
@@ -0,0 +1,95 @@
+skip_dict = {
+    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
+        "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
+        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
+        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
+        "test_checkpoint_submodule_use_reentrant_False_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
+        "test_ddp_parity_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_core.py": (
+        "test_delayed_optim_step_offload_false_no_shard_xpu",
+        "test_delayed_optim_step_offload_false_none_xpu",
+        "test_delayed_optim_step_offload_false_shard_grad_op_xpu",
+        "test_delayed_optim_step_offload_true_none_xpu",
+        "test_delayed_optim_step_offload_true_shard_grad_op_xpu",
+        "test_delayed_reduce_scatter_offload_false_no_shard_xpu",
+        "test_delayed_reduce_scatter_offload_false_none_xpu",
+        "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu",
+        "test_delayed_reduce_scatter_offload_true_none_xpu",
+        "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu",
+        "test_mixture_of_experts_offload_false_no_shard_xpu",
+        "test_mixture_of_experts_offload_false_none_xpu",
+        "test_mixture_of_experts_offload_false_shard_grad_op_xpu",
+        "test_mixture_of_experts_offload_true_none_xpu",
+        "test_mixture_of_experts_offload_true_shard_grad_op_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu",
+        "test_nested_always_wrap_model_offload_false_no_shard_xpu",
+        "test_nested_always_wrap_model_offload_false_none_xpu",
+        "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu",
+        "test_nested_always_wrap_model_offload_true_none_xpu",
+        "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu",
+        "test_nested_wrapped_model_offload_false_no_shard_xpu",
+        "test_nested_wrapped_model_offload_false_none_xpu",
+        "test_nested_wrapped_model_offload_false_shard_grad_op_xpu",
+        "test_nested_wrapped_model_offload_true_none_xpu",
+        "test_nested_wrapped_model_offload_true_shard_grad_op_xpu",
+        "test_transformer_offload_false_no_shard_xpu",
+        "test_transformer_offload_false_none_xpu",
+        "test_transformer_offload_false_shard_grad_op_xpu",
+        "test_transformer_offload_true_none_xpu",
+        "test_transformer_offload_true_shard_grad_op_xpu",
+        # https://github.com/intel/torch-xpu-ops/issues/1475
+        "test_transformer_no_grad_mixed_precision_True_xpu",
+        "test_transformer_no_grad_mixed_precision_False_xpu",
+    ),
+    # Will add them back after debugging
+    # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": (
+    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
+    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
+    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
+    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
+    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
+    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
+    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
+    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
+    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu",
+    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu",
+    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu",
+    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu",
+    #     "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu",
+    #     "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu",
+    #     "test_raises_warning_or_errors_xpu",
+    # ),
+    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
+        "test_parity_with_non_frozen_fsdp_xpu",
+        "test_parity_with_ddp_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": (
+        "test_transformer_no_grad_mixed_precision_True_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
+    # Will add them back after debugging
+    # "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": (
+    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu",
+    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu",
+    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu",
+    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu",
+    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu",
+    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu",
+    #     "test_hsdp_init_with_device_mesh_xpu",
+    #     "test_root_module_is_not_FSDP_xpu",
+    # ),
+    "../../../../test/distributed/fsdp/test_utils.py": None,
+}
```
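Each key in this table is a test file path relative to where the runner executes, and each value is either a tuple of case names to exclude or `None`, which apparently means the whole file runs unfiltered. A quick, hypothetical sanity check of the table's shape:

```python
from skip_list_dist import skip_dict

# Count excluded cases per file; None entries run with no exclusions.
for test_file, skips in skip_dict.items():
    n = 0 if skips is None else len(skips)
    print(f"{test_file}: {n} skipped case(s)")
```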

test/xpu/xpu_test_utils.py (+5)
```diff
@@ -429,6 +429,11 @@
             torch.float32: tol(atol=2e-5, rtol=5e-5),
         }
     },
+    "test_modules_xpu.py": {
+        ("TestModuleXPU", "test_non_contiguous_tensors_nn_LazyConv3d_xpu_float32"): {
+            torch.float32: tol(atol=2e-5, rtol=7e-5),
+        }
+    },
 }
```
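The new entry loosens the float32 tolerance for one `TestModuleXPU` case. Assuming `tol` here is PyTorch's `tol` namedtuple from `torch.testing._internal.common_device_type` (a guess; the import sits outside this hunk), the table can be read back like this:

```python
import torch
from torch.testing._internal.common_device_type import tol

# Shape of the table: file -> (test class, test name) -> dtype -> tolerance.
overrides = {
    "test_modules_xpu.py": {
        ("TestModuleXPU", "test_non_contiguous_tensors_nn_LazyConv3d_xpu_float32"): {
            torch.float32: tol(atol=2e-5, rtol=7e-5),
        }
    },
}

entry = overrides["test_modules_xpu.py"][
    ("TestModuleXPU", "test_non_contiguous_tensors_nn_LazyConv3d_xpu_float32")
]
print(entry[torch.float32].atol, entry[torch.float32].rtol)  # 2e-05 7e-05
```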