diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py index 1c9ac04785..0c1dd7d4f7 100755 --- a/tests/integration_tests/features.py +++ b/tests/integration_tests/features.py @@ -9,25 +9,29 @@ from tests.integration_tests import OverrideDefinitions -# Use RUNNER_TEMP if defined (GitHub Actions variable), else fallback to old path -runner_temp = os.getenv("RUNNER_TEMP") -if runner_temp: - checkpoint_path = os.path.join( - runner_temp, - "artifacts-to-be-uploaded/model_only_hf_checkpoint/hf_checkpoint/step-10/", - ) -else: - checkpoint_path = ( - "artifacts-to-be-uploaded/model_only_hf_checkpoint/hf_checkpoint/step-10/" - ) - - -def build_features_test_list() -> list[OverrideDefinitions]: +def build_features_test_list(output_dir=None) -> list[OverrideDefinitions]: """ key is the config file name and value is a list of OverrideDefinitions that is used to generate variations of integration tests based on the same root config file. """ + # Construct checkpoint path for HF checkpoint test based on output_dir + runner_temp = os.getenv("RUNNER_TEMP") + if runner_temp: + checkpoint_path = os.path.join( + runner_temp, + "artifacts-to-be-uploaded/model_only_hf_checkpoint/hf_checkpoint/step-10/", + ) + elif output_dir: + checkpoint_path = os.path.join( + output_dir, + "model_only_hf_checkpoint/hf_checkpoint/step-10/", + ) + else: + checkpoint_path = ( + "artifacts-to-be-uploaded/model_only_hf_checkpoint/hf_checkpoint/step-10/" + ) + integration_tests_flavors = [ OverrideDefinitions( [ diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py index 77851bd4a0..9a87c71b64 100644 --- a/tests/integration_tests/run_tests.py +++ b/tests/integration_tests/run_tests.py @@ -161,7 +161,11 @@ def main(): args.test_suite in _TEST_SUITES_FUNCTION ), f"Unknown test suite {args.test_suite}" - test_list = _TEST_SUITES_FUNCTION[args.test_suite]() + build_fn = _TEST_SUITES_FUNCTION[args.test_suite] + if args.test_suite == "features": + test_list = build_fn(output_dir=args.output_dir) + else: + test_list = build_fn() run_tests(args, test_list) diff --git a/tests/unit_tests/test_checkpoint.py b/tests/unit_tests/test_checkpoint.py index b5dc3de8a2..84dbf8809f 100644 --- a/tests/unit_tests/test_checkpoint.py +++ b/tests/unit_tests/test_checkpoint.py @@ -157,7 +157,7 @@ def tearDown(self): shutil.rmtree(self.base_temp_dir) time.sleep(0.1) - def fake_save(self, state_dict: dict, checkpoint_id: str, storage_writer=None): + def fake_save(self, state_dict: dict, checkpoint_id: str, storage_writer=None, process_group=None): os.makedirs(checkpoint_id, exist_ok=True) sd_to_save = {} for key, val in state_dict.items(): @@ -738,7 +738,7 @@ def test_load_only_prevents_saving(self, mock_save, mock_rank): @mock.patch("torchtitan.components.checkpoint.dcp.load") @mock.patch("torchtitan.components.checkpoint.dcp.save") def test_verify_prefix(self, mock_save, mock_load, mock_rank): - def fake_save(state_dict: dict, checkpoint_id: str, storage_writer=None): + def fake_save(state_dict: dict, checkpoint_id: str, storage_writer=None, process_group=None): self.assertIn("bias", state_dict) self.assertIn("weight", state_dict) # No model prefix diff --git a/tests/unit_tests/test_tokenizer.py b/tests/unit_tests/test_tokenizer.py index a0306fde36..06a01f057e 100644 --- a/tests/unit_tests/test_tokenizer.py +++ b/tests/unit_tests/test_tokenizer.py @@ -304,7 +304,23 @@ def test_download_and_build_tokenizer(self, test_repo_id): ) # Step 6: Compare with transformers tokenizer if available + # NOTE(phuc): DeepSeek-V3 ships two tokenizer files with different implementations: + # - tokenizer.json: tokenizers lib BPE (used by HuggingFaceTokenizer) + # - tokenizer.model: SentencePiece (used by transformers LlamaTokenizer) + # These produce different merge results for the same text, e.g. "Hello world!": + # tokenizers lib: [19923, 2058, 3] + # SentencePiece: [42, 6629, 10030, 3] + # Same vocab, different BPE merge algorithms. Llama/Qwen pass because their + # two files happen to agree. This is an upstream tokenizer packaging issue, + # not a bug in HuggingFaceTokenizer. Step 5 already validates correctness + # against the official tokenizers lib. if transformers_tokenizer: + if "deepseek" in test_repo_id.lower(): + self.skipTest( + f"Skipping transformers comparison for {test_repo_id}: " + f"tokenizer.json (BPE) and tokenizer.model (SentencePiece) " + f"use different merge implementations that produce different token IDs" + ) self._compare_tokenizers( our_tokenizer, transformers_tokenizer, test_repo_id ) diff --git a/torchtitan/models/gpt_oss/model/moe.py b/torchtitan/models/gpt_oss/model/moe.py index d1563ed329..6baae4ab03 100644 --- a/torchtitan/models/gpt_oss/model/moe.py +++ b/torchtitan/models/gpt_oss/model/moe.py @@ -261,7 +261,7 @@ def forward( tp_degree, ) - def init_weights(self, init_std: float): + def init_weights(self, init_std: float, n_layers: int): trunc_normal_(self.mlp1_weight, mean=0.0, std=init_std) trunc_normal_(self.mlp1_bias, mean=0.0, std=init_std) trunc_normal_(self.mlp2_weight, mean=0.0, std=init_std) diff --git a/torchtitan/models/qwen3/__init__.py b/torchtitan/models/qwen3/__init__.py index a359c40a03..f58328458e 100644 --- a/torchtitan/models/qwen3/__init__.py +++ b/torchtitan/models/qwen3/__init__.py @@ -173,6 +173,30 @@ score_before_experts=False, ), ), + "debugmodel_moe_deepep": Qwen3ModelArgs( + vocab_size=151936, + max_seq_len=4096, + head_dim=128, + dim=256, + n_layers=4, + n_heads=16, + n_kv_heads=8, + qk_norm=True, + hidden_dim=3072, + rope_theta=1000000, + moe_enabled=True, + moe_inter_dim=512, + moe_args=MoEArgs( + num_experts=128, + num_shared_experts=0, + top_k=8, + score_func="softmax", + route_norm=True, + route_scale=1.0, + score_before_experts=False, + use_deepep=True, + ), + ), "10B-A1B": Qwen3ModelArgs( vocab_size=151936, max_seq_len=8192,