NousResearch · xrsrke · Mar 6, 2026
diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
@@ -9,25 +9,29 @@
 
 from tests.integration_tests import OverrideDefinitions
 
-# Use RUNNER_TEMP if defined (GitHub Actions variable), else fallback to old path
-runner_temp = os.getenv("RUNNER_TEMP")
-if runner_temp:
-    checkpoint_path = os.path.join(
-        runner_temp,
-        "artifacts-to-be-uploaded/model_only_hf_checkpoint/hf_checkpoint/step-10/",
-    )
-else:
-    checkpoint_path = (
-        "artifacts-to-be-uploaded/model_only_hf_checkpoint/hf_checkpoint/step-10/"
-    )
-
-
-def build_features_test_list() -> list[OverrideDefinitions]:
+def build_features_test_list(output_dir=None) -> list[OverrideDefinitions]:
     """
     key is the config file name and value is a list of OverrideDefinitions
     that is used to generate variations of integration tests based on the
     same root config file.
     """
+    # HF checkpoint path: prefer RUNNER_TEMP (CI), then output_dir, else relative path
+    runner_temp = os.getenv("RUNNER_TEMP")
+    if runner_temp:
+        checkpoint_path = os.path.join(
+            runner_temp,
+            "artifacts-to-be-uploaded/model_only_hf_checkpoint/hf_checkpoint/step-10/",
+        )
+    elif output_dir:
+        checkpoint_path = os.path.join(
+            output_dir,
+            "model_only_hf_checkpoint/hf_checkpoint/step-10/",
+        )
+    else:
+        checkpoint_path = (
+            "artifacts-to-be-uploaded/model_only_hf_checkpoint/hf_checkpoint/step-10/"
+        )
+
     integration_tests_flavors = [
         OverrideDefinitions(
             [

diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py
@@ -161,7 +161,11 @@ def main():
         args.test_suite in _TEST_SUITES_FUNCTION
     ), f"Unknown test suite {args.test_suite}"
 
-    test_list = _TEST_SUITES_FUNCTION[args.test_suite]()
+    build_fn = _TEST_SUITES_FUNCTION[args.test_suite]
+    if args.test_suite == "features":
+        test_list = build_fn(output_dir=args.output_dir)
+    else:
+        test_list = build_fn()
     run_tests(args, test_list)
 
 

diff --git a/tests/unit_tests/test_checkpoint.py b/tests/unit_tests/test_checkpoint.py
@@ -157,7 +157,7 @@ def tearDown(self):
         shutil.rmtree(self.base_temp_dir)
         time.sleep(0.1)
 
-    def fake_save(self, state_dict: dict, checkpoint_id: str, storage_writer=None):
+    def fake_save(self, state_dict: dict, checkpoint_id: str, storage_writer=None, process_group=None):
         os.makedirs(checkpoint_id, exist_ok=True)
         sd_to_save = {}
         for key, val in state_dict.items():
@@ -738,7 +738,7 @@ def test_load_only_prevents_saving(self, mock_save, mock_rank):
     @mock.patch("torchtitan.components.checkpoint.dcp.load")
     @mock.patch("torchtitan.components.checkpoint.dcp.save")
     def test_verify_prefix(self, mock_save, mock_load, mock_rank):
-        def fake_save(state_dict: dict, checkpoint_id: str, storage_writer=None):
+        def fake_save(state_dict: dict, checkpoint_id: str, storage_writer=None, process_group=None):
             self.assertIn("bias", state_dict)
             self.assertIn("weight", state_dict)
             # No model prefix

diff --git a/tests/unit_tests/test_tokenizer.py b/tests/unit_tests/test_tokenizer.py
@@ -304,7 +304,23 @@ def test_download_and_build_tokenizer(self, test_repo_id):
             )
 
         # Step 6: Compare with transformers tokenizer if available
+        # NOTE(phuc): DeepSeek-V3 ships two tokenizer files with different implementations:
+        #   - tokenizer.json: tokenizers lib BPE (used by HuggingFaceTokenizer)
+        #   - tokenizer.model: SentencePiece (used by transformers LlamaTokenizer)
+        # These produce different merge results for the same text, e.g. "Hello world!":
+        #   tokenizers lib: [19923, 2058, 3]
+        #   SentencePiece:  [42, 6629, 10030, 3]
+        # Same vocab, different BPE merge algorithms. Llama/Qwen pass because their
+        # two files happen to agree. This is an upstream tokenizer packaging issue,
+        # not a bug in HuggingFaceTokenizer. Step 5 already validates correctness
+        # against the official tokenizers lib.
         if transformers_tokenizer:
+            if "deepseek" in test_repo_id.lower():
+                self.skipTest(
+                    f"Skipping transformers comparison for {test_repo_id}: "
+                    f"tokenizer.json (BPE) and tokenizer.model (SentencePiece) "
+                    f"use different merge implementations that produce different token IDs"
+                )
             self._compare_tokenizers(
                 our_tokenizer, transformers_tokenizer, test_repo_id
             )

diff --git a/torchtitan/models/gpt_oss/model/moe.py b/torchtitan/models/gpt_oss/model/moe.py
@@ -261,7 +261,7 @@ def forward(
                 tp_degree,
             )
 
-    def init_weights(self, init_std: float):
+    def init_weights(self, init_std: float, n_layers: int):
         trunc_normal_(self.mlp1_weight, mean=0.0, std=init_std)
         trunc_normal_(self.mlp1_bias, mean=0.0, std=init_std)
         trunc_normal_(self.mlp2_weight, mean=0.0, std=init_std)

diff --git a/torchtitan/models/qwen3/__init__.py b/torchtitan/models/qwen3/__init__.py
@@ -173,6 +173,30 @@
             score_before_experts=False,
         ),
     ),
+    "debugmodel_moe_deepep": Qwen3ModelArgs(
+        vocab_size=151936,
+        max_seq_len=4096,
+        head_dim=128,
+        dim=256,
+        n_layers=4,
+        n_heads=16,
+        n_kv_heads=8,
+        qk_norm=True,
+        hidden_dim=3072,
+        rope_theta=1000000,
+        moe_enabled=True,
+        moe_inter_dim=512,
+        moe_args=MoEArgs(
+            num_experts=128,
+            num_shared_experts=0,
+            top_k=8,
+            score_func="softmax",
+            route_norm=True,
+            route_scale=1.0,
+            score_before_experts=False,
+            use_deepep=True,
+        ),
+    ),
     "10B-A1B": Qwen3ModelArgs(
         vocab_size=151936,
         max_seq_len=8192,