Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions configs/qwen3-vl-30b-a3b-eagle3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"architectures": [
"LlamaForCausalLMEagle3"
],
"eagle_config": {
"eagle_aux_hidden_state_layer_ids": [
1,
23,
45
]
},
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 6144,
"max_position_embeddings": 262144,
"model_type": "llama",
"target_model_type": "qwen3_vl_moe",
"num_attention_heads": 32,
"num_hidden_layers": 1,
"num_key_value_heads": 4,
"rms_norm_eps": 1e-06,
"pretraining_tp": 1,
"rope_scaling": {
"rope_type": "default",
"mrope_section": [
24,
20,
20
],
"mrope_interleaved": true
},
"rope_theta": 5000000,
"tie_word_embeddings": false,
"dtype": "bfloat16",
"transformers_version": "4.57.1",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936,
"draft_vocab_size": 32000
}
39 changes: 39 additions & 0 deletions configs/qwen3-vl-8b-eagle3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"architectures": [
"LlamaForCausalLMEagle3"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 12288,
"max_position_embeddings": 262144,
"model_type": "llama",
"target_model_type": "qwen3_vl",
"num_attention_heads": 32,
"num_hidden_layers": 1,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-06,
"pretraining_tp": 1,
"rope_scaling": {
"rope_type": "default",
"mrope_section": [
24,
20,
20
],
"mrope_interleaved": true
},
"rope_theta": 5000000,
"tie_word_embeddings": false,
"dtype": "bfloat16",
"transformers_version": "4.57.1",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936,
"draft_vocab_size": 32000
}
28 changes: 28 additions & 0 deletions examples/run_qwen3_vl_dense_eagle3_online.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Launch online EAGLE3 draft-model training for Qwen3-VL-8B-Instruct (dense).
#
# Usage: run_qwen3_vl_dense_eagle3_online.sh [NUM_GPUS]
#   NUM_GPUS  number of local GPUs to train on (default: 1)

# Fail fast: abort on any command error, unset variable, or pipeline failure,
# so a bad path or failed resolution does not silently launch a broken run.
set -euo pipefail

# Resolve the repository root relative to this script so it can be invoked
# from any working directory. Quoting guards against spaces in the path.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname "$SCRIPT_DIR")

# support tp1 train eagle3 for Qwen3-VL-8B-Instruct
NUM_GPUS=${1:-1}

torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3_online.py" \
    --target-model-path Qwen/Qwen3-VL-8B-Instruct \
    --draft-model-config "$ROOT_DIR/configs/qwen3-vl-8b-eagle3.json" \
    --train-data-path "$ROOT_DIR/cache/dataset/allava4v_train.jsonl" \
    --output-dir "$ROOT_DIR/outputs/Qwen3-VL-8B-eagle3" \
    --build-dataset-num-proc 0 \
    --num-epochs 10 \
    --batch-size 1 \
    --learning-rate 1e-4 \
    --max-length 8192 \
    --chat-template qwen3-vl \
    --cache-dir "$ROOT_DIR/cache" \
    --embedding-key model.language_model.embed_tokens.weight \
    --tp-size 1 \
    --is-vlm \
    --min-pixels 50176 \
    --max-pixels 802816
28 changes: 28 additions & 0 deletions examples/run_qwen3_vl_moe_eagle3_online.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Launch online EAGLE3 draft-model training for Qwen3-VL-30B-A3B-Instruct (MoE).
#
# Usage: run_qwen3_vl_moe_eagle3_online.sh [NUM_GPUS]
#   NUM_GPUS  number of local GPUs to train on (default: 1)

# Fail fast: abort on any command error, unset variable, or pipeline failure,
# so a bad path or failed resolution does not silently launch a broken run.
set -euo pipefail

# Resolve the repository root relative to this script so it can be invoked
# from any working directory. Quoting guards against spaces in the path.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname "$SCRIPT_DIR")

# support tp1 train eagle3 for Qwen3-VL-30B-A3B-Instruct
NUM_GPUS=${1:-1}

torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3_online.py" \
    --target-model-path Qwen/Qwen3-VL-30B-A3B-Instruct \
    --draft-model-config "$ROOT_DIR/configs/qwen3-vl-30b-a3b-eagle3.json" \
    --train-data-path "$ROOT_DIR/cache/dataset/allava4v_train.jsonl" \
    --output-dir "$ROOT_DIR/outputs/Qwen3-VL-30B-A3B-eagle3" \
    --build-dataset-num-proc 0 \
    --num-epochs 10 \
    --batch-size 1 \
    --learning-rate 1e-4 \
    --max-length 8192 \
    --chat-template qwen3-vl \
    --cache-dir "$ROOT_DIR/cache" \
    --embedding-key model.language_model.embed_tokens.weight \
    --tp-size 1 \
    --is-vlm \
    --min-pixels 50176 \
    --max-pixels 802816
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ pre-commit
torch==2.8.0
torchaudio==2.8.0
torchvision==0.23.0
transformers==4.55.2
transformers==4.57.1
qwen-vl-utils==0.0.11
datasets
setuptools
Expand All @@ -12,5 +12,5 @@ psutil
numpy
accelerate
pydantic
sglang[all]==0.5.1
sglang[all]==0.5.4.post1
openai-harmony
2 changes: 1 addition & 1 deletion scripts/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def main():
ds = ds.rename_column("uuid", "id")
proc_fn = process_sharegpt_row
elif args.dataset == "sharegpt4v":
ds = load_dataset("Lin-Chen/ShareGPT4V")["train"]
ds = load_dataset("Lin-Chen/ShareGPT4V", name="ShareGPT4V")["train"]
proc_fn = process_sharegpt4v_row
elif args.dataset == "allava4v":
ds = load_dataset("FreedomIntelligence/ALLaVA-4V", name="allava_laion")[
Expand Down
56 changes: 46 additions & 10 deletions scripts/train_eagle3_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,17 +262,44 @@ def main():
device_mesh=get_tp_device_mesh(),
).eval()
else:
if args.is_vlm and draft_model_config.target_model_type == "qwen2_5_vl":
from transformers import Qwen2_5_VLForConditionalGeneration
if args.is_vlm and draft_model_config.target_model_type in {
"qwen2_5_vl",
"qwen3_vl",
"qwen3_vl_moe",
}:
if draft_model_config.target_model_type == "qwen2_5_vl":
from transformers import Qwen2_5_VLForConditionalGeneration

target_model = (
Qwen2_5_VLForConditionalGeneration.from_pretrained(
pretrained_model_name_or_path=args.target_model_path,
torch_dtype=torch.bfloat16,
)
.eval()
.cuda()
)
elif draft_model_config.target_model_type == "qwen3_vl":
from transformers import Qwen3VLForConditionalGeneration

target_model = (
Qwen2_5_VLForConditionalGeneration.from_pretrained(
pretrained_model_name_or_path=args.target_model_path,
torch_dtype=torch.bfloat16,
target_model = (
Qwen3VLForConditionalGeneration.from_pretrained(
pretrained_model_name_or_path=args.target_model_path,
dtype=torch.bfloat16,
)
.eval()
.cuda()
)
elif draft_model_config.target_model_type == "qwen3_vl_moe":
from transformers import Qwen3VLMoeForConditionalGeneration

target_model = (
Qwen3VLMoeForConditionalGeneration.from_pretrained(
pretrained_model_name_or_path=args.target_model_path,
dtype=torch.bfloat16,
)
.eval()
.cuda()
)
.eval()
.cuda()
)
else:
target_model = (
AutoModelForCausalLM.from_pretrained(
Expand Down Expand Up @@ -314,6 +341,10 @@ def main():
min_pixels=args.min_pixels,
max_pixels=args.max_pixels,
)
if args.build_dataset_num_proc > 0:
print_on_rank0(
"WARNING: VLM dataset preprocessing may hang with --build-dataset-num-proc > 0"
)
else:
processor = None

Expand Down Expand Up @@ -396,13 +427,18 @@ def main():

# build Eagle3 model
# broadcast draft model
if args.is_vlm and draft_model_config.target_model_type == "qwen2_5_vl":
if args.is_vlm and draft_model_config.target_model_type in {
"qwen2_5_vl",
"qwen3_vl",
"qwen3_vl_moe",
}:
eagle3_model = QwenVLOnlineEagle3Model(
target_model=target_model,
draft_model=draft_model,
processor=processor,
length=args.ttt_length,
attention_backend=args.attention_backend,
target_model_type=draft_model_config.target_model_type,
)
else:
eagle3_model = OnlineEagle3Model(
Expand Down
Loading