recipes/configs/qwen3/30B_A3B_full.yaml (120 additions, 0 deletions)
@@ -0,0 +1,120 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Qwen3 30B A3B model
#
# This config assumes that you've run the following command before launching:
# tune download Qwen/Qwen3-30B-A3B --output-dir /tmp/Qwen3-30B-A3B
#
# To launch on 4 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen3/30B_A3B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen3/30B_A3B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>

output_dir: /tmp/qwen3_30B_A3B/full # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
  _component_: torchtune.models.qwen3.qwen3_tokenizer
  path: /tmp/Qwen3-30B-A3B/vocab.json
  merges_file: /tmp/Qwen3-30B-A3B/merges.txt
  max_seq_len: null

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.qwen3.qwen3_moe_30b_a3b_instruct

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen3-30B-A3B
  checkpoint_files: [
    model-00001-of-00016.safetensors,
    model-00002-of-00016.safetensors,
    model-00003-of-00016.safetensors,
    model-00004-of-00016.safetensors,
    model-00005-of-00016.safetensors,
    model-00006-of-00016.safetensors,
    model-00007-of-00016.safetensors,
    model-00008-of-00016.safetensors,
    model-00009-of-00016.safetensors,
    model-00010-of-00016.safetensors,
    model-00011-of-00016.safetensors,
    model-00012-of-00016.safetensors,
    model-00013-of-00016.safetensors,
    model-00014-of-00016.safetensors,
    model-00015-of-00016.safetensors,
    model-00016-of-00016.safetensors,
  ]
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: QWEN3_MOE
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  lr: 5e-6
loss:
  _component_: torchtune.modules.loss.LinearCrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase effective batch size
clip_grad_norm: null
compile: False # torch.compile the model + loss, True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory
#custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
#fsdp_cpu_offload: True

# Reduced precision
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True
log_level: INFO # DEBUG, WARN, etc.


# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1
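
With batch_size: 2, gradient_accumulation_steps: 8, and 4 processes, one optimizer step sees an effective global batch of 2 × 8 × 4 = 64 samples. Before committing to a 4-GPU launch, the config can be smoke-tested in a single process; the sketch below is illustrative only (not part of this PR) and assumes torchtune is installed and that the tune download step from the header comment has already populated /tmp/Qwen3-30B-A3B.

```python
# Hedged sketch: check that the new config parses and that its components
# resolve before spending time on a 4-GPU launch. Assumes the tokenizer files
# exist at /tmp/Qwen3-30B-A3B (see the header comment in the config).
from omegaconf import OmegaConf
from torchtune import config

cfg = OmegaConf.load("recipes/configs/qwen3/30B_A3B_full.yaml")
tokenizer = config.instantiate(cfg.tokenizer)  # builds qwen3_tokenizer from the YAML fields
print(type(tokenizer).__name__, "max_seq_len:", cfg.tokenizer.max_seq_len)
```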
recipes/lora_finetune_distributed.py (2 additions, 2 deletions)
@@ -272,11 +272,11 @@ def setup(self, cfg: DictConfig) -> None:
         self._metric_logger.log_config(cfg)

         if (
-            cfg.checkpointer.model_type == "LLAMA4"
+            cfg.checkpointer.model_type in ("LLAMA4", "QWEN3_MOE")
             and self._save_adapter_weights_only is False
         ):
             raise ValueError(
-                "For Llama4 training, you should set save_adapter_weights_only to True."
+                f"For {cfg.checkpointer.model_type} training, you should set save_adapter_weights_only to True."
             )

         checkpoint_dict = self._checkpoint_client.load_base_checkpoint()
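
For illustration only (not part of the PR), a standalone sketch of how the extended guard behaves: when checkpointer.model_type is LLAMA4 or QWEN3_MOE and save_adapter_weights_only is left at False, setup now raises a ValueError. The helper below is hypothetical and only mirrors the condition added above.

```python
# Hypothetical helper mirroring the guard added in setup() above.
def check_adapter_only(model_type: str, save_adapter_weights_only: bool) -> None:
    if model_type in ("LLAMA4", "QWEN3_MOE") and not save_adapter_weights_only:
        raise ValueError(
            f"For {model_type} training, you should set save_adapter_weights_only to True."
        )

check_adapter_only("QWEN3_MOE", save_adapter_weights_only=True)  # passes
# check_adapter_only("QWEN3_MOE", save_adapter_weights_only=False)  # raises ValueError
```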
torchtune/models/qwen3/__init__.py (18 additions, 0 deletions)
@@ -4,6 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from ._component_builders import lora_qwen3_moe, qwen3_moe
from ._model_builders import (
    lora_qwen3_0_6b_base,
    lora_qwen3_0_6b_instruct,
@@ -16,6 +17,9 @@
    lora_qwen3_4b_instruct,
    lora_qwen3_8b_base,
    lora_qwen3_8b_instruct,
    lora_qwen3_moe_235b_a22b,
    lora_qwen3_moe_30b_a3b_base,
    lora_qwen3_moe_30b_a3b_instruct,
    qwen3_0_6b_base,
    qwen3_0_6b_instruct,
    qwen3_14b_base,
@@ -27,10 +31,18 @@
    qwen3_4b_instruct,
    qwen3_8b_base,
    qwen3_8b_instruct,
    qwen3_moe_235b_a22b,
    qwen3_moe_30b_a3b_base,
    qwen3_moe_30b_a3b_instruct,
    qwen3_tokenizer,
)

from ._parallelism import tp_plan

__all__ = [
    "tp_plan",
    "qwen3_moe",
    "lora_qwen3_moe",
    "lora_qwen3_0_6b_base",
    "lora_qwen3_0_6b_instruct",
    "lora_qwen3_1_7b_base",
@@ -42,6 +54,9 @@
    "lora_qwen3_14b_base",
    "lora_qwen3_14b_instruct",
    "lora_qwen3_32b",
    "lora_qwen3_moe_30b_a3b_base",
    "lora_qwen3_moe_30b_a3b_instruct",
    "lora_qwen3_moe_235b_a22b",
    "qwen3_0_6b_base",
    "qwen3_0_6b_instruct",
    "qwen3_1_7b_base",
@@ -53,5 +68,8 @@
    "qwen3_14b_base",
    "qwen3_14b_instruct",
    "qwen3_32b",
    "qwen3_moe_30b_a3b_base",
    "qwen3_moe_30b_a3b_instruct",
    "qwen3_moe_235b_a22b",
    "qwen3_tokenizer",
]
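
The new MoE builders are re-exported at the package level, so they can be imported directly from torchtune.models.qwen3. A minimal sketch (not part of the PR), assuming the builder, like the dense Qwen3 builders, takes no required arguments: constructing the model on the meta device checks that the 30B A3B architecture wires up without allocating roughly 60 GB of bf16 weights.

```python
# Hedged sketch: build the new MoE model definition on the meta device to
# inspect its architecture without allocating real weights.
import torch
from torchtune.models.qwen3 import qwen3_moe_30b_a3b_instruct

with torch.device("meta"):
    model = qwen3_moe_30b_a3b_instruct()  # assumed to take no required arguments

total_params = sum(p.numel() for p in model.parameters())
print(f"total parameters: {total_params / 1e9:.1f}B")  # expected on the order of 30B
```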