Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,14 @@ packages/
cookbooks/cosmos3/generator/audiovisual/outputs/
outputs/

# Cosmos3 finetune cookbook runtime artifacts (downloads, converted ckpts, runs)
cookbooks/cosmos3/finetune/data/
cookbooks/cosmos3/finetune/checkpoints/
cookbooks/cosmos3/finetune/outputs/

# Superpowers design specs / implementation plans (kept local, not tracked)
docs/superpowers/

# Streamlit
.streamlit/

Expand Down
345 changes: 345 additions & 0 deletions cookbooks/cosmos3/finetune/README.md

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions cookbooks/cosmos3/finetune/_sft_launcher_common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Shared launch plumbing for the cookbook launch_sft_<recipe>.sh — the
# structured-TOML / pydantic-schema flow that drives cosmos_framework.scripts.train.
#
# REQUIRES: an activated cosmos-framework venv (see the finetune README
# Prerequisites) so `cosmos_framework` is importable. This launcher does NOT
# add the framework's .venv/bin to PATH.
#
# Caller MUST set before sourcing:
# TOML_FILE recipe TOML, e.g. "toml/sft_config/<recipe>.toml".
# Absolute or cookbook-relative.
#
# Caller MAY set before sourcing (presence drives which existence checks fire):
# DATASET_PATH recipe-local dataset dir, e.g. "data/<name>".
# If unset, no dataset existence check fires
# (reasoner / HF-streaming case).
# BASE_CHECKPOINT_PATH recipe-local base DCP dir, e.g. "checkpoints/<name>".
# Setting it also enables WAN_VAE_PATH plumbing + check.
# WAN_VAE_PATH override the default checkpoints/wan22_vae/Wan2.2_VAE.pth.
# EXTRA_DATASET_CHECK bash snippet (string) eval'd after the default checks.
# TAIL_OVERRIDES bash array of Hydra CLI overrides appended after `--`
# (e.g. data_setting.max_tokens=16000 for VLM smokes).
# MASTER_PORT torchrun --master_port; default 50012.
# NPROC_PER_NODE torchrun --nproc_per_node; default 8.
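# LOG_FILENAME log file name under $LOG_DIR
# (default <toml-stem>_sft.log).
# OUTPUT_ROOT root for run outputs; logs go to $OUTPUT_ROOT/logs
# (default $WORKDIR/outputs/train).
# IMAGINAIRE_OUTPUT_ROOT set in the training process environment
# (default $OUTPUT_ROOT).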
#
# Absolute paths are passed through; relative paths are anchored to the cookbook
# dir (the directory containing this launcher). Paths set in the caller's shell
# via `export DATASET_PATH=...` etc. win over the launcher's defaults (use the
# `: "${VAR:=default}"` idiom in the launcher to preserve this).

# NOTE: this file is sourced, so these options apply to the calling wrapper too.
# `-e` is deliberately NOT enabled; every step whose failure matters is checked
# explicitly below.
set -uo pipefail

: "${TOML_FILE:?TOML_FILE must be set before sourcing _sft_launcher_common.sh}"

# Timestamped progress line on stdout.
_sft_log() { printf '>>> %s %s\n' "$(date '+%H:%M:%S')" "$*"; }
# Print an error on stderr and abort the launch.
_sft_die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; }

# Cookbook dir = the wrapper's own directory (cookbooks/cosmos3/finetune/).
# BASH_SOURCE[1] is the script that sourced this file; fall back to this file's
# own location so `set -u` does not trip when there is no sourcing script.
WORKDIR="$(cd "$(dirname "${BASH_SOURCE[1]:-${BASH_SOURCE[0]}}")" && pwd)" \
  || _sft_die "cannot resolve the cookbook directory"

# Anchor relative paths to $WORKDIR.
[[ "$TOML_FILE" = /* ]] || TOML_FILE="$WORKDIR/$TOML_FILE"

if [[ -n "${DATASET_PATH:-}" ]]; then
  [[ "$DATASET_PATH" = /* ]] || DATASET_PATH="$WORKDIR/$DATASET_PATH"
  export DATASET_PATH
fi

if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
  [[ "$BASE_CHECKPOINT_PATH" = /* ]] || BASE_CHECKPOINT_PATH="$WORKDIR/$BASE_CHECKPOINT_PATH"
  # A base checkpoint implies the WAN VAE is needed as well.
  WAN_VAE_PATH="${WAN_VAE_PATH:-checkpoints/wan22_vae/Wan2.2_VAE.pth}"
  [[ "$WAN_VAE_PATH" = /* ]] || WAN_VAE_PATH="$WORKDIR/$WAN_VAE_PATH"
  export BASE_CHECKPOINT_PATH WAN_VAE_PATH
fi

# OUTPUT_ROOT is anchored like every other path: the log directory is created
# here, but `tee` opens the log only after `cd "$WORKDIR"`, so a relative value
# resolved against the caller's cwd would point at two different places.
OUTPUT_ROOT="${OUTPUT_ROOT:-outputs/train}"
[[ "$OUTPUT_ROOT" = /* ]] || OUTPUT_ROOT="$WORKDIR/$OUTPUT_ROOT"
LOG_DIR="$OUTPUT_ROOT/logs"
TOML_STEM="$(basename "$TOML_FILE" .toml)"
LOG_FILE="$LOG_DIR/${LOG_FILENAME:-${TOML_STEM}_sft.log}"
IMAGINAIRE_OUTPUT_ROOT="${IMAGINAIRE_OUTPUT_ROOT:-$OUTPUT_ROOT}"
mkdir -p "$LOG_DIR" || _sft_die "cannot create log directory: $LOG_DIR"

_sft_log "Checking inputs..."
[[ -f "$TOML_FILE" ]] || _sft_die "TOML not found: $TOML_FILE"
if [[ -n "${DATASET_PATH:-}" ]]; then
  [[ -d "$DATASET_PATH" ]] || _sft_die "DATASET_PATH not found: $DATASET_PATH (run Step 1 of the finetune README, or export DATASET_PATH=<path>)"
fi
if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
  [[ -d "$BASE_CHECKPOINT_PATH" ]] || _sft_die "BASE_CHECKPOINT_PATH not found: $BASE_CHECKPOINT_PATH (run Step 2 of the finetune README, or export BASE_CHECKPOINT_PATH=<path>)"
  [[ -f "$WAN_VAE_PATH" ]] || _sft_die "WAN_VAE_PATH not found: $WAN_VAE_PATH (run Step 1 of the finetune README, or export WAN_VAE_PATH=<path>)"
fi
# NOTE(review): eval executes arbitrary shell. The snippet is expected to come
# from the wrapper script; a wrapper that does not assign EXTRA_DATASET_CHECK
# would pick up a value inherited from the environment.
if [[ -n "${EXTRA_DATASET_CHECK:-}" ]]; then eval "$EXTRA_DATASET_CHECK"; fi

cd "$WORKDIR" || _sft_die "cannot cd to $WORKDIR"
_sft_log "WORKDIR: $WORKDIR"
_sft_log "TOML: $TOML_FILE"
if [[ -n "${DATASET_PATH:-}" ]]; then _sft_log "dataset: $DATASET_PATH"; fi
if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then _sft_log "checkpoint: $BASE_CHECKPOINT_PATH"; fi
_sft_log "log: $LOG_FILE"

# Default empty if caller didn't set; safe under set -u.
[[ ${TAIL_OVERRIDES+x} ]] || TAIL_OVERRIDES=()

# Build the command as an array and append the overrides only when there are
# some: expanding an empty array under `set -u` aborts on bash < 4.4.
TRAIN_CMD=(
  torchrun
  --nproc_per_node="${NPROC_PER_NODE:-8}"
  --master_port="${MASTER_PORT:-50012}"
  -m cosmos_framework.scripts.train
  --sft-toml="$TOML_FILE"
)
if (( ${#TAIL_OVERRIDES[@]} > 0 )); then
  TRAIN_CMD+=(-- "${TAIL_OVERRIDES[@]}")
fi

IMAGINAIRE_OUTPUT_ROOT="$IMAGINAIRE_OUTPUT_ROOT" \
  "${TRAIN_CMD[@]}" 2>&1 | tee "$LOG_FILE"

# Must directly follow the pipeline: report torchrun's status, not tee's.
EXIT_CODE=${PIPESTATUS[0]}
_sft_log "Done (exit $EXIT_CODE)"
exit "$EXIT_CODE"
43 changes: 43 additions & 0 deletions cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Structured-TOML launch for llava_ov (VLM SFT on
# lmms-lab/LLaVA-OneVision-Data via CosmosDataLoader). Drives
# cosmos_framework.scripts.train against toml/sft_config/llava_ov.toml.
#
# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
#
# Requires an activated cosmos-framework venv (see the finetune README
# Prerequisites). Run from cookbooks/cosmos3/finetune/.
#
# The dataset streams from the HuggingFace Hub, so DATASET_PATH /
# WAN_VAE_PATH / BASE_CHECKPOINT_PATH are NOT required.
#
# Optional env:
# HF_TOKEN for gated Qwen3-VL-8B-Instruct downloads.
# VLM_SAFETENSORS_PATH local directory of pre-converted Qwen3-VL safetensors
# (e.g. a Cosmos3-Nano LM merged with Qwen3-VL visual via
# `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
# When set, plumbed to backbone.safetensors_path via a
# tail override. When unset, the framework falls back
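# to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
# EXTRA_TAIL_OVERRIDES whitespace-separated Hydra overrides appended after `--`
# (e.g. data_setting.max_tokens=16000).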
#
# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
# bash launch_sft_llava_ov.sh

# Recipe TOML, relative to the cookbook dir (anchored by the common launcher).
TOML_FILE="toml/sft_config/llava_ov.toml"

# Seed the Hydra tail overrides from the optional, whitespace-separated
# EXTRA_TAIL_OVERRIDES env var. `read -a` splits on whitespace WITHOUT pathname
# expansion, so overrides containing glob characters (e.g. `key=[500]`) are
# passed through literally. With `-d ''` read returns non-zero at end of input
# even though the array is assigned, hence `|| true`.
TAIL_OVERRIDES=()
read -r -d '' -a TAIL_OVERRIDES <<<"${EXTRA_TAIL_OVERRIDES:-}" || true

# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
# while keeping the public HF model_name for tokenizer/architecture discovery.
if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
  TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
fi

# Hand over to the shared launch plumbing (validates inputs, runs torchrun).
source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
45 changes: 45 additions & 0 deletions cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Structured-TOML launch for videophy2_sft_nano (VLM dialog SFT on VideoPhy-2
# via CosmosDataLoader). Drives cosmos_framework.scripts.train against
# toml/sft_config/videophy2_sft_nano.toml.
#
# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
#
# Requires an activated cosmos-framework venv (see the finetune README
# Prerequisites). Run from cookbooks/cosmos3/finetune/.
#
# Required env:
Comment thread
lfengad marked this conversation as resolved.
Outdated
# VIDEOPHYSICS_ROOT dir containing videophy2_train/ and videophy2_val/
# (each with meta.json + media/ + text/). Populate via
# `python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf`.
#
# Optional env:
# HF_TOKEN for gated Qwen3-VL-8B-Instruct downloads.
# VLM_SAFETENSORS_PATH local directory of pre-converted Qwen3-VL safetensors
# (e.g. Cosmos3-Nano LM merged with Qwen3-VL visual via
# `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
# When set, plumbed to backbone.safetensors_path via a
# tail override. When unset, the framework falls back
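# to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
# EXTRA_TAIL_OVERRIDES whitespace-separated Hydra overrides appended after `--`
# (e.g. data_setting.max_tokens=16000).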
#
# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
# VIDEOPHYSICS_ROOT=/path/to/videophysics bash launch_sft_videophy2_nano.sh

# Recipe TOML, relative to the cookbook dir (anchored by the common launcher).
TOML_FILE="toml/sft_config/videophy2_sft_nano.toml"

# Seed the Hydra tail overrides from the optional, whitespace-separated
# EXTRA_TAIL_OVERRIDES env var. `read -a` splits on whitespace WITHOUT pathname
# expansion, so overrides containing glob characters (e.g. `key=[500]`) are
# passed through literally. With `-d ''` read returns non-zero at end of input
# even though the array is assigned, hence `|| true`.
TAIL_OVERRIDES=()
read -r -d '' -a TAIL_OVERRIDES <<<"${EXTRA_TAIL_OVERRIDES:-}" || true

# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
# while keeping the public HF model_name for tokenizer/architecture discovery.
if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
  TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
fi

# Hand over to the shared launch plumbing (validates inputs, runs torchrun).
source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
30 changes: 30 additions & 0 deletions cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Structured-TOML launch for vision_sft_nano (T2V / I2V / V2V vision-only
# SFT on Qwen3-VL-8B, 8-GPU FSDP). Drives cosmos_framework.scripts.train against
# toml/sft_config/vision_sft_nano.toml.
#
# Requires an activated cosmos-framework venv (see the finetune README
# Prerequisites). Run from cookbooks/cosmos3/finetune/.
#
# Optional env vars (defaults below point under this cookbook dir; override to
# put data or checkpoints on a different filesystem):
# DATASET_PATH default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
# (must contain train/video_dataset_file.jsonl)
# BASE_CHECKPOINT_PATH default: checkpoints/Cosmos3-Nano
# WAN_VAE_PATH default: checkpoints/wan22_vae/Wan2.2_VAE.pth
# HF_TOKEN if any tokenizer download requires gated HF access
# OUTPUT_ROOT default: outputs/train
#
# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
# bash launch_sft_vision_nano.sh

# Recipe TOML (cookbook-relative; the common launcher anchors it).
TOML_FILE="toml/sft_config/vision_sft_nano.toml"

# Caller-provided values win; otherwise fall back to the cookbook-local defaults.
DATASET_PATH="${DATASET_PATH:-data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
BASE_CHECKPOINT_PATH="${BASE_CHECKPOINT_PATH:-checkpoints/Cosmos3-Nano}"

# Extra check eval'd by the common launcher once DATASET_PATH has been anchored:
# the dataset dir must contain the training manifest.
EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'

# Delegate to the shared launch plumbing that lives next to this wrapper.
. "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
36 changes: 36 additions & 0 deletions cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Structured-TOML launch for vision_sft_super (T2V / I2V / V2V LoRA SFT on
# Qwen3-VL-32B-Instruct, 8-GPU FSDP with CP=2 / DP=4). Drives
# cosmos_framework.scripts.train against toml/sft_config/vision_sft_super.toml.
#
# Requires an activated cosmos-framework venv (see the finetune README
# Prerequisites). Run from cookbooks/cosmos3/finetune/.
#
# Optional env vars (defaults below point under this cookbook dir; override to
# put data or checkpoints on a different filesystem):
# DATASET_PATH default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
# (must contain train/video_dataset_file.jsonl)
# BASE_CHECKPOINT_PATH default: checkpoints/Cosmos3-Super
# WAN_VAE_PATH default: checkpoints/wan22_vae/Wan2.2_VAE.pth
# HF_TOKEN if any tokenizer download requires gated HF access
# OUTPUT_ROOT default: outputs/train
#
# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
# bash launch_sft_vision_super.sh

# Recipe TOML (cookbook-relative; the common launcher anchors it).
TOML_FILE="toml/sft_config/vision_sft_super.toml"

# Caller-provided values win; otherwise fall back to the cookbook-local defaults.
DATASET_PATH="${DATASET_PATH:-data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
BASE_CHECKPOINT_PATH="${BASE_CHECKPOINT_PATH:-checkpoints/Cosmos3-Super}"

# Extra check eval'd by the common launcher once DATASET_PATH has been anchored:
# the dataset dir must contain the training manifest.
EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'

# Super-variant environment: LD_LIBRARY_PATH is emptied so host CUDA/NCCL libs
# do not bleed into the venv, and the allocator defaults to expandable_segments
# (a caller-set PYTORCH_ALLOC_CONF is kept) so the 32B backbone fits without OOM
# during compile/decode.
LD_LIBRARY_PATH=""
: "${PYTORCH_ALLOC_CONF:=expandable_segments:True}"
export LD_LIBRARY_PATH PYTORCH_ALLOC_CONF

# Delegate to the shared launch plumbing that lives next to this wrapper.
. "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
108 changes: 108 additions & 0 deletions cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# pre_exp012_llava_ov — VLM training on lmms-lab/LLaVA-OneVision-Data
# via CosmosDataLoader. Base config = cosmos_framework/configs/base/vlm/config.py
# (selected by [job].task="vlm").
#
# One knob that the SFTExperimentConfig dataclass does NOT model — supply
# it as a CLI extra override at launch time:
#
# data_setting.max_tokens=<int, drives both max_seq_len and dataloader.max_tokens>
#
# (The backbone is now modeled — see [model.backbone] below.)
#
# Example launch:
# torchrun --nproc_per_node=4 -m cosmos_framework.scripts.train \
# --sft-toml toml/sft_config/llava_ov.toml -- \
# data_setting.max_tokens=16000
#
# Per-task remap (see _PATH_REMAPS["vlm"]):
# model.parallelism.* -> model.config.parallelism.*
# model.compile.* -> model.config.compile.*
# model.activation_checkpointing.* -> model.config.activation_checkpointing.*
# model.precision -> model.config.precision
# model.attn_implementation -> model.config.policy.attn_implementation
# model.backbone.* -> model.config.policy.backbone.*
# model.ema.* -> model.config.ema.*
# model.{max_num_tokens_after_packing, joint_attn_implementation, lora_*,
# tokenizer.*} and dataloader_train.{max_sequence_length, seed} -> SKIPPED

# Job identity. `task` selects the base config (see the file header).
[job]
task = "vlm"
experiment = "pre_exp012_llava_ov"
project = "cosmos3" # matches legacy
group = "vlm_llava_ov_demo"
name = "pre_exp012_llava_ov"
wandb_mode = "disabled"

[model]
# VLM-only attention impl (PolicyConfig.attn_implementation).
attn_implementation = "cosmos" # "cosmos" | "flash_attention_2" | "sdpa" | "eager"
precision = "bfloat16" # was [model.parallelism].precision

[model.backbone]
model_name = "Qwen/Qwen3-VL-8B-Instruct" # → model.config.policy.backbone.model_name (VLM remap)

# Remapped to model.config.ema.* for the VLM task (see the header remap table).
[model.ema]
enabled = false
rate = 0.1
iteration_shift = 0

# Remapped to model.config.parallelism.* for the VLM task (see the header remap table).
[model.parallelism]
data_parallel_shard_degree = 8 # matches legacy dp_shard_size=8
data_parallel_replicate_degree = -1 # matches legacy dp_replicate_size=-1
context_parallel_shard_degree = 1
cfg_parallel_shard_degree = 1

# Remapped to model.config.compile.* for the VLM task (see the header remap table).
[model.compile]
enabled = false # was [model.parallelism].use_torch_compile
compile_dynamic = true

# Remapped to model.config.activation_checkpointing.* for the VLM task.
[model.activation_checkpointing]
mode = "full"
save_ops_regex = ["fmha"]
preserve_rng_state = true
determinism_check = "default"

[optimizer]
betas = [0.9, 0.95]
eps = 1.0e-8 # skipped for VLM by _PATH_REMAPS
fused = true
lr = 1.0e-5 # matches legacy
weight_decay = 0.1 # matches legacy
# keys_to_select / lr_multipliers omitted — VLM Trainer defaults apply.

# NOTE(review): warm_up_steps (1000) is larger than cycle_lengths / trainer.max_iter
# (500), so warm-up would not finish within the run — confirm this legacy value
# is intentional.
[scheduler]
cycle_lengths = [500] # matches legacy (VLM_LAMBDACOSINE_KWARGS uses ${trainer.max_iter})
f_max = [1.0]
f_min = [0.5] # matches legacy
f_start = [0.05] # matches legacy
verbosity_interval = 0 # skipped for VLM by _PATH_REMAPS
warm_up_steps = [1000] # matches legacy

[trainer]
distributed_parallelism = "fsdp"
grad_accum_iter = 1
logging_iter = 1
max_iter = 500 # matches legacy

[trainer.callbacks.compile_tokenizer]
compile_after_iterations = 3
enabled = false

[trainer.callbacks.grad_clip]
clip_norm = 1.0
force_finite = false # matches VLM default in cosmos_framework/configs/base/vlm/defaults/callbacks.py:55

[checkpoint]
keys_to_skip_loading = []
load_path = "???" # MISSING sentinel; skipped by build_hydra_overrides — supply at runtime
save_iter = 100

[dataloader_train]
# Routed by _PATH_REMAPS["vlm"] onto the CosmosDataLoader's nested PoolPackingBatcher:
# max_samples_per_batch -> dataloader_train.batcher.max_batch_size
# max_sequence_length -> dataloader_train.batcher.max_tokens
# NOTE(review): the file header lists dataloader_train.max_sequence_length as
# SKIPPED for the VLM task, which contradicts the routing above — confirm which
# is current and update the stale comment.
max_samples_per_batch = 1
max_sequence_length = 16000
Loading