NVIDIA · foreverlms · Jun 17, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.gitignore b/.gitignore
@@ -181,6 +181,14 @@ packages/
 cookbooks/cosmos3/generator/audiovisual/outputs/
 outputs/
 
+# Cosmos3 finetune cookbook runtime artifacts (downloads, converted ckpts, runs)
+cookbooks/cosmos3/finetune/data/
+cookbooks/cosmos3/finetune/checkpoints/
+cookbooks/cosmos3/finetune/outputs/
+
+# Superpowers design specs / implementation plans (kept local, not tracked)
+docs/superpowers/
+
 # Streamlit
 .streamlit/
 

diff --git a/cookbooks/cosmos3/finetune/README.md b/cookbooks/cosmos3/finetune/README.md
diff --git a/cookbooks/cosmos3/finetune/_sft_launcher_common.sh b/cookbooks/cosmos3/finetune/_sft_launcher_common.sh
@@ -0,0 +1,118 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Shared launch plumbing for the cookbook launch_sft_<recipe>.sh wrappers — the
+# structured-TOML / pydantic-schema flow that drives cosmos_framework.scripts.train.
+#
+# This file is sourced by a wrapper, never executed directly: the cookbook dir
+# is derived from the sourcing script's location (BASH_SOURCE[1]), and the
+# final `exit` ends the wrapper with torchrun's exit status.
+#
+# REQUIRES: an activated cosmos-framework venv (see the finetune README
+# Prerequisites) so `cosmos_framework` is importable. This launcher does NOT
+# add the framework's .venv/bin to PATH.
+#
+# Caller MUST set before sourcing:
+#   TOML_FILE            recipe TOML, e.g. "toml/sft_config/<recipe>.toml".
+#                        Absolute or cookbook-relative.
+#
+# Caller MAY set before sourcing (presence drives which existence checks fire):
+#   DATASET_PATH         recipe-local dataset dir, e.g. "data/<name>".
+#                        If unset, no dataset existence check fires
+#                        (reasoner / HF-streaming case).
+#   BASE_CHECKPOINT_PATH recipe-local base DCP dir, e.g. "checkpoints/<name>".
+#                        Setting it also enables WAN_VAE_PATH plumbing + check.
+#   WAN_VAE_PATH         override the default checkpoints/wan22_vae/Wan2.2_VAE.pth.
+#   EXTRA_DATASET_CHECK  bash snippet (string) eval'd after the default checks.
+#                        Executed verbatim, so only set it to trusted code.
+#   TAIL_OVERRIDES       bash array of Hydra CLI overrides appended after `--`
+#                        (e.g. data_setting.max_tokens=16000 for VLM smokes).
+#   OUTPUT_ROOT          output dir; default outputs/train. Logs are written
+#                        to $OUTPUT_ROOT/logs.
+#   IMAGINAIRE_OUTPUT_ROOT  set in torchrun's environment; default $OUTPUT_ROOT.
+#   MASTER_PORT          torchrun --master_port; default 50012.
+#   NPROC_PER_NODE       torchrun --nproc_per_node; default 8.
+#   LOG_FILENAME         log file name under $OUTPUT_ROOT/logs
+#                        (default <toml-stem>_sft.log).
+#
+# Absolute paths are passed through; relative paths are anchored to the cookbook
+# dir (the directory containing the sourcing wrapper). Paths set in the caller's
+# shell via `export DATASET_PATH=...` etc. win over the wrapper's defaults (use
+# the `: "${VAR:=default}"` idiom in the wrapper to preserve this).
+
+# No `-e`: failures are handled by the explicit checks below, and the final exit
+# status is read from PIPESTATUS[0], i.e. torchrun's own status rather than tee's.
+set -uo pipefail
+
+: "${TOML_FILE:?TOML_FILE must be set before sourcing _sft_launcher_common.sh}"
+
+# Cookbook dir = the wrapper's own directory (cookbooks/cosmos3/finetune/).
+WORKDIR="$(cd "$(dirname "${BASH_SOURCE[1]}")" && pwd)" \
+    || { echo "ERROR: cannot resolve the wrapper's directory" >&2; exit 1; }
+
+# Anchor relative paths to $WORKDIR.
+[[ "$TOML_FILE" = /* ]] || TOML_FILE="$WORKDIR/$TOML_FILE"
+
+if [[ -n "${DATASET_PATH:-}" ]]; then
+    [[ "$DATASET_PATH" = /* ]] || DATASET_PATH="$WORKDIR/$DATASET_PATH"
+    export DATASET_PATH
+fi
+
+if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
+    [[ "$BASE_CHECKPOINT_PATH" = /* ]] || BASE_CHECKPOINT_PATH="$WORKDIR/$BASE_CHECKPOINT_PATH"
+    WAN_VAE_PATH="${WAN_VAE_PATH:-checkpoints/wan22_vae/Wan2.2_VAE.pth}"
+    [[ "$WAN_VAE_PATH" = /* ]] || WAN_VAE_PATH="$WORKDIR/$WAN_VAE_PATH"
+    export BASE_CHECKPOINT_PATH WAN_VAE_PATH
+fi
+
+# OUTPUT_ROOT is anchored like the other paths: the log dir is created before
+# the `cd "$WORKDIR"` below and written to after it, so a caller-relative value
+# would otherwise resolve to two different directories.
+OUTPUT_ROOT="${OUTPUT_ROOT:-$WORKDIR/outputs/train}"
+[[ "$OUTPUT_ROOT" = /* ]] || OUTPUT_ROOT="$WORKDIR/$OUTPUT_ROOT"
+LOG_DIR="$OUTPUT_ROOT/logs"
+TOML_STEM="$(basename "$TOML_FILE" .toml)"
+LOG_FILE="$LOG_DIR/${LOG_FILENAME:-${TOML_STEM}_sft.log}"
+IMAGINAIRE_OUTPUT_ROOT="${IMAGINAIRE_OUTPUT_ROOT:-$OUTPUT_ROOT}"
+mkdir -p "$LOG_DIR" || { echo "ERROR: cannot create log dir: $LOG_DIR" >&2; exit 1; }
+
+echo ">>> $(date '+%H:%M:%S') Checking inputs..."
+[[ -f "$TOML_FILE" ]] || { echo "ERROR: TOML not found: $TOML_FILE" >&2; exit 1; }
+if [[ -n "${DATASET_PATH:-}" ]]; then
+    [[ -d "$DATASET_PATH" ]] || { echo "ERROR: DATASET_PATH not found: $DATASET_PATH (run Step 1 of the finetune README, or export DATASET_PATH=<path>)" >&2; exit 1; }
+fi
+if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
+    [[ -d "$BASE_CHECKPOINT_PATH" ]] || { echo "ERROR: BASE_CHECKPOINT_PATH not found: $BASE_CHECKPOINT_PATH (run Step 2 of the finetune README, or export BASE_CHECKPOINT_PATH=<path>)" >&2; exit 1; }
+    [[ -f "$WAN_VAE_PATH" ]]         || { echo "ERROR: WAN_VAE_PATH not found: $WAN_VAE_PATH (run Step 1 of the finetune README, or export WAN_VAE_PATH=<path>)" >&2; exit 1; }
+fi
+# Recipe-specific check supplied by the wrapper; it is eval'd in this shell, so
+# it can `exit 1` to abort the launch.
+if [[ -n "${EXTRA_DATASET_CHECK:-}" ]]; then eval "$EXTRA_DATASET_CHECK"; fi
+
+cd "$WORKDIR" || { echo "ERROR: cannot cd to $WORKDIR" >&2; exit 1; }
+echo ">>> $(date '+%H:%M:%S') WORKDIR:    $WORKDIR"
+echo ">>> $(date '+%H:%M:%S') TOML:       $TOML_FILE"
+if [[ -n "${DATASET_PATH:-}" ]]; then echo ">>> $(date '+%H:%M:%S') dataset:    $DATASET_PATH"; fi
+if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then echo ">>> $(date '+%H:%M:%S') checkpoint: $BASE_CHECKPOINT_PATH"; fi
+echo ">>> $(date '+%H:%M:%S') log:        $LOG_FILE"
+
+# Default empty if caller didn't set; safe under set -u.
+[[ ${TAIL_OVERRIDES+x} ]] || TAIL_OVERRIDES=()
+
+TRAILING_ARGS=()
+if (( ${#TAIL_OVERRIDES[@]} > 0 )); then
+    TRAILING_ARGS=(-- "${TAIL_OVERRIDES[@]}")
+fi
+
+# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; a bare
+# "${arr[@]}" is an "unbound variable" error under `set -u` before bash 4.4.
+IMAGINAIRE_OUTPUT_ROOT="$IMAGINAIRE_OUTPUT_ROOT" \
+    torchrun --nproc_per_node="${NPROC_PER_NODE:-8}" --master_port="${MASTER_PORT:-50012}" -m cosmos_framework.scripts.train \
+    --sft-toml="$TOML_FILE" \
+    ${TRAILING_ARGS[@]+"${TRAILING_ARGS[@]}"} \
+    2>&1 | tee "$LOG_FILE"
+
+# Report torchrun's status, not tee's.
+EXIT_CODE=${PIPESTATUS[0]}
+echo ">>> $(date '+%H:%M:%S') Done (exit $EXIT_CODE)"
+exit "$EXIT_CODE"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh b/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for llava_ov (VLM SFT on
+# lmms-lab/LLaVA-OneVision-Data via CosmosDataLoader). Drives
+# cosmos_framework.scripts.train against toml/sft_config/llava_ov.toml.
+#
+# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# The dataset streams from the HuggingFace Hub, so DATASET_PATH /
+# WAN_VAE_PATH / BASE_CHECKPOINT_PATH are NOT required.
+#
+# Optional env:
+#   HF_TOKEN               for gated Qwen3-VL-8B-Instruct downloads.
+#   VLM_SAFETENSORS_PATH   local directory of pre-converted Qwen3-VL safetensors
+#                          (e.g. a Cosmos3-Nano LM merged with Qwen3-VL visual via
+#                          `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
+#                          When set, plumbed to backbone.safetensors_path via a
+#                          tail override. When unset, the framework falls back
+#                          to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
+#   EXTRA_TAIL_OVERRIDES   whitespace-separated Hydra overrides appended after `--`
+#                          (e.g. "data_setting.max_tokens=16000"). A single override
+#                          cannot contain whitespace; glob characters stay literal.
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_llava_ov.sh
+
+TOML_FILE="toml/sft_config/llava_ov.toml"
+
+# Split EXTRA_TAIL_OVERRIDES on whitespace into the override array. `read -a`
+# word-splits without pathname expansion, so Hydra values such as `key=[1,2]`
+# or `key=*` are never glob-expanded against files in the current directory.
+TAIL_OVERRIDES=()
+if [[ -n "${EXTRA_TAIL_OVERRIDES:-}" ]]; then
+    IFS=$' \t\n' read -r -d '' -a TAIL_OVERRIDES <<< "$EXTRA_TAIL_OVERRIDES" || true
+fi
+
+# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
+# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
+# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
+# while keeping the public HF model_name for tokenizer/architecture discovery.
+if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
+    TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
+fi
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for videophy2_sft_nano (VLM dialog SFT on VideoPhy-2
+# via CosmosDataLoader). Drives cosmos_framework.scripts.train against
+# toml/sft_config/videophy2_sft_nano.toml.
+#
+# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# Required env:
+#   VIDEOPHYSICS_ROOT  dir containing videophy2_train/ and videophy2_val/
+#                      (each with meta.json + media/ + text/). Populate via
+#                      `python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf`.
+#
+# Optional env:
+#   HF_TOKEN               for gated Qwen3-VL-8B-Instruct downloads.
+#   VLM_SAFETENSORS_PATH   local directory of pre-converted Qwen3-VL safetensors
+#                          (e.g. Cosmos3-Nano LM merged with Qwen3-VL visual via
+#                          `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
+#                          When set, plumbed to backbone.safetensors_path via a
+#                          tail override. When unset, the framework falls back
+#                          to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
+#   EXTRA_TAIL_OVERRIDES   whitespace-separated Hydra overrides appended after `--`
+#                          (e.g. "data_setting.max_tokens=16000"). A single override
+#                          cannot contain whitespace; glob characters stay literal.
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   VIDEOPHYSICS_ROOT=/path/to/videophysics bash launch_sft_videophy2_nano.sh
+
+TOML_FILE="toml/sft_config/videophy2_sft_nano.toml"
+
+# Fail fast on the documented required input instead of after torchrun has
+# started its worker processes.
+: "${VIDEOPHYSICS_ROOT:?VIDEOPHYSICS_ROOT must be set (dir containing videophy2_train/ and videophy2_val/)}"
+[[ -d "$VIDEOPHYSICS_ROOT" ]] || { echo "ERROR: VIDEOPHYSICS_ROOT not found: $VIDEOPHYSICS_ROOT" >&2; exit 1; }
+
+# Split EXTRA_TAIL_OVERRIDES on whitespace into the override array. `read -a`
+# word-splits without pathname expansion, so Hydra values such as `key=[1,2]`
+# or `key=*` are never glob-expanded against files in the current directory.
+TAIL_OVERRIDES=()
+if [[ -n "${EXTRA_TAIL_OVERRIDES:-}" ]]; then
+    IFS=$' \t\n' read -r -d '' -a TAIL_OVERRIDES <<< "$EXTRA_TAIL_OVERRIDES" || true
+fi
+
+# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
+# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
+# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
+# while keeping the public HF model_name for tokenizer/architecture discovery.
+if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
+    TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
+fi
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Wrapper for the vision_sft_nano recipe (T2V / I2V / V2V vision-only SFT on
+# Qwen3-VL-8B, 8-GPU FSDP): picks toml/sft_config/vision_sft_nano.toml and hands
+# off to _sft_launcher_common.sh, which runs cosmos_framework.scripts.train.
+#
+# Needs an activated cosmos-framework venv (finetune README, Prerequisites).
+# Run from cookbooks/cosmos3/finetune/.
+#
+# Environment overrides (the defaults resolve under this cookbook dir; set them
+# to keep data or checkpoints on another filesystem):
+#   DATASET_PATH          default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
+#                         (must contain train/video_dataset_file.jsonl)
+#   BASE_CHECKPOINT_PATH  default: checkpoints/Cosmos3-Nano
+#   WAN_VAE_PATH          default: checkpoints/wan22_vae/Wan2.2_VAE.pth
+#   HF_TOKEN              if any tokenizer download requires gated HF access
+#   OUTPUT_ROOT           default: outputs/train
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_vision_nano.sh
+
+TOML_FILE="toml/sft_config/vision_sft_nano.toml"
+DATASET_PATH="${DATASET_PATH:-data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
+BASE_CHECKPOINT_PATH="${BASE_CHECKPOINT_PATH:-checkpoints/Cosmos3-Nano}"
+
+EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Wrapper for the vision_sft_super recipe (T2V / I2V / V2V LoRA SFT on
+# Qwen3-VL-32B-Instruct, 8-GPU FSDP with CP=2 / DP=4): picks
+# toml/sft_config/vision_sft_super.toml and hands off to _sft_launcher_common.sh.
+#
+# Needs an activated cosmos-framework venv (finetune README, Prerequisites).
+# Run from cookbooks/cosmos3/finetune/.
+#
+# Environment overrides (the defaults resolve under this cookbook dir; set them
+# to keep data or checkpoints on another filesystem):
+#   DATASET_PATH          default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
+#                         (must contain train/video_dataset_file.jsonl)
+#   BASE_CHECKPOINT_PATH  default: checkpoints/Cosmos3-Super
+#   WAN_VAE_PATH          default: checkpoints/wan22_vae/Wan2.2_VAE.pth
+#   HF_TOKEN              if any tokenizer download requires gated HF access
+#   OUTPUT_ROOT           default: outputs/train
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_vision_super.sh
+
+TOML_FILE="toml/sft_config/vision_sft_super.toml"
+DATASET_PATH="${DATASET_PATH:-data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
+BASE_CHECKPOINT_PATH="${BASE_CHECKPOINT_PATH:-checkpoints/Cosmos3-Super}"
+
+EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
+
+# Super-only environment: LD_LIBRARY_PATH is emptied so host CUDA/NCCL libs do
+# not bleed into the venv; the allocator defaults to expandable_segments so the
+# 32B backbone fits without OOM during compile/decode (a preset value wins).
+LD_LIBRARY_PATH=""
+PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}"
+export LD_LIBRARY_PATH PYTORCH_ALLOC_CONF
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml b/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml
@@ -0,0 +1,108 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# pre_exp012_llava_ov — VLM training on lmms-lab/LLaVA-OneVision-Data
+# via CosmosDataLoader. Base config = cosmos_framework/configs/base/vlm/config.py
+# (selected by [job].task="vlm").
+#
+# One knob that the SFTExperimentConfig dataclass does NOT model — supply
+# it as a CLI extra override at launch time:
+#
+#   data_setting.max_tokens=<int, drives both max_seq_len and dataloader.max_tokens>
+#
+# (The backbone is now modeled — see [model.backbone] below.)
+#
+# Example launch:
+#   torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \
+#       --sft-toml toml/sft_config/llava_ov.toml -- \
+#       data_setting.max_tokens=16000
+#
+# Per-task remap (see _PATH_REMAPS["vlm"]):
+#   model.parallelism.*            -> model.config.parallelism.*
+#   model.compile.*                -> model.config.compile.*
+#   model.activation_checkpointing.* -> model.config.activation_checkpointing.*
+#   model.precision                -> model.config.precision
+#   model.attn_implementation      -> model.config.policy.attn_implementation
+#   model.backbone.*               -> model.config.policy.backbone.*
+#   model.ema.*                    -> model.config.ema.*
+#   model.{max_num_tokens_after_packing, joint_attn_implementation, lora_*,
+#          tokenizer.*} and dataloader_train.{max_sequence_length, seed} -> SKIPPED
+
+[job]
+task         = "vlm"
+experiment   = "pre_exp012_llava_ov"
+project      = "cosmos3"                                 # matches legacy
+group        = "vlm_llava_ov_demo"
+name         = "pre_exp012_llava_ov"
+wandb_mode   = "disabled"
+
+[model]
+# VLM-only attention impl (PolicyConfig.attn_implementation).
+attn_implementation = "cosmos"     # "cosmos" | "flash_attention_2" | "sdpa" | "eager"
+precision           = "bfloat16"   # was [model.parallelism].precision
+
+[model.backbone]
+model_name = "Qwen/Qwen3-VL-8B-Instruct"   # → model.config.policy.backbone.model_name (VLM remap)
+
+[model.ema]
+enabled         = false
+rate            = 0.1
+iteration_shift = 0
+
+[model.parallelism]
+data_parallel_shard_degree      = 8                  # matches legacy dp_shard_size=8
+data_parallel_replicate_degree  = -1                 # matches legacy dp_replicate_size=-1
+context_parallel_shard_degree   = 1
+cfg_parallel_shard_degree       = 1
+
+[model.compile]
+enabled                         = false              # was [model.parallelism].use_torch_compile
+compile_dynamic                 = true
+
+[model.activation_checkpointing]
+mode                = "full"
+save_ops_regex      = ["fmha"]
+preserve_rng_state  = true
+determinism_check   = "default"
+
+[optimizer]
+betas         = [0.9, 0.95]
+eps           = 1.0e-8                              # skipped for VLM by _PATH_REMAPS
+fused         = true
+lr            = 1.0e-5                              # matches legacy
+weight_decay  = 0.1                                 # matches legacy
+# keys_to_select / lr_multipliers omitted — VLM Trainer defaults apply.
+
+[scheduler]
+cycle_lengths      = [500]                          # matches legacy (VLM_LAMBDACOSINE_KWARGS uses ${trainer.max_iter})
+f_max              = [1.0]
+f_min              = [0.5]                          # matches legacy
+f_start            = [0.05]                         # matches legacy
+verbosity_interval = 0                              # skipped for VLM by _PATH_REMAPS
+warm_up_steps      = [1000]                         # matches legacy
+
+[trainer]
+distributed_parallelism = "fsdp"
+grad_accum_iter         = 1
+logging_iter            = 1
+max_iter                = 500                     # matches legacy
+
+[trainer.callbacks.compile_tokenizer]
+compile_after_iterations = 3
+enabled                  = false
+
+[trainer.callbacks.grad_clip]
+clip_norm    = 1.0
+force_finite = false                                # matches VLM default in cosmos_framework/configs/base/vlm/defaults/callbacks.py:55
+
+[checkpoint]
+keys_to_skip_loading = []
+load_path            = "???"                      # MISSING sentinel; skipped by build_hydra_overrides — supply at runtime
+save_iter            = 100
+
+[dataloader_train]
+# Routed by _PATH_REMAPS["vlm"] onto the CosmosDataLoader's nested PoolPackingBatcher:
+#   max_samples_per_batch -> dataloader_train.batcher.max_batch_size
+#   max_sequence_length   -> dataloader_train.batcher.max_tokens
+max_samples_per_batch = 1
+max_sequence_length   = 16000