71 changes: 71 additions & 0 deletions docs/models/llava_onevision1_5.md
@@ -0,0 +1,71 @@
# LLaVA-OneVision 1.5 Training

## Overview

LLaVA-OneVision 1.5 is a vision-language model adapted for efficient FSDP2 training, with a lightweight debug dataset and fast startup.

## Supported Features

| Feature | Support |
|---------|---------|
| **FSDP2** | ✅ |
| **FlashAttention 2** | ✅ |
| **Liger Kernel** | ✅ |
| **RMPAD** | ✅ |
| **Packing** | ✅ |


## Quick Start

See the example configuration and run script:
- **Example Config**: [examples/llava_onevision1_5/example.yaml](../../examples/llava_onevision1_5/example.yaml)
- **Run Script**: [examples/llava_onevision1_5/run.sh](../../examples/llava_onevision1_5/run.sh)
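
A minimal single-node launch, mirroring the bundled run script (8 GPUs and the rendezvous settings are just the script's defaults):

```bash
# Launch FSDP2 training with the example config (values taken from run.sh).
NGPUS=8
torchrun --nproc_per_node=${NGPUS} \
    --nnodes=1 \
    --node_rank=0 \
    --master_addr=127.0.0.1 \
    --master_port=12356 \
    -m lmms_engine.launch.cli \
    config_yaml=examples/llava_onevision1_5/example.yaml
```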

## Key Configuration

```yaml
trainer_type: fsdp2_trainer

dataset_config:
  dataset_type: vision_iterable
  dataset_format: yaml
  datasets:
    - path: data/open_thoughts_debug
      data_folder: ''
      data_type: arrow
  processor_config:
    processor_name: Jinghao-Guo/llavaov1.5-4B-instruct-converted
    processor_type: llava
  packing: true
  shuffle: true
  video_backend: qwen_vl_utils

model_config:
  load_from_pretrained_path: Jinghao-Guo/llavaov1.5-4B-instruct-converted
  attn_implementation: flash_attention_2

trainer_args:
  per_device_train_batch_size: 1
  gradient_checkpointing: true
  num_train_epochs: 1
  max_steps: 1
  report_to: none
  output_dir: out
  warmup_ratio: 0.0
  eval_strategy: 'no'
  dataloader_num_workers: 1
  bf16: true
  lr_scheduler_type: cosine
  use_liger_kernel: true
  use_rmpad: true
  fsdp2: true
  group_by_length: true
  fsdp_config:
    transformer_layer_cls_to_wrap:
      - LLaVAOneVision1_5_DecoderLayer
      - RiceBlock
    reshard_after_forward: false
  sp_ulysses_degree: 1
  print_batch_input_steps: -1
```
41 changes: 41 additions & 0 deletions examples/llava_onevision1_5/example.yaml
@@ -0,0 +1,41 @@
trainer_type: fsdp2_trainer

dataset_config:
  dataset_type: vision_iterable
  dataset_format: yaml
  datasets:
    - path: data/open_thoughts_debug
      data_folder: ""
      data_type: arrow
  processor_config:
    processor_name: Jinghao-Guo/llavaov1.5-4B-instruct-converted
    processor_type: llava
  packing: false
  shuffle: false
  video_backend: qwen_vl_utils

model_config:
  load_from_pretrained_path: Jinghao-Guo/llavaov1.5-4B-instruct-converted
  attn_implementation: flash_attention_2

trainer_args:
  per_device_train_batch_size: 1
  gradient_checkpointing: true
  num_train_epochs: 1
  max_steps: 1
  report_to: none
  output_dir: out
  warmup_ratio: 0.0
  eval_strategy: "no"
  dataloader_num_workers: 1
  bf16: true
  lr_scheduler_type: cosine
  use_liger_kernel: true
  use_rmpad: true
  fsdp2: true
  group_by_length: true
  fsdp_config:
    transformer_layer_cls_to_wrap: ["LLaVAOneVision1_5_DecoderLayer", "RiceBlock"]
    reshard_after_forward: false
  sp_ulysses_degree: 1
  print_batch_input_steps: -1
98 changes: 98 additions & 0 deletions examples/llava_onevision1_5/run.sh
@@ -0,0 +1,98 @@
#!/bin/bash

################################################################################
# LLaVA-OneVision 1.5 Training with FSDP2 + Ulysses Sequence Parallel
################################################################################
#
# DESCRIPTION:
# Train the LLaVA-OneVision 1.5 vision-language model with support for long sequences
# using Ulysses Sequence Parallel and FSDP2 distributed training.
#
# KEY FEATURES:
# - Multi-resolution visual understanding
# - Ulysses SP for 10K+ visual tokens
# - Flash Attention 2 + unpadding (use_rmpad)
# - Sequence packing support
# - Liger Kernel fused operations
# - FSDP2 distributed training
#
# REQUIREMENTS:
# - 8x GPUs (A100/H100 recommended, 80GB VRAM)
# - flash-attn: pip install flash-attn --no-build-isolation
# - liger-kernel: pip install liger-kernel
#
# DATASET:
# Prepare your dataset in OpenAI chat format (JSONL/Arrow/Parquet):
# See: docs/user_guide/data_prep.md
#
# Example dataset entry:
# ```json
#   {
#     "messages": [
#       {
#         "role": "user",
#         "content": [
#           {"type": "image", "image": "path/to/image.jpg"},
#           {"type": "text", "text": "Describe this image"}
#         ]
#       },
#       {
#         "role": "assistant",
#         "content": [{"type": "text", "text": "This image shows..."}]
#       }
#     ]
#   }
# ```
#
# CONFIGURATION:
# Edit example.yaml to customize:
# - Model size (e.g., 4B/8B): change load_from_pretrained_path
# - Sequence length: adjust packing_length
# - SP degree: set sp_ulysses_degree (1/2/4/8)
# - Batch size: per_device_train_batch_size
# - Max frames: video_max_frames
#
# PERFORMANCE TIPS:
# - Adjust sp_ulysses_degree based on sequence length:
# * Degree 1: < 10K tokens
# * Degree 2: 10K-20K tokens
# * Degree 4: 20K-40K tokens
# * Degree 8: 40K+ tokens
# - Enable packing for better MFU: set packing: true
# - Use gradient_checkpointing for larger models (already enabled)
# - Monitor memory with: watch -n 1 nvidia-smi
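#
# Example (illustrative values only): for roughly 20K-token packed sequences,
# you might set the following keys in examples/llava_onevision1_5/example.yaml:
#   packing: true
#   sp_ulysses_degree: 2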
#
################################################################################

# Number of GPUs
NGPUS=8
# Training command
torchrun --nproc_per_node=${NGPUS} \
    --nnodes=1 \
    --node_rank=0 \
    --master_addr=127.0.0.1 \
    --master_port=12356 \
    -m lmms_engine.launch.cli \
    config_yaml=examples/llava_onevision1_5/example.yaml
################################################################################
# MULTI-NODE TRAINING:
#
# On rank 0 node:
# torchrun --nproc_per_node=8 \
#   --nnodes=2 \
#   --node_rank=0 \
#   --master_addr=<RANK_0_IP> \
#   --master_port=12356 \
#   -m lmms_engine.launch.cli \
#   config_yaml=examples/llava_onevision1_5/example.yaml
#
# On rank 1 node:
# torchrun --nproc_per_node=8 \
#   --nnodes=2 \
#   --node_rank=1 \
#   --master_addr=<RANK_0_IP> \
#   --master_port=12356 \
#   -m lmms_engine.launch.cli \
#   config_yaml=examples/llava_onevision1_5/example.yaml
#
################################################################################
8 changes: 8 additions & 0 deletions src/lmms_engine/models/__init__.py
@@ -4,6 +4,11 @@
from .dream_dllm import DreamDLLMConfig, DreamDLLMForMaskedLM
from .llada_dllm import LLaDADLLMConfig, LLaDADLLMForMaskedLM
from .llava_onevision import apply_liger_kernel_to_llava_onevision
from .llava_onevision1_5 import (
    LLaVAOneVision1_5_ForConditionalGeneration,
    Llavaonevision1_5Config,
    apply_liger_kernel_to_llava_onevision1_5,
)
from .monkey_patch import MONKEY_PATCHER
from .qwen2 import apply_liger_kernel_to_qwen2
from .qwen2_5_omni import (
@@ -67,4 +72,7 @@
"SiTModel",
"SiTConfig",
"SiT",
"Llavaonevision1_5Config",
"LLaVAOneVision1_5_ForConditionalGeneration",
"apply_liger_kernel_to_llava_onevision1_5",
]
18 changes: 18 additions & 0 deletions src/lmms_engine/models/llava_onevision1_5/__init__.py
@@ -0,0 +1,18 @@
from lmms_engine.mapping_func import register_model

from .configuration_llavaonevision1_5 import Llavaonevision1_5Config
from .modeling_llavaonevision1_5 import LLaVAOneVision1_5_ForConditionalGeneration
from .monkey_patch import apply_liger_kernel_to_llava_onevision1_5

register_model(
    "llavaonevision1_5",
    Llavaonevision1_5Config,
    LLaVAOneVision1_5_ForConditionalGeneration,
    model_general_type="image_text_to_text",
)

__all__ = [
    "Llavaonevision1_5Config",
    "LLaVAOneVision1_5_ForConditionalGeneration",
    "apply_liger_kernel_to_llava_onevision1_5",
]