71 changes: 71 additions & 0 deletions docs/models/llava_onevision1_5.md
@@ -0,0 +1,71 @@
# LLaVA-OneVision 1.5 Training

## Overview

LLaVA-OneVision 1.5 is a vision-language model, adapted here for efficient FSDP2 training with a lightweight example dataset and fast startup.

## Supported Features

| Feature | Support |
|---------|---------|
| **FSDP2** | ✅ |
| **FlashAttention 2** | ✅ |
| **Liger Kernel** | ✅ |
| **RMPAD** | ✅ |
| **Packing** | ✅ |


## Quick Start

See the example configuration and run script:
- **Example Config**: [examples/llava_onevision1_5/example.yaml](../../examples/llava_onevision1_5/example.yaml)
- **Run Script**: [examples/llava_onevision1_5/run.sh](../../examples/llava_onevision1_5/run.sh)
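
For a single-node run, the script reduces to a `torchrun` launch of the CLI module with the example config (condensed from `run.sh`; adjust the GPU count and port for your setup):

```bash
NGPUS=8
torchrun --nproc_per_node=${NGPUS} \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --master_port=12356 \
  -m lmms_engine.launch.cli \
  config_yaml=examples/llava_onevision1_5/example.yaml
```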

## Key Configuration

```yaml
trainer_type: fsdp2_trainer

dataset_config:
  dataset_type: vision_iterable
  dataset_format: yaml
  datasets:
    - path: data/open_thoughts_debug
      data_folder: ''
      data_type: arrow
  processor_config:
    processor_name: Jinghao-Guo/llavaov1.5-4B-instruct-converted-qwen
    processor_type: llava
  packing: true
  shuffle: true
  video_backend: qwen_vl_utils

model_config:
  load_from_pretrained_path: Jinghao-Guo/llavaov1.5-4B-instruct-converted-qwen
  attn_implementation: flash_attention_2

trainer_args:
  per_device_train_batch_size: 1
  gradient_checkpointing: true
  num_train_epochs: 1
  max_steps: 1
  report_to: none
  output_dir: out
  warmup_ratio: 0.0
  eval_strategy: 'no'
  dataloader_num_workers: 1
  bf16: true
  lr_scheduler_type: cosine
  use_liger_kernel: true
  use_rmpad: true
  fsdp2: true
  group_by_length: true
  fsdp_config:
    transformer_layer_cls_to_wrap:
      - LLaVAOneVision1_5_DecoderLayer
      - RiceBlock
    reshard_after_forward: false
  sp_ulysses_degree: 1
  print_batch_input_steps: -1
```
175 changes: 175 additions & 0 deletions examples/llava_onevision1_5/example.yaml
@@ -0,0 +1,175 @@
trainer_type: fsdp2_trainer

dataset_config:
  dataset_type: vision_iterable
  dataset_format: yaml
  datasets:
    - path: data/open_thoughts_debug
      data_folder: ""
      data_type: arrow
  processor_config:
    processor_name: Jinghao-Guo/llavaov1.5-4B-instruct-converted-qwen
    processor_type: llava
  packing: false
  shuffle: false
  video_backend: qwen_vl_utils

model_config:
  load_from_pretrained_path: Jinghao-Guo/llavaov1.5-4B-instruct-converted-qwen
  attn_implementation: flash_attention_2

trainer_args:
  output_dir: ./output/llava_onevision1_5_training
  # overwrite_output_dir: false
  do_train: true
  do_eval: false
  do_predict: false
  eval_strategy: 'no'
  prediction_loss_only: false
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 1
  eval_accumulation_steps: null
  eval_delay: 0
  torch_empty_cache_steps: null
  learning_rate: 0.00001
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  max_grad_norm: 1.0
  num_train_epochs: 1
  max_steps: 1000
  lr_scheduler_type: cosine
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.1
  warmup_steps: 0
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  # logging_dir: ./output/qwen3_llm_training/runs
  logging_strategy: steps
  logging_first_step: false
  logging_steps: 10
  logging_nan_inf_filter: true
  save_strategy: steps
  save_steps: 500
  save_total_limit: 2
  save_safetensors: true
  save_on_each_node: false
  save_only_model: false
  restore_callback_states_from_checkpoint: false
  # no_cuda: false
  use_cpu: false
  # use_mps_device: false
  seed: 42
  data_seed: null
  # jit_mode_eval: false
  bf16: true
  fp16: false
  # fp16_opt_level: O1
  # half_precision_backend: auto
  bf16_full_eval: false
  fp16_full_eval: false
  tf32: null
  local_rank: 0
  ddp_backend: null
  # tpu_num_cores: null
  # tpu_metrics_debug: false
  debug: []
  dataloader_drop_last: false
  eval_steps: null
  dataloader_num_workers: 8
  dataloader_prefetch_factor: null
  # past_index: -1
  run_name: qwen3_llm_training
  disable_tqdm: false
  remove_unused_columns: true
  label_names: null
  load_best_model_at_end: false
  metric_for_best_model: null
  greater_is_better: null
  ignore_data_skip: false
  fsdp: []
  # fsdp_min_num_params: 0
  fsdp_config:
    transformer_layer_cls_to_wrap: ["Qwen3DecoderLayer", "RiceBlock"]
    reshard_after_forward: true
    min_num_params: 0
    xla: false
    xla_fsdp_v2: false
    xla_fsdp_grad_ckpt: false
  # fsdp_transformer_layer_cls_to_wrap: null
  accelerator_config:
    split_batches: false
    dispatch_batches: null
    even_batches: true
    use_seedable_sampler: true
    non_blocking: false
    gradient_accumulation_kwargs: null
  parallelism_config: null
  deepspeed: null
  label_smoothing_factor: 0.0
  optim: adamw_torch_fused
  optim_args: null
  # adafactor: false
  group_by_length: true
  length_column_name: length
  report_to:
    - tensorboard
  project: huggingface
  trackio_space_id: trackio
  ddp_find_unused_parameters: null
  ddp_bucket_cap_mb: null
  ddp_broadcast_buffers: null
  dataloader_pin_memory: true
  dataloader_persistent_workers: false
  skip_memory_metrics: true
  # use_legacy_prediction_loop: false
  push_to_hub: false
  resume_from_checkpoint: null
  hub_model_id: null
  hub_strategy: every_save
  hub_token: <HUB_TOKEN>
  hub_private_repo: null
  hub_always_push: false
  hub_revision: null
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: null
  # include_inputs_for_metrics: false
  # include_for_metrics: []
  eval_do_concat_batches: true
  # fp16_backend: auto
  # push_to_hub_model_id: null
  # push_to_hub_organization: null
  # mp_parameters: ''
  auto_find_batch_size: false
  full_determinism: false
  # torchdynamo: null
  # ray_scope: last
  ddp_timeout: 1800
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  # include_tokens_per_second: false
  # include_num_input_tokens_seen: 'no'
  neftune_noise_alpha: null
  optim_target_modules: null
  batch_eval_metrics: false
  eval_on_start: false
  use_liger_kernel: true
  liger_kernel_config: null
  eval_use_gather_object: false
  average_tokens_across_devices: true
  # use_muon: false
  # freeze_modules: null
  use_rmpad: true
  fsdp2: true
  sp_ulysses_degree: 1
  reduce_dtype: bfloat16
  output_dtype: bfloat16
  print_batch_input_steps: 5
  enable_profiler: false
  profiler_config:
    start_step: 1
    end_step: 3
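
For a quick sanity check of the settings above, the effective global batch size is the product of the per-device batch size, the gradient accumulation steps, and the GPU count (a worked example assuming the single-node, 8-GPU launch from `run.sh`):

```python
# Effective global batch size for example.yaml on one 8-GPU node (assumed launch setup).
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
num_gpus = 8  # NGPUS in run.sh

global_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(global_batch_size)  # 8 samples per optimizer step
```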
98 changes: 98 additions & 0 deletions examples/llava_onevision1_5/run.sh
@@ -0,0 +1,98 @@
#!/bin/bash

################################################################################
# LLaVA-OneVision 1.5 Training with FSDP2 + Ulysses Sequence Parallel
################################################################################
#
# DESCRIPTION:
# Train the LLaVA-OneVision 1.5 vision-language model with support for long sequences
# using Ulysses Sequence Parallel and FSDP2 distributed training.
#
# KEY FEATURES:
# - Multi-resolution visual understanding
# - Ulysses SP for 10K+ visual tokens
# - Flash Attention 2 + unpadding (use_rmpad)
# - Sequence packing support
# - Liger Kernel fused operations
# - FSDP2 distributed training
#
# REQUIREMENTS:
# - 8x GPUs (A100/H100 recommended, 80GB VRAM)
# - flash-attn: pip install flash-attn --no-build-isolation
# - liger-kernel: pip install liger-kernel
#
# DATASET:
# Prepare your dataset in OpenAI chat format (JSONL/Arrow/Parquet):
# See: docs/user_guide/data_prep.md
#
# Example dataset entry:
# ```json
# {
#   "messages": [
#     {
#       "role": "user",
#       "content": [
#         {"type": "image", "image": "path/to/image.jpg"},
#         {"type": "text", "text": "Describe this image"}
#       ]
#     },
#     {
#       "role": "assistant",
#       "content": [{"type": "text", "text": "This image shows..."}]
#     }
#   ]
# }
# ```
#
# CONFIGURATION:
# Edit example.yaml to customize:
# - Model size: change load_from_pretrained_path (the example uses the 4B checkpoint)
# - Sequence length: adjust packing_length
# - SP degree: set sp_ulysses_degree (1/2/4/8)
# - Batch size: per_device_train_batch_size
# - Max frames: video_max_frames
#
# PERFORMANCE TIPS:
# - Adjust sp_ulysses_degree based on sequence length:
# * Degree 1: < 10K tokens
# * Degree 2: 10K-20K tokens
# * Degree 4: 20K-40K tokens
# * Degree 8: 40K+ tokens
# - Enable packing for better MFU: set packing: true
# - Use gradient_checkpointing for larger models (already enabled)
# - Monitor memory with: watch -n 1 nvidia-smi
#
################################################################################

# Number of GPUs
NGPUS=8
# Training command
torchrun --nproc_per_node=${NGPUS} \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --master_port=12356 \
  -m lmms_engine.launch.cli \
  config_yaml=examples/llava_onevision1_5/example.yaml
################################################################################
# MULTI-NODE TRAINING:
#
# On rank 0 node:
#   torchrun --nproc_per_node=8 \
#     --nnodes=2 \
#     --node_rank=0 \
#     --master_addr=<RANK_0_IP> \
#     --master_port=12356 \
#     -m lmms_engine.launch.cli \
#     config_yaml=examples/llava_onevision1_5/example.yaml
#
# On rank 1 node:
#   torchrun --nproc_per_node=8 \
#     --nnodes=2 \
#     --node_rank=1 \
#     --master_addr=<RANK_0_IP> \
#     --master_port=12356 \
#     -m lmms_engine.launch.cli \
#     config_yaml=examples/llava_onevision1_5/example.yaml
#
################################################################################
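
The DATASET note in the script header expects OpenAI-style chat entries stored one JSON object per line. A minimal conversion sketch (the record mirrors the example entry above; the output filename `train.jsonl` is only illustrative):

```python
import json

# One record in the OpenAI chat format shown in the script header.
record = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "path/to/image.jpg"},
                {"type": "text", "text": "Describe this image"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "This image shows..."}],
        },
    ]
}

# JSONL: one JSON object per line; repeat for every training example.
with open("train.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```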
8 changes: 8 additions & 0 deletions src/lmms_engine/models/__init__.py
@@ -4,6 +4,11 @@
from .dream_dllm import DreamDLLMConfig, DreamDLLMForMaskedLM
from .llada_dllm import LLaDADLLMConfig, LLaDADLLMForMaskedLM
from .llava_onevision import apply_liger_kernel_to_llava_onevision
from .llava_onevision1_5 import (
    LLaVAOneVision1_5_ForConditionalGeneration,
    Llavaonevision1_5Config,
    apply_liger_kernel_to_llava_onevision1_5,
)
from .monkey_patch import MONKEY_PATCHER
from .qwen2 import apply_liger_kernel_to_qwen2
from .qwen2_5_omni import (
@@ -67,4 +72,7 @@
"SiTModel",
"SiTConfig",
"SiT",
"Llavaonevision1_5Config",
"LLaVAOneVision1_5_ForConditionalGeneration",
"apply_liger_kernel_to_llava_onevision1_5",
]
18 changes: 18 additions & 0 deletions src/lmms_engine/models/llava_onevision1_5/__init__.py
@@ -0,0 +1,18 @@
from lmms_engine.mapping_func import register_model

from .configuration_llavaonevision1_5 import Llavaonevision1_5Config
from .modeling_llavaonevision1_5 import LLaVAOneVision1_5_ForConditionalGeneration
from .monkey_patch import apply_liger_kernel_to_llava_onevision1_5

register_model(
"llavaonevision1_5",
Llavaonevision1_5Config,
LLaVAOneVision1_5_ForConditionalGeneration,
model_general_type="image_text_to_text",
)

__all__ = [
"Llavaonevision1_5Config",
"LLaVAOneVision1_5_ForConditionalGeneration",
"apply_liger_kernel_to_llava_onevision1_5",
]
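
Once registered, the model can be imported from `lmms_engine.models`. Below is a hypothetical loading sketch: it assumes the class follows the standard Hugging Face `from_pretrained` interface (which the example config's `load_from_pretrained_path` key suggests) and uses the checkpoint referenced in the example config.

```python
import torch

from lmms_engine.models import LLaVAOneVision1_5_ForConditionalGeneration

# Hypothetical usage sketch: load the checkpoint from the example config with
# FlashAttention 2, mirroring model_config in example.yaml.
model = LLaVAOneVision1_5_ForConditionalGeneration.from_pretrained(
    "Jinghao-Guo/llavaov1.5-4B-instruct-converted-qwen",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)
```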