- **`[model.vlm]` (NEW — replaces auto-detection)**: VLM mode is now opt-in via a `[model.vlm]` sub-config with required `vision_encoder_attr` and `language_model_attr` fields. There is no auto-detection — if you train a VLM, you must add `[model.vlm]`. Existing multimodal configs need the new section. See `docs/multimodal.md` for the table of known model attrs. (2026-03-24)
- **`model.optimization_dtype` / `model.reduce_dtype` (VLM models, RL only)**: VLM dtype validation now only applies to RL training (`TrainerConfig`), not SFT. VLM models used with `sft` no longer require `optimization_dtype='bfloat16'` / `reduce_dtype='bfloat16'`. RL training still enforces both to match vLLM inference. (2026-03-24)
- **`model.optimization_dtype` / `model.reduce_dtype` (VLM models)**: Added validation that VLM models must use `optimization_dtype='bfloat16'` and `reduce_dtype='bfloat16'` to match vLLM inference. Previously valid configs with `float32` (the default) are now rejected for VLM model names. Set both fields to `"bfloat16"` when training VLMs. (2026-03-21)
- **`orchestrator.advantage.length_weighted_mean`**: Removed. The default advantage now always uses the plain per-problem mean baseline unless `orchestrator.advantage.length_shaping_alpha` is set. Existing configs must delete this field. (2026-03-19)

Enable VLM mode by adding a `[model.vlm]` section. Both fields are required — they tell prime-rl where the vision encoder and language model live on the model object:

```toml
[model]
name = "Qwen/Qwen3-VL-4B-Instruct"

[model.vlm]
vision_encoder_attr = "model.visual"
language_model_attr = "model.language_model"
```

For the registered models in the table above, use the attrs shown there. For custom VLMs, check your model's structure with `model.named_children()`.

Both fields are dotted attribute paths resolved on the loaded model. A bad path raises a `ValueError` immediately — there are no silent fallbacks.
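
A minimal sketch of how such dotted paths can be resolved with a fail-fast check. The helper name and the stand-in model object are hypothetical, not prime-rl's actual code:

```python
from types import SimpleNamespace


def resolve_attr_path(obj, path: str):
    """Resolve a dotted attribute path like 'model.language_model' on an object.

    Raises ValueError as soon as any segment is missing, mirroring the
    fail-fast behaviour described above. (Hypothetical helper.)
    """
    current = obj
    for segment in path.split("."):
        if not hasattr(current, segment):
            raise ValueError(
                f"Cannot resolve {path!r}: "
                f"{type(current).__name__} has no attribute {segment!r}"
            )
        current = getattr(current, segment)
    return current


# Stand-in for a loaded VLM with the Qwen3-VL layout shown above
vlm = SimpleNamespace(
    model=SimpleNamespace(visual="<vision encoder>", language_model="<language model>")
)
```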

The weight key prefix for NCCL broadcasting is derived automatically as `{language_model_attr}.layers.`.
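
For example, with the config above, the derived prefix looks like this (a sketch of the rule, not prime-rl's actual code):

```python
# Prefix derivation rule: "{language_model_attr}.layers."
language_model_attr = "model.language_model"
weight_key_prefix = f"{language_model_attr}.layers."
print(weight_key_prefix)  # model.language_model.layers.
```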

To add permanent support for a new model family, add an entry to `VLM_REGISTRY` in `src/prime_rl/utils/vlm.py`.
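
The exact schema of `VLM_REGISTRY` is not shown here; the sketch below only illustrates the idea of mapping a model family to its two attribute paths, with a hypothetical lookup helper:

```python
# Hypothetical sketch — the real VLM_REGISTRY in src/prime_rl/utils/vlm.py
# may use a different schema. The Qwen3-VL attrs match the config above.
VLM_REGISTRY = {
    "Qwen/Qwen3-VL": {
        "vision_encoder_attr": "model.visual",
        "language_model_attr": "model.language_model",
    },
}


def lookup_vlm_attrs(model_name: str):
    """Return the registered attr paths for a model name, or None."""
    for family, attrs in VLM_REGISTRY.items():
        if model_name.startswith(family):
            return attrs
    return None
```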

## Current Limitations

- **Vision encoder is frozen**: The vision encoder is automatically frozen during training. Only the language model is trained.
- **No multimodal-safe truncation**: Token sequences are truncated to `seq_len`, but `pixel_values` and `image_grid_thw` are passed through unchanged. If a multimodal sample exceeds `seq_len`, image tokens can be dropped while image tensors still describe the full set of images. Ensure `seq_len` covers your longest VLM samples.
- **Optimization dtype must be bfloat16**: VLM models must load in bfloat16 to match vLLM inference. Set `optimization_dtype = "bfloat16"` and `reduce_dtype = "bfloat16"` in your trainer config.
- **Higher KL mismatch with multi-image inputs**: VLM training exhibits higher KL mismatch between inference and trainer logprobs compared to text-only models, especially with multiple images per sample.
- **Images are not logged**: The images the VLM sees during training are not logged to monitors.
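
The bfloat16 limitation above translates to two fields in the trainer config (field names as in the changelog entries at the top):

```toml
[model]
optimization_dtype = "bfloat16"
reduce_dtype = "bfloat16"
```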

## How Multi-Turn VLM RL Training Works

VLM training uses the same `interleave_rollout` path as text-only models. Multi-turn trajectory steps are merged into a single training sample wherever the extension property holds (consecutive steps share a token prefix).
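
The extension check and the merging behaviour can be sketched as follows; the function names are illustrative, not prime-rl's actual implementation:

```python
def extends(prev_tokens: list[int], next_tokens: list[int]) -> bool:
    """True if next_tokens starts with prev_tokens as an exact prefix,
    i.e. the extension property holds and the steps can be merged."""
    return (
        len(next_tokens) >= len(prev_tokens)
        and next_tokens[: len(prev_tokens)] == prev_tokens
    )


def merge_steps(step_token_ids: list[list[int]]) -> list[list[int]]:
    """Greedily merge consecutive steps into training samples while each
    step extends the previous one; start a new sample when extension
    breaks (hypothetical sketch of the merging rule)."""
    samples: list[list[int]] = []
    for tokens in step_token_ids:
        if samples and extends(samples[-1], tokens):
            samples[-1] = tokens  # prompts are cumulative: keep the longest
        else:
            samples.append(tokens)
    return samples
```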

Images are handled via a `VLMImageCache` built once per batch:

1. **Extract**: Base64 images are decoded from trajectory step prompts into PIL images. Since prompts are cumulative, only new images per step are extracted.
2. **Preprocess**: Images are processed through the HuggingFace image processor, producing `pixel_values` and `image_grid_thw`.
3. **Attach**: Each training sample receives the cumulative `pixel_values` up to its last merged step.
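
A stdlib-only sketch of the extract step (step 1), assuming a single cumulative image stream of base64 strings per step; the function name is hypothetical, not the actual `VLMImageCache` code:

```python
import base64


def extract_new_images(step_images_b64: list[list[str]]) -> list[list[bytes]]:
    """For each trajectory step, decode only the images not seen in the
    previous step. Prompts are cumulative, so step i repeats step i-1's
    images followed by any new ones."""
    new_per_step: list[list[bytes]] = []
    seen = 0
    for images in step_images_b64:
        fresh = images[seen:]  # skip images already extracted earlier
        new_per_step.append([base64.b64decode(s) for s in fresh])
        seen = len(images)
    return new_per_step
```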

Each multimodal sample becomes its own micro-batch during training (no packing) since image tensor sizes vary per sample.

## vLLM Configuration

`VLLM_WORKER_MULTIPROC_METHOD=spawn` is required for VLM inference. This is set automatically when using `uv run rl @ ...`, but if you start the vLLM server yourself, make sure this environment variable is set.
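
When launching the server manually, export the variable first (the `vllm serve` invocation below is shown for illustration with the model from the config above):

```shell
# Required for VLM inference when starting the vLLM server yourself
export VLLM_WORKER_MULTIPROC_METHOD=spawn
echo "$VLLM_WORKER_MULTIPROC_METHOD"

# then start the server as usual, e.g.:
# vllm serve Qwen/Qwen3-VL-4B-Instruct
```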