[trainer,cfg,rollout,algo] feat: (MOPD, 1/2) Multi-Teacher Model, Server Managers and Config #5834
JacobHelwig wants to merge 22 commits into verl-project:main
Conversation
Code Review
This pull request introduces a MultiTeacherModelManager to support multiple teacher models during distillation, refactoring the existing TeacherModelManager and AsyncTeacherLLMServerManager to accommodate this architecture. Key changes include moving configuration and tokenizer management into the server manager and introducing helper functions for async task execution. Review feedback identifies a potential issue with mixing auto_await and asyncio.run in compute_logprobs, incorrect type hints for coroutines in _run_all, and performance risks associated with redundant tokenizer loading in the AsyncTeacherLLMServerManager constructor.
```diff
 def compute_logprobs(self, data):
     self.wake_up()
     try:
-        return self._run_single(self.server_manager.compute_teacher_logprobs_batch(data))
+        return _run_single(self.server_manager.compute_teacher_logprobs_batch(data))
     finally:
         self.sleep()
```
The compute_logprobs method is synchronous, but it calls self.wake_up() and self.sleep(), which are decorated with @auto_await. When called synchronously, @auto_await typically runs the coroutine to completion using the current event loop or by creating a new one. However, _run_single explicitly uses asyncio.run(), which creates a new event loop and will fail if an event loop is already running in the same thread (which might happen if auto_await left a loop active).
More importantly, mixing auto_await for some calls and asyncio.run (via _run_single) for others in the same synchronous method is fragile and inefficient. It is better to wrap the entire sequence of operations into a single async method and run that once.
Suggested change:

```diff
 def compute_logprobs(self, data):
-    self.wake_up()
-    try:
-        return _run_single(self.server_manager.compute_teacher_logprobs_batch(data))
-    finally:
-        self.sleep()
+    async def _compute():
+        await self.wake_up()
+        try:
+            return await self.server_manager.compute_teacher_logprobs_batch(data)
+        finally:
+            await self.sleep()
+    return _run_single(_compute())
```
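The reviewer's suggestion can be sketched end-to-end. The class below is a hypothetical stand-in (method names mirror the PR, bodies are illustrative) showing that wrapping wake-up, compute, and sleep into one coroutine lets a synchronous caller enter the event loop exactly once:

```python
import asyncio


class TeacherManagerSketch:
    """Hypothetical stand-in for the real manager; bodies are illustrative."""

    def __init__(self):
        self.calls = []

    async def wake_up(self):
        self.calls.append("wake_up")

    async def compute_teacher_logprobs_batch(self, data):
        self.calls.append("compute")
        return [len(x) for x in data]

    async def sleep(self):
        self.calls.append("sleep")

    def compute_logprobs(self, data):
        # One coroutine wraps the whole wake -> compute -> sleep sequence,
        # so asyncio.run is entered exactly once from the sync caller.
        async def _compute():
            await self.wake_up()
            try:
                return await self.compute_teacher_logprobs_batch(data)
            finally:
                await self.sleep()

        return asyncio.run(_compute())


mgr = TeacherManagerSketch()
result = mgr.compute_logprobs(["ab", "cde"])
print(result)     # [2, 3]
print(mgr.calls)  # ['wake_up', 'compute', 'sleep']
```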
```python
async def _run_all(tasks: list[asyncio.Task]):
    await asyncio.gather(*tasks)
```
The type hint for tasks in _run_all is list[asyncio.Task], but the function is called with a list of coroutines (e.g., in TeacherModelManager.wake_up). While asyncio.gather accepts coroutines, the type hint is technically incorrect and might be misleading for static analysis tools.
Suggested change:

```diff
-async def _run_all(tasks: list[asyncio.Task]):
+async def _run_all(tasks: list):
     await asyncio.gather(*tasks)
```
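As a sketch of the typing point: `asyncio.gather` accepts any awaitable, so `collections.abc.Awaitable` is arguably a more precise element hint than either `asyncio.Task` or the suggested bare `list` (illustrative only, not the PR's code):

```python
import asyncio
from collections.abc import Awaitable


async def _run_all(tasks: list[Awaitable]) -> list:
    # gather accepts coroutine objects and Task instances alike.
    return await asyncio.gather(*tasks)


async def double(x: int) -> int:
    return 2 * x


async def main() -> list:
    # A bare coroutine and a wrapped Task both satisfy the Awaitable hint.
    return await _run_all([double(1), asyncio.ensure_future(double(2))])


print(asyncio.run(main()))  # [2, 4]
```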
```python
model_config = HFModelConfig(path=teacher_model_config.model_path)
text_tokenizer = model_config.tokenizer
if model_config.tokenizer is None:
    raise ValueError(f"Tokenizer is required for teacher model {teacher_model_config.model_path}")
self.pad_token_id = text_tokenizer.pad_token_id
```
Initializing HFModelConfig and accessing model_config.tokenizer inside __init__ may trigger redundant and expensive tokenizer loading every time an AsyncTeacherLLMServerManager is instantiated. Since this manager is initialized in the AgentLoopWorker, which can be numerous, this could lead to significant overhead and memory pressure. It is recommended to pass the pad_token_id directly or ensure the tokenizer is cached.
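One way to avoid the repeated loads is to memoize by model path. This is a generic sketch with a counting stub standing in for the real HF tokenizer load; `expensive_tokenizer_load` and the cache helper are hypothetical names, not verl APIs:

```python
from functools import lru_cache

LOAD_COUNT = 0


def expensive_tokenizer_load(model_path: str) -> dict:
    """Illustrative stand-in for the real (slow) tokenizer load."""
    global LOAD_COUNT
    LOAD_COUNT += 1
    return {"path": model_path, "pad_token_id": 0}


@lru_cache(maxsize=None)
def get_cached_tokenizer(model_path: str) -> dict:
    # Each distinct path is loaded once; later instantiations of the
    # server manager would reuse the cached result.
    return expensive_tokenizer_load(model_path)


for _ in range(3):
    tok = get_cached_tokenizer("teacher/model-a")
print(LOAD_COUNT)  # 1
```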
Force-pushed from 12f4b5b to 15e2efe.
We should merge this PR with the config change in #5774.

/gemini review
Code Review
This pull request refactors the distillation framework to support multi-teacher configurations. Key changes include the introduction of a MultiTeacherModelManager to coordinate multiple teacher instances, updates to the DistillationConfig and DistillationTeacherModelConfig schemas to support a dictionary of teacher models, and the addition of validation logic for teacher context lengths and inference engine parameters. Review feedback highlighted a concurrency risk when using asyncio.run() in an existing event loop, an indentation error in the teacher model resolution logic that would break multi-teacher support, and an initialization issue where the manager might use an unconfigured singular teacher field instead of the resolved configuration.
```python
def _run_single(task):
    async def run():
        return await task

    return asyncio.run(run())
```
Using asyncio.run() inside _run_single is dangerous because it raises a RuntimeError if an event loop is already running in the current thread. In the context of the Ray driver or actors where verl typically runs, an event loop is often already present. It is safer to use the @auto_await decorator, which handles both sync and async calling contexts correctly.
References
- Avoid using asyncio.run() in environments where an event loop might already be running; use safer alternatives like run_until_complete or auto-awaiting wrappers.
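The failure mode is easy to reproduce: `asyncio.run()` checks for a running loop in the current thread and raises rather than nesting. A minimal self-contained demonstration:

```python
import asyncio


async def nested_run():
    # asyncio.run refuses to start a nested event loop in a thread that
    # is already running one, so it raises RuntimeError here.
    coro = asyncio.sleep(0)
    try:
        asyncio.run(coro)
    except RuntimeError as err:
        coro.close()  # avoid a "coroutine was never awaited" warning
        return str(err)
    return "no error"


msg = asyncio.run(nested_run())
print(msg)
```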
verl/workers/config/distillation.py (outdated)
```python
for model_name, teacher_model in self.teacher_models.items():
    teacher_model = omega_conf_to_dataclass(teacher_model, dataclass_type=DistillationTeacherModelConfig)
    if teacher_model.is_configured(is_multi=True):
        teacher_models[model_name] = teacher_model
    return teacher_models
```
The return teacher_models statement is incorrectly indented inside the for loop. This causes the method to return after processing only the first teacher model entry, which breaks support for multiple teacher models (even if currently limited to one in the runtime path, the configuration resolution logic is incorrect).
References
- Ensure logic correctly handles all items in a collection before returning, unless an early exit is explicitly intended.
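The bug pattern is worth seeing in isolation. The two toy resolvers below (illustrative names, not the PR's code) differ only in the indentation of the `return`:

```python
def resolve_buggy(raw: dict) -> dict:
    resolved = {}
    for name, cfg in raw.items():
        resolved[name] = cfg
        return resolved  # BUG: exits after processing only the first entry


def resolve_fixed(raw: dict) -> dict:
    resolved = {}
    for name, cfg in raw.items():
        resolved[name] = cfg
    return resolved  # return dedented out of the loop: all entries kept


teachers = {"qwen": {}, "llama": {}}
print(len(resolve_buggy(teachers)))  # 1
print(len(resolve_fixed(teachers)))  # 2
```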
```python
"""TODO: MOPD -- split resource pool across teachers and init one TeacherModelManager per teacher."""
self.teacher_model_manager = TeacherModelManager(
    distillation_config=self.distillation_config,
    teacher_model_config=self.distillation_config.teacher_model,
```
The MultiTeacherModelManager is passing self.distillation_config.teacher_model directly to the TeacherModelManager. However, if the user has configured the teacher(s) using the new teacher_models (plural) dictionary, the singular teacher_model field will be empty (default), leading to initialization failures. You should use self.distillation_config.get_single_teacher_model() to retrieve the correctly resolved teacher configuration.
References
- Always use resolved configuration objects or helper methods that account for multiple configuration sources (e.g., singular vs plural fields).
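A sketch of the resolution helper the review asks for: a `get_single_teacher_model` that prefers the plural `teacher_models` dict and falls back to the singular field. Field and method names mirror the PR, but the bodies here are assumptions:

```python
from dataclasses import dataclass, field


@dataclass
class DistillationConfigSketch:
    """Hypothetical sketch of the config; not the PR's actual dataclass."""

    teacher_model: dict = field(default_factory=dict)
    teacher_models: dict = field(default_factory=dict)

    def get_single_teacher_model(self) -> dict:
        # Prefer the plural dict; the singular field is the fallback.
        if self.teacher_models:
            if len(self.teacher_models) != 1:
                raise ValueError("exactly one teacher is supported for now")
            return next(iter(self.teacher_models.values()))
        return self.teacher_model


cfg = DistillationConfigSketch(teacher_models={"default": {"model_path": "teacher-a"}})
print(cfg.get_single_teacher_model())  # {'model_path': 'teacher-a'}
```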
```yaml
# multi-teacher configs
teacher_models: {}

# single-teacher config
```
The single-teacher config should be absorbed into teacher_models, since a single teacher is a special case of MOPD.
We could place the single-teacher config under teacher_models with a special key (e.g., `default`)?
What does this PR do?
Classes and configs for managing multiple sets of teacher models and servers for multi-teacher OPD. This PR only supports a single teacher.
Testing
Tests to demonstrate no regression in single teacher OPD.
Script (collapsed in the PR); plots: GSM8k eval acc, GSM8k train acc, GSM8k distillation loss.
Design & Code Changes
- `MultiTeacherModelManager`: new class that manages multiple `TeacherModelManager`s for MOPD.
- `AsyncTeacherLLMServerManager`: add an attribute `server_manager: AsyncLLMServerManager`. Previously, `AsyncTeacherLLMServerManager` inherited from `AsyncLLMServerManager`, but for MOPD it will manage one `AsyncLLMServerManager` per teacher.
- `teacher_model` entry for single teacher OPD.

For the multi-teacher training script, teacher model args will be specified as:
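The example itself was cut off in the capture above. Purely as a hypothetical sketch (the key and field names are assumptions, not the PR's actual schema), a `teacher_models` entry might look like:

```yaml
# Hypothetical sketch only; not the PR's actual schema.
distillation:
  teacher_models:
    teacher_a:
      model_path: /path/to/teacher-a
    teacher_b:
      model_path: /path/to/teacher-b
```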