@@ -481,10 +481,9 @@ def make_opd(
481481 Loss: ReverseKLLoss - minimizes KL(student || teacher) per token.
482482
483483 Prerequisites:
484- - Rollouts must include teacher logprobs via one of:
485- - RolloutEngine.generate_batch(teacher_client=...)
486- - External post-processing that populates SAWItem.attachments.teacher_logps
487- - Collator must extract teacher_logps into batch["teacher_logps"]
484+ - Agent must have a TokenLevelScorer with name="teacher_logps"
485+ - Use make_vllm_teacher_scorer() to create the scorer
486+ - Scorer runs during Agent.act() and scores flow through to training
488487
489488 Args:
490489 kl_coeff: Coefficient for KL loss. Higher values push the student
@@ -495,22 +494,25 @@ def make_opd(
495494
496495 Example:
497496 ```python
498- from ludic.training import RolloutBatchSource, Trainer, make_opd
499- from ludic.training.distillation import TinkerTeacherClient
500-
501- # Create teacher client
502- teacher = TinkerTeacherClient(sampling_client=teacher_sampling_client)
503-
504- # Create batch source with teacher
505- batch_source = RolloutBatchSource(
506- engine=engine,
507- make_requests=make_requests_fn,
508- credit_assigner=make_opd().credit_assigner,
509- teacher_client=teacher,
497+ from ludic.training import Trainer, make_opd
498+ from ludic.training.scoring import make_vllm_teacher_scorer
499+ from ludic.agent import Agent
500+
501+ # Create teacher scorer
502+ teacher_scorer = make_vllm_teacher_scorer(
503+ base_url="http://localhost:8001",
504+ model="Qwen/Qwen3-32B",
505+ )
506+
507+ # Attach to agent - scores flow through automatically
508+ agent = Agent(
509+ client=client,
510+ ...,
511+ scorers=[teacher_scorer],
510512 )
511513
512514 # Train with OPD
513- trainer = Trainer(model=model, algorithm =make_opd(), ...)
515+ trainer = Trainer(model=model, algo=make_opd(), ...)
514516 ```
515517
516518 Reference: https://thinkingmachines.ai/blog/on-policy-distillation
0 commit comments