PrimeIntellect-ai · mikasenghaas · Mar 31, 2026 · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 Documenting changes which affect configuration usage patterns (added/moved/removed/renamed fields, notable logic changes).
 
+- **`orchestrator.buffer.env_ratios` → per-env `orchestrator.env[].ratio`**: `buffer.env_ratios` has been removed. Set `ratio` on each `[[orchestrator.env]]` entry instead. Ratios must be all-or-nothing across envs (either all have a ratio or none do). (2026-04-05)
+- **`orchestrator.val` removed**: The `[orchestrator.val]` config section (`ValConfig`) has been removed. Validation was a separate concern from the env group refactor. Existing configs must delete this section. (2026-04-05)
+- **`orchestrator.max_concurrent` removed**: Concurrency limiting via `max_concurrent` and the global semaphore have been removed. Existing configs must delete this field. (2026-04-05)
+- **`orchestrator.buffer.hash_keys` default changed**: Default changed from `["task", "prompt"]` to `["env_name", "prompt"]`. The `task` field is no longer overridden by the orchestrator for env identification; `env_name` is used instead. Buffer checkpoints using the old default may not resume correctly. (2026-04-05)
+- **`orchestrator.eval.env[].num_examples` / `rollouts_per_example` no longer fall through**: `num_examples` and `rollouts_per_example` are now required per eval env and no longer inherit from the top-level `orchestrator.eval` section. (2026-04-05)
+- **`eval/{name}/failed_rollouts` metric is now a ratio**: The `eval/{name}/failed_rollouts` metric now reports a ratio (0.0–1.0) instead of a raw count. Dashboards that read this metric should be updated. (2026-04-05)
+- **`orchestrator.sampling.temp_scheduler` removed**: Temperature scheduling (`TemperatureSchedulerConfig`) has been removed. `sampling.temperature` is now a non-optional `float` with default `1.0`. Existing configs using `temp_scheduler` must replace it with a fixed `temperature` value. (2026-04-05)
+- **`orchestrator.verification` removed**: The `[orchestrator.verification]` config section (`VerificationConfig`) has been removed. Rollout scoring is now always enabled. Existing configs must delete this section. (2026-04-05)
 - **`log.file` and `log.env_worker_logs` removed**: Removed `log.file` (from `LogConfig` and `SharedLogConfig`) and `log.env_worker_logs` (from `LogConfig`). Python file logging is replaced by deployment-level capture. Existing configs using these fields must delete them. Log paths unified: `.stdout` files renamed to `.log`, SLURM logs moved from `slurm/` to `logs/`. (2026-03-31)
 - **`trainer.log.ranks_filter` (NEW)**: Added `ranks_filter: list[int]` to `TrainerLogConfig` (default: `[0]`). Controls which ranks appear in trainer console output via torchrun's `--local-ranks-filter`. (2026-03-31)
 - **`wandb.log_extras.sample_ratio` / monitor sample logging defaults**: `wandb.log_extras.sample_ratio` is now actually applied to W&B sample-table logging via the shared monitor sampler (it was previously a no-op for WandB). Separately, the orchestrator no longer hard-caps sample logging to 8 rollouts before monitor-level sampling runs, so when monitor `sample_ratio` is `None`, monitors now receive and may log the full rollout batch for a step instead of at most 8 rollouts. This affects both W&B and Prime monitor sample logging behavior. (2026-03-27)

diff --git a/configs/ci/nightly/multimodal_color_codeword.toml b/configs/ci/nightly/multimodal_color_codeword.toml
@@ -24,10 +24,6 @@ max_tokens = 64
 id = "color-codeword"
 args = { images_per_turn = 1, max_turns = 3, num_examples = 1000, seed = 42 }
 
-[orchestrator.val]
-interval = 5
-num_examples = 100
-
 [trainer]
 
 [trainer.model]

diff --git a/configs/hendrycks_math/rl.toml b/configs/hendrycks_math/rl.toml
@@ -24,10 +24,6 @@ args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "defau
 easy_threshold = 1.0
 hard_threshold = 0.0
 
-[orchestrator.val]
-interval = 5
-num_examples = 128
-
 [orchestrator.eval]
 interval = 10
 

diff --git a/configs/math_group/rl.toml b/configs/math_group/rl.toml
@@ -8,28 +8,25 @@ name = "math-group"
 name = "Qwen/Qwen3-4B-Instruct-2507"
 
 [orchestrator]
-batch_size = 512
-rollouts_per_example = 16
-oversampling_factor = 1.5
+batch_size = 256
+rollouts_per_example = 8
 
 [[orchestrator.env]]
 id = "math-env"
 name = "hendrycks-math"
 args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "default" }
+ratio = 0.5
 
 [[orchestrator.env]]
 id = "math-env"
 name = "acereason-math"
 args = { dataset_name = "nvidia/AceReason-Math", dataset_subset = "default", question_key = "problem" }
+ratio = 0.5
 
 [orchestrator.buffer]
 easy_threshold = 1.0
 hard_threshold = 0.0
 
-[orchestrator.val]
-interval = 5
-num_examples = 128
-
 [orchestrator.eval]
 interval = 50
 

diff --git a/configs/multi_reverse_text/rl.toml b/configs/multi_reverse_text/rl.toml
@@ -13,16 +13,11 @@ max_tokens = 128
 
 [[orchestrator.env]]
 id = "reverse-text"
-address = "tcp://127.0.0.1:5000" # requires: uv run env-server --env.id reverse-text --env.address tcp://127.0.0.1:5000
 
 [[orchestrator.env]]
 id = "reverse-text"
 name = "reverse-text-2"
 
-[orchestrator.val]
-interval = 1
-num_examples = 16
-
 [orchestrator.eval]
 interval = 5
 

diff --git a/configs/multimodal/rl_color_codeword.toml b/configs/multimodal/rl_color_codeword.toml
@@ -20,10 +20,6 @@ max_tokens = 64
 id = "color-codeword"
 args = { images_per_turn = 1, max_turns = 3, num_examples = 1000, seed = 42 }
 
-[orchestrator.val]
-interval = 1
-num_examples = 100
-
 [trainer]
 
 [trainer.model]

diff --git a/configs/nemotron_4node/rl.toml b/configs/nemotron_4node/rl.toml
@@ -60,10 +60,6 @@ args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "defau
 easy_threshold = 1.0
 hard_threshold = 0.0
 
-[orchestrator.val]
-interval = 5
-num_examples = 128
-
 [orchestrator.eval]
 interval = 10
 

diff --git a/configs/nemotron_debug/rl.toml b/configs/nemotron_debug/rl.toml
@@ -59,10 +59,6 @@ args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "defau
 easy_threshold = 1.0
 hard_threshold = 0.0
 
-[orchestrator.val]
-interval = 5
-num_examples = 128
-
 [orchestrator.eval]
 interval = 10
 

diff --git a/examples/Intellect-3.1/rl.toml b/examples/Intellect-3.1/rl.toml
@@ -50,30 +50,34 @@ oversampling_factor = 2
 [[orchestrator.env]]
 id = "mini-swe-agent-plus"
 name = "swe"
+ratio = 0.3
 args = { max_turns = 200, cpu_cores = 2,  memory_gb = 4, disk_size_gb = 4, labels = ["mini-swe-agent-plus"], total_timeout_minutes = 720, sandbox_client_max_workers = 256, max_command_timeouts = 3, sandbox_command_timeout = 30}
 
 [[orchestrator.env]]
 id = "deepdive"
 name = "deepdive"
+ratio = 0.2
 args = { finish_with_tool = true, open_max_workers = 128, cache_dir = "/tmp/i3_deepdive_cache_train" }
 
 [[orchestrator.env]]
 id = "math-env"
 name = "math"
+ratio = 0.3
 args = { min_avg_reward = 0.0, max_avg_reward = 0.874}
 
 [[orchestrator.env]]
 id = "logic-env"
-args = { min_avg_reward = 0.0, max_avg_reward = 0.874 }
 name = "logic"
+ratio = 0.2
+args = { min_avg_reward = 0.0, max_avg_reward = 0.874 }
 
 [[orchestrator.env]]
 id = "code-env"
 name = "code"
+ratio = 0.2
 args = { pool_size = 512 }
 
 [orchestrator.buffer]
-env_ratios = [0.3, 0.2, 0.3, 0.2, 0.2]
 easy_threshold = 1.0
 online_difficulty_filtering = true
 seed = 42