
Commit a77725e

samsja and claude authored
refactor(slurm): move NCCL and runtime configs from template to config (#1818)
* refactor(slurm): move NCCL and runtime configs from template to config

  - Add port/timeout fields to SharedWeightBroadcastConfig
  - Propagate port/timeout in auto_setup_weight_broadcast
  - Add auto_setup_slurm_nccl validator that sets:
    - orchestrator.num_train_workers from SLURM topology
    - trainer.weight_broadcast.host to 0.0.0.0
    - trainer.weight_broadcast.inference_world_size from SLURM topology
  - Add validate_inference_config to catch missing inference config early
  - Remove hardcoded env vars and CLI args from SLURM template

  Only --weight_broadcast.host $MASTER_ADDR and --client.base-url remain as CLI args (runtime values from SLURM).

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat(slurm): add rl_slurm entrypoint and update docs

  - Add rl_slurm script entrypoint to pyproject.toml
  - Update docs to use `uv run rl_slurm` instead of `uv run python -m prime_rl.slurm.rl`

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: sami jaghouar <sami@primeintellect.ai>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 3102ca4 commit a77725e

File tree

5 files changed: +38 −17 lines changed

docs/slurm.md

Lines changed: 3 additions & 3 deletions

@@ -3,7 +3,7 @@
 For SLURM clusters, use the `rl_slurm` entrypoint. It resolves the full config (trainer, orchestrator, inference), dumps sub-configs as TOML files, renders a SLURM batch script from a Jinja2 template, and submits it with `sbatch`.
 
 ```bash
-uv run python -m prime_rl.slurm.rl @ examples/slurm/hendrycks_math.toml
+uv run rl_slurm @ examples/slurm/hendrycks_math.toml
 ```
 
 This will:
@@ -15,7 +15,7 @@ This will:
 To only generate the script without submitting, use `--dry-run`:
 
 ```bash
-uv run python -m prime_rl.slurm.rl @ examples/slurm/hendrycks_math.toml --dry-run
+uv run rl_slurm @ examples/slurm/hendrycks_math.toml --dry-run
 ```
 
 ## Configuration
@@ -123,7 +123,7 @@ dp = 2
 The default template handles a standard multi-node setup with NCCL weight broadcast, InfiniBand detection, and `srun`-based process dispatch. For more advanced use cases (custom partitions, account settings, module loads, different networking setups, etc.), provide your own Jinja2 template:
 
 ```bash
-uv run python -m prime_rl.slurm.rl \
+uv run rl_slurm \
   @ my_config.toml \
   --slurm-template path/to/my_template.sh.j2
 ```
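Since this commit moves port and timeout from template env vars into config fields, they can be set directly in the user's TOML instead of `BROADCAST_PORT`/`BROADCAST_TIMEOUT`. A hypothetical fragment (the `[weight_broadcast]` table name follows the `--weight_broadcast.*` CLI args in this commit; the filename is illustrative):

```toml
# examples/slurm/my_run.toml (hypothetical file)
[weight_broadcast]
type = "nccl"
port = 29501     # default added in this commit
timeout = 1200   # seconds; default added in this commit
```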

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -38,6 +38,7 @@ dependencies = [
 
 [project.scripts]
 rl = "prime_rl.rl:main"
+rl_slurm = "prime_rl.slurm.rl:main"
 trainer = "prime_rl.trainer.rl.train:main"
 orchestrator = "prime_rl.orchestrator.orchestrator:main"
 inference = "prime_rl.inference.server:main"

src/prime_rl/rl_config.py

Lines changed: 10 additions & 2 deletions

@@ -92,6 +92,9 @@ class SharedWeightBroadcastConfig(BaseSettings):
         "filesystem"
     )
 
+    port: Annotated[int, Field(description="The port to use for NCCL weight broadcast.")] = 29501
+    timeout: Annotated[int, Field(description="The timeout in seconds for NCCL weight broadcast.")] = 1200
+
 
 class BaseRLConfig(BaseSettings):
     """Configures an RL training run."""
@@ -306,10 +309,15 @@ def auto_setup_weight_broadcast(self):
         if self.weight_broadcast.type == "nccl":
             inference_world_size = self.inference.parallel.dp * self.inference.parallel.tp if self.inference else 1
             self.trainer.weight_broadcast = TrainerNCCLWeightBroadcastConfig(
-                type=self.weight_broadcast.type, inference_world_size=inference_world_size
+                type=self.weight_broadcast.type,
+                inference_world_size=inference_world_size,
+                port=self.weight_broadcast.port,
+                timeout=self.weight_broadcast.timeout,
             )
             self.orchestrator.weight_broadcast = OrchestratorNCCLWeightBroadcastConfig(
-                type=self.weight_broadcast.type
+                type=self.weight_broadcast.type,
+                port=self.weight_broadcast.port,
+                timeout=self.weight_broadcast.timeout,
             )
         elif self.weight_broadcast.type == "filesystem":
             self.trainer.weight_broadcast = TrainerFileSystemWeightBroadcastConfig()
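The propagation in `auto_setup_weight_broadcast` can be illustrated with a minimal, dependency-free sketch. Plain dataclasses stand in for the repo's pydantic models, and the field set is reduced; only the class names and the copied fields come from this commit:

```python
from dataclasses import dataclass


@dataclass
class SharedWeightBroadcastConfig:
    """Reduced stand-in for the shared config with the new fields."""
    type: str = "nccl"
    port: int = 29501    # new field: NCCL weight-broadcast port
    timeout: int = 1200  # new field: NCCL weight-broadcast timeout (seconds)


@dataclass
class TrainerNCCLWeightBroadcastConfig:
    """Reduced stand-in for the trainer-side config."""
    type: str
    inference_world_size: int
    port: int
    timeout: int


def propagate(shared: SharedWeightBroadcastConfig, dp: int, tp: int) -> TrainerNCCLWeightBroadcastConfig:
    # Mirror the validator: port/timeout are copied from the shared config,
    # and the inference world size is dp * tp.
    return TrainerNCCLWeightBroadcastConfig(
        type=shared.type,
        inference_world_size=dp * tp,
        port=shared.port,
        timeout=shared.timeout,
    )


trainer_cfg = propagate(SharedWeightBroadcastConfig(), dp=2, tp=4)
```

With this in place, overriding `port` or `timeout` on the shared config reaches both the trainer and orchestrator sides without any template changes.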

src/prime_rl/slurm/rl.py

Lines changed: 24 additions & 0 deletions

@@ -49,6 +49,30 @@ def auto_setup_dp_replicate(self):
         self.trainer.dp_replicate = self.num_train_nodes // self.nodes_per_fsdp_group
         return self
 
+    @model_validator(mode="after")
+    def auto_setup_slurm_nccl(self):
+        """Set SLURM-specific values for NCCL weight broadcast and num_train_workers."""
+        # Set num_train_workers based on SLURM topology
+        self.orchestrator.num_train_workers = self.num_train_nodes * self.gpus_per_node
+
+        # Set NCCL-specific values if using NCCL weight broadcast
+        if self.weight_broadcast is not None and self.weight_broadcast.type == "nccl":
+            # Trainer listens on all interfaces
+            self.trainer.weight_broadcast.host = "0.0.0.0"
+            # Compute inference world size from SLURM topology
+            self.trainer.weight_broadcast.inference_world_size = self.gpus_per_node * self.num_infer_nodes
+        return self
+
+    @model_validator(mode="after")
+    def validate_inference_config(self):
+        """Validate that inference config is provided when num_infer_nodes > 0."""
+        if self.num_infer_nodes > 0 and self.inference is None:
+            raise ValueError(
+                f"inference config is required when num_infer_nodes > 0 (got {self.num_infer_nodes}). "
+                "The SLURM template will launch inference servers on these nodes."
+            )
+        return self
+
 
 def write_subconfigs(config: RLSLURMConfig, output_dir: Path) -> None:
     """Write resolved subconfigs to disk as TOML files."""
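The topology arithmetic behind `auto_setup_slurm_nccl` is worth spelling out: every GPU on a train node becomes one train worker, and every GPU on an inference node becomes one rank in the NCCL broadcast group. A hedged sketch (the dataclass and function names here are illustrative, not the repo's):

```python
from dataclasses import dataclass


@dataclass
class SlurmTopology:
    """Hypothetical reduced model of the SLURM config's topology fields."""
    num_train_nodes: int
    num_infer_nodes: int
    gpus_per_node: int


def derive_nccl_settings(t: SlurmTopology) -> dict:
    # Mirror auto_setup_slurm_nccl: workers and world size are pure
    # functions of node counts and GPUs per node.
    return {
        "num_train_workers": t.num_train_nodes * t.gpus_per_node,
        "host": "0.0.0.0",  # trainer binds all interfaces for the broadcast
        "inference_world_size": t.gpus_per_node * t.num_infer_nodes,
    }


# Example: 2 train nodes + 1 inference node, 8 GPUs each.
settings = derive_nccl_settings(SlurmTopology(num_train_nodes=2, num_infer_nodes=1, gpus_per_node=8))
```

Deriving these in a validator instead of the template means a `--dry-run` config dump already shows the final values, and `validate_inference_config` can fail fast before `sbatch` is ever invoked.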

src/prime_rl/slurm/rl_slurm.sh.j2

Lines changed: 0 additions & 12 deletions

@@ -38,9 +38,6 @@ export OMP_NUM_THREADS=1
 export HOSTNAMES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
 export INFER_HOSTS=${HOSTNAMES[@]:0:$NUM_INFER_NODES}
 export TRAIN_HOSTS=${HOSTNAMES[@]:$NUM_INFER_NODES:$SLURM_JOB_NUM_NODES}
-export BROADCAST_PORT=${BROADCAST_PORT:-29501}
-export BROADCAST_TIMEOUT=${BROADCAST_TIMEOUT:-12000}
-export NCCL_COMM_TIMEOUT=${NCCL_BROADCAST_TIMEOUT:-12000}
 
 INFER_URLS=""
 for host in ${INFER_HOSTS[@]}; do
@@ -105,11 +102,7 @@ else
     uv run orchestrator \
       @ $CONFIG_DIR/orchestrator.toml \
       --weight_broadcast.host $MASTER_ADDR \
-      --weight_broadcast.port $BROADCAST_PORT \
-      --weight_broadcast.timeout $BROADCAST_TIMEOUT \
       --client.base-url $INFER_URLS \
-      --client.timeout 3600 \
-      --num-train-workers $((NUM_TRAIN_NODES * GPUS_PER_NODE)) \
       2>&1 | tee $OUTPUT_DIR/slurm/latest_orchestrator.log $OUTPUT_DIR/slurm/job_${SLURM_JOB_ID}_orchestrator.log & disown
 fi
@@ -132,11 +125,6 @@ else
     --local-ranks-filter=$(seq -s, 0 $((GPUS_PER_NODE - 1))) \
     -m prime_rl.trainer.rl.train \
     @ $CONFIG_DIR/trainer.toml \
-    --weight_broadcast.host 0.0.0.0 \
-    --weight_broadcast.port $BROADCAST_PORT \
-    --weight_broadcast.inference_world_size $((GPUS_PER_NODE * NUM_INFER_NODES)) \
-    --weight_broadcast.timeout $BROADCAST_TIMEOUT \
-    --dist_timeout_seconds $NCCL_COMM_TIMEOUT \
     2>&1 | tee -a $OUTPUT_DIR/slurm/latest_train_node_rank_${TRAIN_NODE_RANK}.log $OUTPUT_DIR/slurm/job_${SLURM_JOB_ID}_train_node_rank_${TRAIN_NODE_RANK}.log
 fi
'
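What survives in the template is the runtime-only logic: splitting the SLURM node list into inference and train hosts and building the client URLs. A standalone sketch of that slicing, with a fixed host list standing in for `scontrol show hostnames $SLURM_JOB_NODELIST` (the `:8000/v1` URL shape is an assumption for illustration):

```shell
#!/usr/bin/env bash
# Fixed host list in place of the real `scontrol` output.
HOSTNAMES=( node01 node02 node03 node04 )
SLURM_JOB_NUM_NODES=${#HOSTNAMES[@]}
NUM_INFER_NODES=1

# First NUM_INFER_NODES hosts serve inference; the remainder train.
INFER_HOSTS=( "${HOSTNAMES[@]:0:$NUM_INFER_NODES}" )
TRAIN_HOSTS=( "${HOSTNAMES[@]:$NUM_INFER_NODES:$SLURM_JOB_NUM_NODES}" )

# Build a comma-separated list of inference base URLs
# (the port and /v1 path are illustrative, not from the template).
INFER_URLS=""
for host in "${INFER_HOSTS[@]}"; do
  INFER_URLS="${INFER_URLS:+$INFER_URLS,}http://$host:8000/v1"
done

echo "infer: ${INFER_HOSTS[*]}"
echo "train: ${TRAIN_HOSTS[*]}"
echo "urls:  $INFER_URLS"
```

Quoting the arrays (`"${HOSTNAMES[@]:...}"`) keeps hostnames intact even under unusual IFS settings, which the unquoted template expansions rely on implicitly.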
