 import dataclasses
 import os
 import queue
+import threading
 import time
 from collections import defaultdict
 from concurrent import futures
@@ -395,10 +396,15 @@ def __init__(
         self._should_stop_value = False
         self._should_stop_timeout_s = 5

-        self._executor = futures.ThreadPoolExecutor(max_workers=1)
-        self._prefetch_future = self._executor.submit(self._prefetch_worker)
+        # Initialize instance variables before starting threads
         self.tracking = _init_tool_tracking()
         self.request_outputs = {}
+        self._threads_started = threading.Event()
+
+        # Start background threads
+        self._executor = futures.ThreadPoolExecutor(max_workers=2)
+        self._prefetch_future = self._executor.submit(self._prefetch_worker)
+        self._process_future = self._executor.submit(self._process_from_queue)

     def get_model_dims_dict(self):
         """Get only the model dimensions as a simple dict without loading weights."""
@@ -431,8 +437,9 @@ def _should_stop(self) -> bool:

     def _prefetch_worker(self, sleep_length_s: int = 1):
         """Background worker that prefetches requests until we have enough buffered."""
+        self._threads_started.set()
         while True:
-            if self._should_stop():
+            if not self.inflight_updates and self._should_stop():
                 time.sleep(sleep_length_s)
                 continue
             current_unfinished = self.llm_engine.get_num_unfinished_requests()
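
The new guard only pauses prefetching when in-flight updates are disabled; with `inflight_updates=True` the worker keeps feeding the engine straight through a weight sync. The decision reduced to a sketch, with `should_stop` standing in for the actor's `_should_stop()`:

```python
def prefetch_should_pause(inflight_updates: bool, should_stop: bool) -> bool:
    # inflight_updates=True  -> never pause: generation overlaps weight updates.
    # inflight_updates=False -> pause while a stop is requested, so no new
    #                           requests enter the engine during a sync.
    return not inflight_updates and should_stop
```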
@@ -456,58 +463,18 @@ def _insert_result_to_queue(self, result, is_eval: bool):
         results_queue = self.eval_results_queue if is_eval else self.results_queue
         results_queue.put(result)

-    def _should_exit(self) -> bool:
-        """Determine if the processing loop should exit.
-
-        Returns:
-            bool: True if the loop should exit, False otherwise.
-        """
-        # Check stop condition first (cheapest check)
-        stop_requested = self._should_stop()
-
-        # Case 1: inflight_updates enabled and stop requested - exit immediately
-        if self.inflight_updates and stop_requested:
-            return True
-
-        # Now check for pending work (only if needed)
-        if stop_requested:
-            # Need to check if we have pending work
-            pending_tools = len(self.tracking["pending_tool_futures"])
-            unfinished = self.llm_engine.get_num_unfinished_requests()
-
-            # Case 2: stop requested and no pending work - exit
-            if pending_tools == 0 and unfinished == 0:
-                return True
-            # Otherwise, we have pending work and should continue
-            return False
-
-        # No stop requested - check if there's any work to do
-        pending_tools = len(self.tracking["pending_tool_futures"])
-        unfinished = self.llm_engine.get_num_unfinished_requests()
-
-        # Case 3: no work left at all - exit
-        if pending_tools == 0 and unfinished == 0:
-            return True
-
-        # Otherwise, continue processing
-        return False
-
-    def process_from_queue(self, timeout: float = 60.0):
+    def _process_from_queue(self, timeout: float = 60.0):
         """Run generation loop using LLMEngine directly, with optional tool support.

-        Runs continuously until should_stop is set, periodically adding new requests
-        and yielding control to allow weight synchronization.
+        Runs continuously in a background thread, processing requests from the engine.

         Returns:
             int: Number of requests processed
         """
-
-        # Use persistent instance variables for tracking and outputs
-        # This ensures state is maintained across multiple calls
         total_processed = 0
         iteration_count = 0

-        while not self._should_exit():
+        while True:
             iteration_count += 1

             # Health check: ensure prefetch worker is alive. This will raise if it has crashed.
@@ -558,17 +525,7 @@ def process_from_queue(self, timeout: float = 60.0):
                     total_processed += self._finalize_sub_request(
                         output.request_id, output, complete_output, current_time
                     )
-
-            if self.verbose and iteration_count % 100 == 0:
-                final_unfinished = self.llm_engine.get_num_unfinished_requests()
-                pending_tools = len(self.tracking["pending_tool_futures"])
-                self.logger.info(
-                    f"process_from_queue iteration {iteration_count}: unfinished={final_unfinished}, pending_tools={pending_tools}"
-                )
-
-            # If we have only pending tools but no unfinished requests, sleep briefly
-            # to let pending tools complete before the next iteration
-            if self.llm_engine.get_num_unfinished_requests() == 0 and len(self.tracking["pending_tool_futures"]) > 0:
+            if self.llm_engine.get_num_unfinished_requests() == 0:
                 time.sleep(1)

         return total_processed
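
With the exit logic gone, the loop runs for the actor's lifetime and liveness is observed from outside rather than from a return value. A caller-side sketch (the function and its parameters are illustrative; only `check_background_threads` comes from this PR):

```python
import ray

def poll_engines(vllm_engines, results_queue, timeout_s=600):
    # Sketch: the generation loop never returns, so a trainer polls results
    # and surfaces background-thread crashes explicitly on each step.
    for engine in vllm_engines:
        ray.get(engine.check_background_threads.remote())  # re-raises crashes
    return results_queue.get(timeout=timeout_s)  # filled by _process_from_queue
```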
@@ -870,10 +827,22 @@ def init_process_group(
            args=(master_address, master_port, rank_offset, world_size, group_name, backend, use_ray, timeout_minutes),
        )

+    def _maybe_drain_requests(self, sleep_s: float = 0.1):
+        while not self.inflight_updates:
+            pending_tools = len(self.tracking["pending_tool_futures"])
+            unfinished = self.llm_engine.get_num_unfinished_requests()
+
+            if pending_tools == 0 and unfinished == 0:
+                break
+
+            time.sleep(sleep_s)
+
     def update_weight(self, name, dtype, shape, empty_cache=False):
+        self._maybe_drain_requests()
         return self.llm_engine.collective_rpc("update_weight", args=(name, dtype, shape, empty_cache))

     def update_weight_cuda_ipc(self, name, dtype, shape, ipc_handles, empty_cache=False):
+        self._maybe_drain_requests()
         return self.llm_engine.collective_rpc(
             "update_weight_cuda_ipc", args=(name, dtype, shape, ipc_handles, empty_cache)
         )
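
`_maybe_drain_requests` makes the synchronization contract explicit: with `inflight_updates=False`, a weight update blocks until the engine has no unfinished requests and no pending tool calls; with `inflight_updates=True` it returns immediately and weights swap under live requests. Condensed control flow (a sketch; `pending_tool_count` and `unfinished_request_count` are illustrative stand-ins for the tracking dict and engine queries above):

```python
import time

def update_weight_flow(actor, *rpc_args):
    if not actor.inflight_updates:
        # Block until idle so no request sees weights change mid-generation.
        while actor.pending_tool_count() or actor.unfinished_request_count():
            time.sleep(0.1)
    return actor.collective_rpc("update_weight", args=rpc_args)
```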
@@ -888,8 +857,15 @@ def wake_up(self, tags: Optional[list[str]] = None):
         self.llm_engine.wake_up(tags)

     def ready(self):
+        self._threads_started.wait(timeout=30)
         return True

+    def check_background_threads(self):
+        if self._prefetch_future.done():
+            self._prefetch_future.result()
+        if self._process_future.done():
+            self._process_future.result()
+
     def get_kv_cache_info(self):
         """Get KV cache max concurrency from the vLLM engine."""
         kv_cache_specs = self.llm_engine.model_executor.get_kv_cache_specs()
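
`Future.done()` is non-blocking, and `Future.result()` on a finished future re-raises whatever exception killed the worker, so a silently dead loop becomes a visible error in whichever actor call runs this check. The `concurrent.futures` behavior in isolation:

```python
from concurrent import futures

def boom():
    raise RuntimeError("worker crashed")

pool = futures.ThreadPoolExecutor(max_workers=1)
future = pool.submit(boom)
futures.wait([future])  # demo only; the actor polls done() without blocking

if future.done():
    future.result()  # re-raises RuntimeError("worker crashed") in the caller
```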
@@ -954,7 +930,6 @@ def create_vllm_engines(
     vllm_gpu_memory_utilization: float = 0.9,
     single_gpu_mode: bool = False,
     pg: Optional[ray.util.placement_group] = None,
-    vllm_enable_sleep=False,
     tools: Optional[Dict[str, Tool]] = None,
     max_tool_calls: List[int] = [5],
     prompt_queue=None,
@@ -1037,7 +1012,6 @@ def create_vllm_engines(
                 gpu_memory_utilization=vllm_gpu_memory_utilization,
                 bundle_indices=bundle_indices,
                 num_gpus=0.2 if use_hybrid_engine else 1,
-                enable_sleep_mode=vllm_enable_sleep,
                 noset_visible_devices=ray_noset_visible_devices(),
                 prompt_queue=prompt_queue,
                 results_queue=results_queue,
@@ -1053,83 +1027,6 @@ def create_vllm_engines(
             )
         )

-    # Verify engines initialized successfully
-    try:
-        ray_get_with_progress(
-            [engine.ready.remote() for engine in vllm_engines], "Initializing vLLM engines", timeout=300
-        )
-    except TimeoutError as e:
-        logger.error(f"vLLM engines failed to initialize: {e}")
-        # Kill partially initialized actors before raising
-        for engine in vllm_engines:
-            ray.kill(engine)
-        raise RuntimeError(f"vLLM engine initialization timed out: {e}")
-
-    if vllm_enable_sleep:
-        batch_vllm_engine_call(vllm_engines, "sleep", rank_0_only=False)
+    ray_get_with_progress([engine.ready.remote() for engine in vllm_engines], "Initializing vLLM engines", timeout=300)

     return vllm_engines
-
-
-def batch_vllm_engine_call(engines: List[Any], method_name: str, *args, rank_0_only: bool = True, **kwargs):
-    """
-    Batch call a method on multiple vLLM engines.
-    Args:
-        engines: List of vLLM engine instances
-        method_name: Name of the method to call
-        rank_0_only: Only execute on rank 0 if True
-        *args: Positional arguments to pass to the method
-        **kwargs: Keyword arguments to pass to the method
-    Returns:
-        List of results from ray.get() if on rank 0, None otherwise
-    """
-    import torch
-
-    if rank_0_only and torch.distributed.get_rank() != 0:
-        return None
-
-    refs = []
-    for engine in engines:
-        method = getattr(engine, method_name)
-        refs.append(method.remote(*args, **kwargs))
-
-    return ray.get(refs)
-
-
-if __name__ == "__main__":
-    num_engines = 1
-    tensor_parallel_size = 1
-    world_size = num_engines * tensor_parallel_size + 1
-    vllm_engines = create_vllm_engines(
-        num_engines=num_engines,
-        tensor_parallel_size=tensor_parallel_size,
-        enforce_eager=True,
-        pretrain="facebook/opt-125m",
-        revision="main",
-        seed=42,
-        enable_prefix_caching=False,
-        max_model_len=1024,
-    )
-    llm = vllm_engines[0]
-    from vllm.utils import get_ip, get_open_port
-
-    master_address = get_ip()
-    master_port = get_open_port()
-    backend = "gloo"
-
-    refs = [
-        engine.init_process_group.remote(
-            master_address, master_port, i * tensor_parallel_size + 1, world_size, "openrlhf", backend=backend
-        )
-        for i, engine in enumerate(vllm_engines)
-    ]
-    model_update_group = init_process_group(
-        backend=backend,
-        init_method=f"tcp://{master_address}:{master_port}",
-        world_size=world_size,
-        rank=0,
-        group_name="openrlhf",
-    )
-    ray.get(refs)
-    output = ray.get(llm.generate.remote("San Franciso is a"))
-    logger.info(f"output: {output}")
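
One more note on the `ready()` handshake the simplified initialization call relies on: `threading.Event.wait(timeout=30)` returns `False` on timeout, and the PR discards that return value, so `ready()` reports success even if the background threads never started. A stricter variant (a sketch, not what the PR does) would propagate the failure:

```python
def ready(self):
    # Event.wait() returns False when the timeout elapses; raising here turns
    # a hung thread startup into a visible initialization error at the caller.
    if not self._threads_started.wait(timeout=30):
        raise RuntimeError("background threads failed to start within 30s")
    return True
```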