
Commit 533d2a1

rkooo567 and SangBin Cho authored
[Typing] Mypy typing part 2 (vllm-project#4043)
Co-authored-by: SangBin Cho <[email protected]>
1 parent a532225 commit 533d2a1

20 files changed  (+180 −126 lines changed)

Diff for: .github/workflows/mypy.yaml  (+4 −4)

@@ -41,10 +41,10 @@ jobs:
         mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
         mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml

+        mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
         # TODO(sang): Follow up
-        # mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-        # mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-        # mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
-        # mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
         # mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml

Diff for: format.sh  (+4 −4)

@@ -104,10 +104,10 @@ mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
 mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml

 # TODO(sang): Follow up
-# mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
 # mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
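
Note on the two config changes above: both the CI workflow and format.sh now type-check vllm/engine, vllm/worker, vllm/spec_decode, and vllm/model_executor with --follow-imports=skip against pyproject.toml, so each package is checked in isolation (the spec_decoding path in the old commented-out lines was a typo). A minimal sketch of driving the same per-package checks from Python through mypy's programmatic API; the helper and the package list are illustrative assumptions, not part of this commit:

# Hypothetical helper (not in vLLM): run the newly enabled per-package
# checks with the same flags the workflow and format.sh use.
# Assumes mypy is installed and the script runs from the repo root.
from mypy import api  # mypy.api.run returns (stdout, stderr, exit_status)

PACKAGES = ["vllm/engine", "vllm/worker", "vllm/spec_decode", "vllm/model_executor"]

for package in PACKAGES:
    stdout, stderr, status = api.run(
        [package, "--follow-imports=skip", "--config-file", "pyproject.toml"])
    print(f"{package}: {'ok' if status == 0 else 'errors'}")
    if status != 0:
        print(stdout)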

Diff for: vllm/engine/async_llm_engine.py  (+25 −19)

@@ -2,8 +2,8 @@
 import os
 import time
 from functools import partial
-from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
-                    Set, Tuple, Type, Union)
+from typing import (Any, AsyncIterator, Callable, Dict, Iterable, List,
+                    Optional, Set, Tuple, Type, Union)

 from transformers import PreTrainedTokenizer

@@ -52,7 +52,7 @@ class AsyncStream:

     def __init__(self, request_id: str) -> None:
         self.request_id = request_id
-        self._queue = asyncio.Queue()
+        self._queue: asyncio.Queue = asyncio.Queue()
         self._finished = False

     def put(self, item: Union[RequestOutput, Exception]) -> None:
@@ -312,15 +312,17 @@ def __init__(self,
         self.max_log_len = max_log_len
         self.engine = self._init_engine(*args, **kwargs)

-        self.background_loop = None
+        self.background_loop: Optional[asyncio.Future] = None
         # We need to keep a reference to unshielded
         # task as well to prevent it from being garbage
         # collected
-        self._background_loop_unshielded = None
+        self._background_loop_unshielded: Optional[asyncio.Task[Any]] = None
         self.start_engine_loop = start_engine_loop
-        self._request_tracker: Optional[RequestTracker] = None
         self._errored_with: Optional[BaseException] = None

+        # Lazy initialized fields
+        self._request_tracker: RequestTracker
+
     @classmethod
     def from_engine_args(
         cls,
@@ -361,11 +363,13 @@ def from_engine_args(
     @property
     def is_running(self) -> bool:
         return (self.background_loop is not None
+                and self._background_loop_unshielded is not None
                 and not self._background_loop_unshielded.done())

     @property
     def is_stopped(self) -> bool:
-        return self.errored or (self.background_loop is not None
+        return self.errored or (self.background_loop is not None and
+                                self._background_loop_unshielded is not None
                                 and self._background_loop_unshielded.done())

     @property
@@ -381,7 +385,7 @@ def _error_callback(self, exc: Exception) -> None:

     async def get_tokenizer(self) -> "PreTrainedTokenizer":
         if self.engine_use_ray:
-            return await self.engine.get_tokenizer.remote()
+            return await self.engine.get_tokenizer.remote()  # type: ignore
         else:
             return self.engine.get_tokenizer()

@@ -434,7 +438,8 @@ async def engine_step(self) -> bool:
             # TODO: Maybe add add_request_batch to reduce Ray overhead
             try:
                 if self.engine_use_ray:
-                    await self.engine.add_request.remote(**new_request)
+                    await self.engine.add_request.remote(  # type: ignore
+                        **new_request)
                 else:
                     await self.engine.add_request_async(**new_request)
             except ValueError as e:
@@ -449,7 +454,7 @@ async def engine_step(self) -> bool:
             await self._engine_abort(finished_requests)

         if self.engine_use_ray:
-            request_outputs = await self.engine.step.remote()
+            request_outputs = await self.engine.step.remote()  # type: ignore
         else:
             request_outputs = await self.engine.step_async()

@@ -462,7 +467,7 @@ async def engine_step(self) -> bool:

     async def _engine_abort(self, request_ids: Iterable[str]):
         if self.engine_use_ray:
-            await self.engine.abort_request.remote(request_ids)
+            await self.engine.abort_request.remote(request_ids)  # type: ignore
         else:
             self.engine.abort_request(request_ids)

@@ -525,11 +530,12 @@ async def add_request(
             arrival_time = time.time()

         if self.engine_use_ray:
-            prompt_token_ids = await self.engine.encode_request_async.remote(
-                request_id=request_id,
-                prompt=prompt,
-                prompt_token_ids=prompt_token_ids,
-                lora_request=lora_request)
+            prompt_token_ids = await (
+                self.engine.encode_request_async.remote(  # type: ignore
+                    request_id=request_id,
+                    prompt=prompt,
+                    prompt_token_ids=prompt_token_ids,
+                    lora_request=lora_request))
         else:
             prompt_token_ids = await self.engine.encode_request_async(
                 request_id=request_id,
@@ -676,13 +682,13 @@ def _abort(self, request_id: str) -> None:
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
         if self.engine_use_ray:
-            return await self.engine.get_model_config.remote()
+            return await self.engine.get_model_config.remote()  # type: ignore
         else:
             return self.engine.get_model_config()

     async def do_log_stats(self) -> None:
         if self.engine_use_ray:
-            await self.engine.do_log_stats.remote()
+            await self.engine.do_log_stats.remote()  # type: ignore
         else:
             self.engine.do_log_stats()

@@ -695,7 +701,7 @@ async def check_health(self) -> None:

         if self.engine_use_ray:
             try:
-                await self.engine.check_health.remote()
+                await self.engine.check_health.remote()  # type: ignore
             except ray.exceptions.RayActorError as e:
                 raise RuntimeError("Engine is dead.") from e
         else:
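
The engine changes above follow three recurring patterns: Optional attributes get explicit annotations, lazily created attributes (such as _request_tracker) are declared with a bare annotation and assigned later, and properties add extra `is not None` checks so mypy can narrow the Optionals before dereferencing them; the `# type: ignore` comments are confined to Ray's dynamically generated .remote() attributes. A stand-alone sketch of those patterns, with toy class names that are not vLLM's:

# Toy sketch: annotate Optional attributes, declare lazily initialized
# ones without a value, and narrow Optionals before dereferencing them.
import asyncio
from typing import Any, Optional


class RequestTracker:
    pass


class ToyAsyncEngine:

    def __init__(self) -> None:
        self.background_loop: Optional[asyncio.Future] = None
        self._background_loop_unshielded: Optional[asyncio.Task[Any]] = None
        # Lazy initialized field: the annotation alone tells mypy the type;
        # start_background_loop() assigns it before any use.
        self._request_tracker: RequestTracker

    def start_background_loop(self) -> None:
        self._request_tracker = RequestTracker()

    @property
    def is_running(self) -> bool:
        # Both "is not None" checks are needed so mypy can narrow the
        # Optional types before .done() is called.
        return (self.background_loop is not None
                and self._background_loop_unshielded is not None
                and not self._background_loop_unshielded.done())


print(ToyAsyncEngine().is_running)  # False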

Diff for: vllm/lora/worker_manager.py  (+2 −2)

@@ -107,12 +107,12 @@ def create_lora_manager(
         self._lora_manager: LoRAModelManager = lora_manager
         return lora_manager.model

-    def set_active_loras(self, lora_requests: List[LoRARequest],
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
                          lora_mapping: LoRAMapping) -> None:
         self._apply_loras(lora_requests)
         self._lora_manager.set_lora_mapping(lora_mapping)

-    def _apply_loras(self, lora_requests: List[LoRARequest]) -> None:
+    def _apply_loras(self, lora_requests: Set[LoRARequest]) -> None:
         loras_that_exist = self.list_loras()
         loras_map = {
             lora_request.lora_int_id: lora_request
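
The signature change above tightens the parameter to Set[LoRARequest], matching callers that pass a deduplicated set of requests. A small sketch of the same idea with a hypothetical stand-in request type:

# Toy sketch (ToyLoRARequest is a stand-in): requests are hashable and
# deduplicated by id, and mypy rejects a list argument where Set is declared.
from dataclasses import dataclass
from typing import Dict, Set


@dataclass(frozen=True)
class ToyLoRARequest:
    lora_int_id: int


def apply_loras(lora_requests: Set[ToyLoRARequest]) -> Dict[int, ToyLoRARequest]:
    return {request.lora_int_id: request for request in lora_requests}


# Duplicates collapse in the set; two entries remain.
print(apply_loras({ToyLoRARequest(1), ToyLoRARequest(1), ToyLoRARequest(2)}))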

Diff for: vllm/model_executor/guided_decoding/outlines_decoding.py  (+2 −2)

@@ -55,7 +55,7 @@ class GuidedDecodingMode(Enum):

 async def get_outlines_guided_decoding_logits_processor(
         request: Union[CompletionRequest, ChatCompletionRequest],
-        tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor]:
+        tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]:
     """
     Given an OpenAI-compatible request, check for guided decoding parameters
     and get the necessary logits processor for the given guide.
@@ -84,7 +84,7 @@ async def get_outlines_guided_decoding_logits_processor(

 def _get_guide_and_mode(
     request: Union[CompletionRequest, ChatCompletionRequest]
-) -> Tuple[str, GuidedDecodingMode]:
+) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:

     if request.guided_json:
         json = request.guided_json
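
The widened return annotations above make the "nothing to guide" case explicit: the processor function may return None and the helper may return (None, None), so callers must handle both outcomes. A minimal sketch with a toy request type rather than the OpenAI request models:

# Toy sketch: a helper that may find no guide returns Tuple[None, None],
# which forces callers (under mypy) to handle the None branch.
from typing import Dict, Tuple, Union


def get_guide_and_mode(
        request: Dict[str, str]) -> Union[Tuple[str, str], Tuple[None, None]]:
    if "guided_json" in request:
        return request["guided_json"], "json"
    if "guided_regex" in request:
        return request["guided_regex"], "regex"
    return None, None


guide, mode = get_guide_and_mode({"guided_regex": r"\d+"})
if guide is None:
    print("no guided decoding requested")
else:
    print(f"guide={guide!r}, mode={mode}")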

Diff for: vllm/model_executor/guided_decoding/outlines_logits_processors.py  (+5 −1)

@@ -21,14 +21,18 @@
 from typing import Callable, DefaultDict, Dict, List, Optional, Union

 import torch
-from outlines.fsm.fsm import CFGFSM, RegexFSM
+from outlines.fsm.fsm import CFGFSM, FSM, RegexFSM
 from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase


 class BaseLogitsProcessor:

+    def __init__(self):
+        # Child class should use initialize in their init.
+        self.fsm: FSM
+
     def init_state(self):
         """Initialize the FSM states."""
         self.fsm_state: DefaultDict[int, int] = defaultdict(int)
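
The new BaseLogitsProcessor.__init__ above declares self.fsm: FSM without assigning it, so mypy knows the attribute's type while each subclass remains responsible for creating the concrete FSM. A toy sketch of the same base-class pattern, using stand-in FSM classes rather than outlines':

# Toy sketch: the base annotates self.fsm, each child assigns a concrete
# FSM, and mypy checks every use against the declared base type.
from typing import List


class FSM:  # stand-in for outlines' FSM base class
    def allowed_token_ids(self, state: int) -> List[int]:
        return []


class RegexFSM(FSM):
    pass


class BaseLogitsProcessor:

    def __init__(self) -> None:
        # Child class should assign self.fsm in its own __init__.
        self.fsm: FSM


class RegexLogitsProcessor(BaseLogitsProcessor):

    def __init__(self) -> None:
        super().__init__()
        self.fsm = RegexFSM()  # type-checked against the declared FSM


print(RegexLogitsProcessor().fsm.allowed_token_ids(0))  # []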

Diff for: vllm/model_executor/model_loader/neuron.py  (+9 −7)

@@ -1,7 +1,7 @@
 """Utilities for selecting and loading neuron models."""
 import importlib
 import os
-from typing import Optional, Type
+from typing import Dict, Optional, Tuple

 import torch
 import torch.nn as nn
@@ -27,7 +27,7 @@
 }

 # Models supported by Neuron.
-_NEURON_SUPPORTED_MODELS = {
+_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = {
     "LlamaForCausalLM": ("transformers_neuronx.llama.model",
                          "LlamaForSampling", "LlamaForCausalLM"),
     "MistralForCausalLM": ("transformers_neuronx.mistral.model",
@@ -43,11 +43,13 @@ def __init__(
     ) -> None:
         super().__init__()
         self.config = config
-        self.model = None
         self.logits_processor = LogitsProcessor(config.vocab_size,
                                                 logits_as_input=True)
         self.sampler = Sampler()

+        # Lazy initialized
+        self.model: nn.Module
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -74,17 +76,17 @@ def sample(

     def load_weights(self, model_name_or_path: str, **kwargs):
         arch = _get_model_architecture(self.config)
-        neuronx_module_path, neuronx_model_cls, hf_model_cls = (
+        neuronx_module_path, neuronx_model_cls_name, hf_model_cls_name = (
             _NEURON_SUPPORTED_MODELS[arch])
         neuronx_module = importlib.import_module(neuronx_module_path)
-        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls)
+        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)

         split_model_dir = f"{model_name_or_path}-split"
         if os.path.isdir(os.path.join(model_name_or_path,
                                       "pytorch_model.bin")):
             split_model_dir = model_name_or_path
         elif not os.path.exists(f"{model_name_or_path}-split"):
-            hf_model_cls = getattr(transformers, hf_model_cls)
+            hf_model_cls = getattr(transformers, hf_model_cls_name)
             from transformers_neuronx.module import save_pretrained_split

             hf_model = hf_model_cls.from_pretrained(model_name_or_path,
@@ -96,7 +98,7 @@ def load_weights(self, model_name_or_path: str, **kwargs):
         self.model.to_neuron()


-def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+def _get_model_architecture(config: PretrainedConfig) -> str:
     architectures = getattr(config, "architectures", [])
     for arch in architectures:
         if arch in _NEURON_SUPPORTED_MODELS:
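
Besides annotating the registry and the lazily built self.model, the neuron loader above renames the strings looked up from _NEURON_SUPPORTED_MODELS (…_cls_name) so they are never reassigned to the classes they resolve to, keeping one type per variable. A sketch of that renaming with an illustrative registry entry pointing at the standard library instead of transformers_neuronx:

# Illustrative sketch: keep the registry string and the class resolved
# from it in separately named variables so each keeps a single type.
import importlib
from typing import Dict, Tuple

_SUPPORTED_MODELS: Dict[str, Tuple[str, str]] = {
    "OrderedDictModel": ("collections", "OrderedDict"),  # (module, class name)
}


def resolve(arch: str) -> type:
    module_path, model_cls_name = _SUPPORTED_MODELS[arch]  # both are str
    module = importlib.import_module(module_path)
    model_cls = getattr(module, model_cls_name)  # a new name for the class
    return model_cls


print(resolve("OrderedDictModel"))  # <class 'collections.OrderedDict'>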

Diff for: vllm/model_executor/model_loader/tensorizer.py  (+1)

@@ -167,6 +167,7 @@ def __post_init__(self):
             decryption_params = DecryptionParams.from_key(key)
             self.deserializer_params['encryption'] = decryption_params

+    @staticmethod
     def add_cli_args(
             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         """Tensorizer CLI arguments"""

Diff for: vllm/model_executor/sampling_metadata.py  (+4)

@@ -113,6 +113,8 @@ def from_sampling_metadata(
                                      get_num_triton_sampler_splits(vocab_size))

         sample_indices_start_idx = 0
+        assert sampling_metadata.seq_groups is not None
+        assert sampling_metadata.seq_data is not None
         for i, seq_group in enumerate(sampling_metadata.seq_groups):
             seq_ids, sampling_params = seq_group
             temperature = sampling_params.temperature
@@ -147,6 +149,7 @@ def from_sampling_metadata(
                     and sampling_params.prompt_logprobs is not None):
                 # For tokens in the prompt that we only need to get
                 # their logprobs
+                assert sampling_metadata.prompt_lens is not None
                 prompt_len = sampling_metadata.prompt_lens[i]
                 temperatures += [temperature] * (prompt_len - 1)
                 top_ps += [top_p] * (prompt_len - 1)
@@ -172,6 +175,7 @@ def from_sampling_metadata(
             is_prompt = i < sampling_metadata.num_prompts
             if is_prompt:
                 prompt_best_of.append(sampling_params.best_of)
+                assert sampling_metadata.prompt_lens is not None
                 prompt_len = sampling_metadata.prompt_lens[i]

                 if sampling_params.prompt_logprobs is not None:
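
The asserts added above narrow Optional fields (seq_groups, seq_data, prompt_lens) to their concrete types for the rest of the block, and double as runtime checks of the caller's precondition. A minimal sketch with a toy metadata class:

# Toy sketch: asserting an Optional field is not None lets mypy treat it
# as the concrete type afterwards, and fails loudly if the precondition
# was violated.
from typing import List, Optional


class ToySamplingMetadata:
    def __init__(self, prompt_lens: Optional[List[int]] = None) -> None:
        self.prompt_lens = prompt_lens


def total_prompt_tokens(metadata: ToySamplingMetadata) -> int:
    assert metadata.prompt_lens is not None, "prompt_lens required here"
    # mypy now sees List[int] instead of Optional[List[int]].
    return sum(metadata.prompt_lens)


print(total_prompt_tokens(ToySamplingMetadata([3, 5, 2])))  # 10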

Diff for: vllm/spec_decode/batch_expansion.py  (+3 −3)

@@ -106,7 +106,7 @@ def score_proposals(
     def _expand_batch(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-        proposal_token_ids_list: List[TokenId],
+        proposal_token_ids_list: List[List[TokenId]],
         proposal_lens_list: List[int],
     ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]:
         """Given the input sequences and potentially multiple corresponding
@@ -218,7 +218,7 @@ def _create_scoring_model_input(
     def _create_target_seq_group_metadata(
         self,
         input_seq_group_metadata: SequenceGroupMetadata,
-        proposal_token_ids: List[TokenId],  # shape: [batch_size, k]
+        proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
         batch_index: int,
         target_seq_ids_iter: Iterator[TargetSeqId],
     ) -> List[SequenceGroupMetadata]:
@@ -360,7 +360,7 @@ def _get_token_ids_to_score(
         [0, 1, 2]
         [0, 1, 2, 3]
         """
-        empty_token_ids = []
+        empty_token_ids: List[TokenId] = []

         token_ids_to_score = [empty_token_ids]
         token_ids_to_score.extend([
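
The last hunk above annotates an empty list, since mypy cannot infer an element type for []. A short sketch of the same fix; the TokenId alias here mirrors the int alias used in spec_decode:

# Short sketch: an empty list has no inferable element type, so it is
# annotated explicitly before being nested and extended with token ids.
from typing import List

TokenId = int  # assumed alias for illustration

empty_token_ids: List[TokenId] = []  # without the annotation, mypy asks for one

token_ids_to_score: List[List[TokenId]] = [empty_token_ids]
token_ids_to_score.extend([[0], [0, 1], [0, 1, 2]])
print(token_ids_to_score)  # [[], [0], [0, 1], [0, 1, 2]]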

Diff for: vllm/spec_decode/interfaces.py  (+2 −2)

@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional

 import torch

@@ -73,5 +73,5 @@ def score_proposals(
         blocks_to_copy: Optional[Dict[int, List[int]]],
         k: int,
         proposals: SpeculativeProposals,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> SpeculativeScores:
         raise NotImplementedError

Diff for: vllm/spec_decode/metrics.py  (+1)

@@ -112,6 +112,7 @@ def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:

         Returns a CUDA event recording when the copy is complete.
         """
+        assert self._copy_stream is not None
         self._copy_stream.wait_stream(torch.cuda.current_stream())

         with torch.cuda.stream(self._copy_stream):
