Merged
Changes from 165 commits

Commits (180)
7ee9b05
old benchmarking files added
Sep 22, 2025
dc20c8b
benchmark_runner updated
tmfreiberg Sep 23, 2025
366bb4f
breadth sweep onnx models go in breadth directory
tmfreiberg Sep 23, 2025
d63e9cf
prettifying benchmark commands
tmfreiberg Sep 23, 2025
c4cb632
fixes
tmfreiberg Sep 23, 2025
fe72efd
fix
tmfreiberg Sep 23, 2025
89c8957
eye candy added
tmfreiberg Sep 23, 2025
8704e73
eye candy edits
tmfreiberg Sep 23, 2025
f1a316c
refactor benchmark_runner
tmfreiberg Sep 23, 2025
88675af
refactor gen_and_bench
tmfreiberg Sep 23, 2025
a735a5d
refactor benchmarking_helpers
tmfreiberg Sep 23, 2025
1b27530
CLI jstprove -> jst part one
tmfreiberg Sep 23, 2025
7d7d393
CLI jstprove -> jst part two (documentation)
tmfreiberg Sep 23, 2025
539eae4
capture ecc output
tmfreiberg Sep 23, 2025
6a93c99
capture ecc output
tmfreiberg Sep 23, 2025
a6103ca
capture ecc output
tmfreiberg Sep 23, 2025
85e147b
summary card formatting
tmfreiberg Sep 23, 2025
1582015
summary card formatting
tmfreiberg Sep 23, 2025
4ead1fd
adjusting default depth parameters
tmfreiberg Sep 24, 2025
9aa47e0
adding fixed benchmarking model (lenet)
tmfreiberg Sep 26, 2025
d5a4b47
benchmarking md added; --summarize flag removed from cli
tmfreiberg Sep 26, 2025
5dc5357
added jsonl to gitignore
tmfreiberg Sep 26, 2025
c55d523
jst bench lenet fix
tmfreiberg Sep 29, 2025
a807c90
lenet bench fix 2
tmfreiberg Sep 29, 2025
c962831
lenet bench fix 3
tmfreiberg Sep 29, 2025
067dd0c
jst bench lenet fix continued
tmfreiberg Sep 30, 2025
ad92160
lenet_fixed jsonl -> lenet jsonl
tmfreiberg Sep 30, 2025
988d936
Merge branch 'main' into benchmarking
jsgold-1 Sep 30, 2025
32c00ba
removed legacy lazy gen_and_bench import from cli
Sep 30, 2025
738bdb5
Linting/formatting
jsgold-1 Oct 1, 2025
b18e9f9
fixed text formatting errors arising from linter conformity
Oct 6, 2025
aef5e0a
Add other model choice to benchmarking CLI (#68)
jsgold-1 Oct 7, 2025
4d3cd41
Refactor cli and merge with main
jsgold-1 Oct 8, 2025
5da3877
Slight messaging chang
jsgold-1 Oct 8, 2025
5453012
Refactor bench, move helper functinos, add decorator for optional pat…
HudsonGraeme Oct 9, 2025
5d23561
Avoid top-level import from model registry
HudsonGraeme Oct 9, 2025
dd5d45a
Add constants, update ensure parent dir
HudsonGraeme Oct 9, 2025
00e9252
Minor docs changes
jsgold-1 Oct 9, 2025
2917ba5
Bring in tests from quantizer_tests and linting/formatting
jsgold-1 Oct 10, 2025
aa75b91
Merge branch 'main' into single_layer_tests
jsgold-1 Oct 10, 2025
458df51
Add base fix
jsgold-1 Oct 10, 2025
6859cfe
Minor test improvements
jsgold-1 Oct 11, 2025
cdbc74f
Add quantization to original layer comp checks
jsgold-1 Oct 11, 2025
bfa988a
Refactor layer_tests
jsgold-1 Oct 14, 2025
c2fd8dc
Add e2e tests + scalability, ensuring each layer has an e2e test
jsgold-1 Oct 15, 2025
2796e05
Fix maxpool
jsgold-1 Oct 15, 2025
1705a09
Change absence of --model flag in pytest e2e run
jsgold-1 Oct 16, 2025
c42a694
Add tests
jsgold-1 Oct 17, 2025
7047160
adding max layer
tmfreiberg Oct 24, 2025
f3f241d
singular API fix
tmfreiberg Oct 24, 2025
620768f
as type errors addressed
tmfreiberg Oct 24, 2025
6599c98
more errors
tmfreiberg Oct 24, 2025
16147f2
more errors
tmfreiberg Oct 24, 2025
0472503
more errors
tmfreiberg Oct 24, 2025
1861947
more errors
tmfreiberg Oct 24, 2025
624ad37
more errors
tmfreiberg Oct 24, 2025
e460f31
more errors
tmfreiberg Oct 24, 2025
b309068
more errors
tmfreiberg Oct 24, 2025
73839df
more errors
tmfreiberg Oct 24, 2025
1f7089c
more errors
tmfreiberg Oct 24, 2025
0ab5745
more errors
tmfreiberg Oct 24, 2025
cb2e4ef
more errors
tmfreiberg Oct 24, 2025
cc98084
more errors
tmfreiberg Oct 24, 2025
c00d2fd
Quantization refactoring
jsgold-1 Oct 28, 2025
d58e6d4
Rework w and b loading
jsgold-1 Oct 31, 2025
6cff583
Fix multi inputs, quantization refactor and add Add layer
jsgold-1 Nov 4, 2025
bf6f6b7
Add multi-input layer support
jsgold-1 Nov 6, 2025
0a7c0f4
Finalize support for multi-inputs/outputs and add add with initialize…
jsgold-1 Nov 11, 2025
4a9cd8f
Broadcasting and scalar support for Add
jsgold-1 Nov 11, 2025
e315b8b
Docs update
jsgold-1 Nov 11, 2025
2549278
Docs update
jsgold-1 Nov 11, 2025
1c81c24
starting simple with no quantization for max layer
tmfreiberg Nov 11, 2025
f695e58
forgot imports in max.py
tmfreiberg Nov 11, 2025
3e042cf
Merge with main changes
jsgold-1 Nov 11, 2025
d5b3a6f
Merge changes from main/single_layer_tests
jsgold-1 Nov 11, 2025
becdc36
Merge branch 'quantization_refactor' into single_layer_tests_v2
tmfreiberg Nov 11, 2025
41d9673
fix post test_end_to_end_quantization_accuracy update
tmfreiberg Nov 11, 2025
55e8e28
First review fixes and testing update
jsgold-1 Nov 12, 2025
8889a45
Secondary review changes
jsgold-1 Nov 12, 2025
1bf57d3
missing scale error
tmfreiberg Nov 12, 2025
afcbe3e
whoops
tmfreiberg Nov 12, 2025
990e568
mirror add
tmfreiberg Nov 12, 2025
743feaf
min copy max
tmfreiberg Nov 12, 2025
9dfc1f5
More review updates
jsgold-1 Nov 12, 2025
67001f9
min tweak consistent with max
tmfreiberg Nov 12, 2025
b0aaa05
trying clip layer
tmfreiberg Nov 12, 2025
6ce605e
fix import location
tmfreiberg Nov 12, 2025
83ba2d6
Fix review comments
jsgold-1 Nov 12, 2025
362d021
Minor code review changes
jsgold-1 Nov 12, 2025
9b67e84
Merge quantization_refactor into single_layer_tests_v2
tmfreiberg Nov 13, 2025
ef3a601
max layer rust side
tmfreiberg Nov 13, 2025
0d36064
import MaxLayer in layer_kinds.rs
tmfreiberg Nov 13, 2025
459191d
whoops MaxLayer lives in max no maxpool
tmfreiberg Nov 13, 2025
7fd5cd2
finishing min layer on rust side
tmfreiberg Nov 13, 2025
090cbeb
refactor max, min, maxpool, core_math
tmfreiberg Nov 13, 2025
578c9c5
remove unused imports in maxpool.rs
tmfreiberg Nov 13, 2025
2a2de4d
doc edits. max and min e2e work. clip not yet. about to refactor with…
tmfreiberg Nov 14, 2025
9669787
range check gadget added, refactor constrained_max, constrained_min (…
tmfreiberg Nov 14, 2025
f73104d
clip rust side
tmfreiberg Nov 14, 2025
1cb6340
rem_bits unused variable; prefix underscore
tmfreiberg Nov 14, 2025
9cfa4e1
UtilsError -> CircuitError in signature of range_check_pow2 function
tmfreiberg Nov 14, 2025
5341215
clip_config made proper
tmfreiberg Nov 14, 2025
d9a8aa1
forgot to register Clip in onnx_op_quantizer
tmfreiberg Nov 14, 2025
b0e6ebc
clip config update address scalar tensor versus non scalar shapes issue
tmfreiberg Nov 14, 2025
e2708d9
get_test_specs in clip_config updated
tmfreiberg Nov 14, 2025
0fe4450
type mismatch float/int clip addressed
tmfreiberg Nov 17, 2025
2f0386c
address empty input shape problem in clip_config
tmfreiberg Nov 17, 2025
8a6beb4
rename MaxMinAssertionContext to ShiftRangeContext
tmfreiberg Nov 17, 2025
2ebad68
Merge branch 'main' into single_layer_tests_v2
tmfreiberg Nov 21, 2025
9dbb68c
Cleaned up redundant assignments, fixed docstring, updated tests
tmfreiberg Nov 21, 2025
ba2ad5e
addressing errors
tmfreiberg Nov 21, 2025
043eef6
Code Rabbit nits
tmfreiberg Nov 21, 2025
b7641d5
Address ruff/clippy/pre-commit feedback for clip/max/min
tmfreiberg Nov 21, 2025
36748be
closing delimiters added (linter's fault!)
tmfreiberg Nov 21, 2025
df93713
logup added to core_math
tmfreiberg Nov 25, 2025
e7e8f52
added hints.rs
tmfreiberg Nov 25, 2025
8290b04
added logup code to main_runner
tmfreiberg Nov 25, 2025
a02ad22
remove EMptyHintCaller import in main_runner
tmfreiberg Nov 25, 2025
c16ee61
added CircuitField import to main_runner
tmfreiberg Nov 25, 2025
2414c09
one-shot logup step 1 core_math
tmfreiberg Nov 25, 2025
9e3836b
one-shot logup step 2
tmfreiberg Nov 25, 2025
80b7acc
corrections after step 2
tmfreiberg Nov 25, 2025
26a5acf
extending logup to all range checks (except in quantize)
tmfreiberg Nov 26, 2025
bef0797
closing delimiter in min.rs apply
tmfreiberg Nov 26, 2025
d021bd0
syntax error min.rs
tmfreiberg Nov 26, 2025
31bf84c
using logup for rescaling remainder check etc. in quantization
tmfreiberg Nov 26, 2025
0389ee0
debugging rescale
tmfreiberg Nov 26, 2025
e55c5ac
use LogUp for all range checks in quantization process
tmfreiberg Nov 26, 2025
5acbfe2
moving hints.rs
tmfreiberg Nov 28, 2025
7dc731f
refactor move hints
tmfreiberg Nov 28, 2025
41e07b1
move unconstrained_max
tmfreiberg Nov 28, 2025
945a985
move unconstrained_max
tmfreiberg Nov 28, 2025
65c760c
no layerkind in unconstrained_max, hints shouldn't know about ONNX
tmfreiberg Nov 28, 2025
a48aa43
whoops max_min_clip module imported twice in hints mod
tmfreiberg Nov 28, 2025
a7fb66a
moved unconstrained_min and unconstrained_clip to hints/max_min_clip
tmfreiberg Nov 28, 2025
9dd7307
forgot to import to core_math from hints
tmfreiberg Nov 28, 2025
17f862d
forgot to change layerkind error unconstrained_min
tmfreiberg Nov 28, 2025
7b2d6cc
renaming and moving bit operations
tmfreiberg Nov 28, 2025
8976596
fix imports
tmfreiberg Nov 28, 2025
4af9078
Field trait -> bits
tmfreiberg Nov 28, 2025
86504b6
field::FieldTrait
tmfreiberg Nov 28, 2025
f1d6a0b
fix FieldArith import issue
tmfreiberg Nov 28, 2025
99c2bb4
moving logup/range check functions from core_math to range_check
tmfreiberg Dec 3, 2025
8fd8c6f
fixing imports
tmfreiberg Dec 3, 2025
4df8d18
still fixing imports
tmfreiberg Dec 3, 2025
677eb7f
still fixing imports
tmfreiberg Dec 3, 2025
bd19d31
last remnants of core_math moved to max_min_clip
tmfreiberg Dec 3, 2025
06a7a04
address warning no unconstrained_clip
tmfreiberg Dec 3, 2025
51e9b58
relu layer uses logup
tmfreiberg Dec 3, 2025
3e26a1e
fix errors
tmfreiberg Dec 3, 2025
3fb2b71
gradually improving import organisation and docstrings
tmfreiberg Dec 4, 2025
c40c151
gadget and hint import organising and docstrings
tmfreiberg Dec 4, 2025
989265a
starting to standardize docstrings for gadgets
tmfreiberg Dec 4, 2025
6632adf
more docstring revision
tmfreiberg Dec 4, 2025
279ab3e
docstrings, clippy
tmfreiberg Dec 5, 2025
bf1eae7
clip x_bc deref to variable
tmfreiberg Dec 5, 2025
2e1d83a
docstrings format
tmfreiberg Dec 5, 2025
8082e7b
time trailing whitespace/fix end of files
tmfreiberg Dec 5, 2025
532c680
Clean up lint issues and update poetry.lock
tmfreiberg Dec 5, 2025
e56e7a3
silence clippy on docstring nits
tmfreiberg Dec 5, 2025
d7ea66e
Merge remote-tracking branch 'origin/main' into logup_v2
tmfreiberg Dec 5, 2025
c8ae9cf
remove Cast handler registration
tmfreiberg Dec 5, 2025
a1a216d
put model_quant.onnx under tmp_path in out_path for quantized model i…
tmfreiberg Dec 5, 2025
148872f
updated empty_tensor test case
tmfreiberg Dec 5, 2025
f223b64
revised incomplete error message in bits.rs
tmfreiberg Dec 5, 2025
f4c5c31
fixed error message in unconstrained_min (copy-pasted from unconstrai…
tmfreiberg Dec 5, 2025
89e02ca
Verify bounds checking for layer.inputs access
tmfreiberg Dec 5, 2025
fb012db
forgot to save min.rs
tmfreiberg Dec 5, 2025
b476e19
Fix MaxQuantizer.__init__ to honor BaseOpQuantizer’s new_initializers…
tmfreiberg Dec 5, 2025
04fc017
added tmp_path: Path to signature of test_tiny_conv
tmfreiberg Dec 5, 2025
9a1acde
Specify float32 dtype for ONNX compatibility in initializer overrides
tmfreiberg Dec 5, 2025
fe06eda
address overflow concerns
tmfreiberg Dec 5, 2025
38e11bf
fix layerkind typo
tmfreiberg Dec 5, 2025
8f0f7c4
Remove poetry.lock from tracking
tmfreiberg Dec 5, 2025
b81502c
Apply dtype cast consistently to all initializer overrides.
tmfreiberg Dec 5, 2025
3028d4b
Testing fixes
jsgold-1 Dec 5, 2025
93a8246
Linting final touches
jsgold-1 Dec 10, 2025
61aa418
added initializer test to e2e tests
tmfreiberg Dec 11, 2025
f271075
added e2e tests for broadcasting
tmfreiberg Dec 11, 2025
1a9fafd
corrected off-by-one error
tmfreiberg Dec 11, 2025
6 changes: 3 additions & 3 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -3,7 +3,7 @@

## Related Issue
<!-- Link to related GitHub issue (e.g. "Fixes #123", "Addresses #456") -->
-
-

## Type of Change
<!-- Delete options that don't apply -->
@@ -24,8 +24,8 @@

## Deployment Notes
<!-- Special considerations for deployment (migrations, config changes, etc.) -->
-
-

## Additional Comments
<!-- Any other important context for reviewers -->
-
-
1,879 changes: 1,879 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

41 changes: 29 additions & 12 deletions python/core/circuits/base.py
@@ -4,13 +4,12 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any

from numpy import asarray, ndarray
import numpy as np

from python.core.utils.errors import ShapeMismatchError
from python.core.utils.witness_utils import compare_witness_to_io, load_witness

if TYPE_CHECKING:
import numpy as np
import torch

from python.core.circuits.errors import (
@@ -775,18 +774,18 @@ def _gen_witness_preprocessing(
def reshape_inputs_for_inference(
self: Circuit,
inputs: dict[str],
) -> ndarray | dict[str, ndarray]:
) -> np.ndarray | dict[str, np.ndarray]:
"""
Reshape input tensors to match the model's expected input shape.

Parameters
----------
inputs : dict[str] or ndarray
inputs : dict[str] or np.ndarray
Input tensors or a dictionary of tensors.

Returns
-------
ndarray or dict[str, ndarray]
np.ndarray or dict[str, np.ndarray]
Reshaped input(s) ready for inference.
"""

@@ -801,15 +800,33 @@ def reshape_inputs_for_inference(
if isinstance(inputs, dict):
if len(inputs) == 1:
only_key = next(iter(inputs))
inputs = asarray(inputs[only_key])
value = np.asarray(inputs[only_key])

# If shape is a dict, extract the shape for this key
if isinstance(shape, dict):
key_shape = shape.get(only_key, None)
if key_shape is None:
raise CircuitConfigurationError(
missing_attributes=[f"input_shape[{only_key!r}]"],
)
shape = key_shape

# From here on, treat it as a regular reshape
inputs = value
else:
return self._reshape_dict_inputs(inputs, shape)

# --- Regular reshape ---
if not isinstance(shape, (list, tuple)):
msg = (
f"Expected list or tuple shape for reshape, got {type(shape).__name__}"
)
raise CircuitInputError(msg)

try:
return asarray(inputs).reshape(shape)
return np.asarray(inputs).reshape(shape)
except Exception as e:
raise ShapeMismatchError(shape, list(asarray(inputs).shape)) from e
raise ShapeMismatchError(shape, list(np.asarray(inputs).shape)) from e

def _reshape_dict_inputs(
self: Circuit,
@@ -824,7 +841,7 @@ def _reshape_dict_inputs(
)
raise CircuitInputError(msg, parameter="shape", expected="dict")
for key, value in inputs.items():
tensor = asarray(value)
tensor = np.asarray(value)
try:
inputs[key] = tensor.reshape(shape[key])
except Exception as e:
@@ -867,16 +884,16 @@ def reshape_inputs_for_circuit(
value = inputs[key]

# --- handle unsupported input types BEFORE entering try ---
if not isinstance(value, (ndarray, list, tuple)):
if not isinstance(value, (np.ndarray, list, tuple)):
msg = f"Unsupported input type for key '{key}': {type(value).__name__}"
raise CircuitProcessingError(message=msg)

try:
# Convert to tensor, flatten, and back to list
if isinstance(value, ndarray):
if isinstance(value, np.ndarray):
flattened = value.flatten().tolist()
else:
flattened = asarray(value).flatten().tolist()
flattened = np.asarray(value).flatten().tolist()
except Exception as e:
msg = f"Failed to flatten input '{key}' (type {type(value).__name__})"
raise CircuitProcessingError(message=msg) from e
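For orientation, here is a minimal standalone sketch of the single-input dispatch this diff adds to `reshape_inputs_for_inference`: a one-entry input dict is unwrapped, a dict-valued shape is resolved by key, and control then falls through to the ordinary reshape. The `reshape_single_input` helper is hypothetical and not part of the `Circuit` class; error types are simplified.

```python
import numpy as np


def reshape_single_input(inputs: dict, shape) -> np.ndarray:
    """Sketch of the single-key dispatch: unwrap the only entry, then
    resolve a per-input shape dict before the plain reshape."""
    only_key = next(iter(inputs))
    value = np.asarray(inputs[only_key])

    if isinstance(shape, dict):
        key_shape = shape.get(only_key)
        if key_shape is None:
            # The real code raises CircuitConfigurationError here.
            raise KeyError(f"input_shape[{only_key!r}] missing")
        shape = key_shape

    if not isinstance(shape, (list, tuple)):
        raise TypeError(f"Expected list or tuple shape, got {type(shape).__name__}")

    return value.reshape(shape)


# Example: a single named input with a dict of per-input shapes.
batch = {"input": np.arange(12)}
print(reshape_single_input(batch, {"input": [1, 3, 4]}).shape)  # (1, 3, 4)
```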
3 changes: 1 addition & 2 deletions python/core/circuits/errors.py
@@ -1,4 +1,3 @@
# python/core/utils/exceptions.py
from __future__ import annotations

from python.core.utils.helper_functions import RunType
@@ -68,7 +67,7 @@ class CircuitInputError(CircuitError):
actual (any): Actual value encountered (optional).
"""

def __init__( # noqa: PLR0913
def __init__(
self: CircuitInputError,
message: str | None = None,
parameter: str | None = None,
6 changes: 3 additions & 3 deletions python/core/model_processing/converters/base.py
@@ -2,7 +2,7 @@

from abc import ABC, abstractmethod
from enum import Enum
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING

if TYPE_CHECKING:
import numpy as np
@@ -16,10 +16,10 @@ class ModelType(Enum):

ONNXLayerDict = dict[
str,
Union[int, str, list[str], dict[str, list[int]], Optional[list], Optional[dict]],
int | str | list[str] | dict[str, list[int]] | list | None | dict,
]

CircuitParamsDict = dict[str, Union[int, dict[str, bool]]]
CircuitParamsDict = dict[str, int | dict[str, bool]]


class ModelConverter(ABC):
8 changes: 4 additions & 4 deletions python/core/model_processing/onnx_custom_ops/__init__.py
@@ -1,16 +1,16 @@
import importlib
import pkgutil
import os
from pathlib import Path

# Get the package name of the current module
package_name = __name__

# Dynamically import all .py files in this package directory (except __init__.py)
package_dir = os.path.dirname(__file__)
package_dir = Path(__file__).parent

__all__ = []
__all__: list[str] = []

for _, module_name, is_pkg in pkgutil.iter_modules([package_dir]):
if not is_pkg and (module_name != "custom_helpers"):
importlib.import_module(f"{package_name}.{module_name}")
__all__.append(module_name)
__all__.append(str(module_name)) # noqa: PYI056
4 changes: 2 additions & 2 deletions python/core/model_processing/onnx_quantizer/exceptions.py
@@ -31,7 +31,7 @@ class InvalidParamError(QuantizationError):
quantization the quantization process.
"""

def __init__( # noqa: PLR0913
def __init__(
self: QuantizationError,
node_name: str,
op_type: str,
@@ -151,7 +151,7 @@ class InvalidConfigError(QuantizationError):
def __init__(
self: QuantizationError,
key: str,
value: str | float | bool | None,
value: str | float | bool | None, # noqa: FBT001
expected: str | None = None,
) -> None:
"""Initialize InvalidConfigError with context about the bad config.
34 changes: 34 additions & 0 deletions python/core/model_processing/onnx_quantizer/layers/base.py
@@ -418,6 +418,40 @@ def insert_scale_node(


class QuantizerBase:
"""
Shared mixin implementing the generic INT64 quantization pipeline.

IMPORTANT:
QuantizerBase is *not* a standalone quantizer. It must always be
combined with BaseOpQuantizer via multiple inheritance:

class FooQuantizer(BaseOpQuantizer, QuantizeFoo):
...

BaseOpQuantizer supplies required methods and attributes that
QuantizerBase relies on:
- add_scaled_initializer_inputs
- insert_scale_node
- get_scaling
- new_initializers (initializer buffer shared with converter)

If a subclass inherits QuantizerBase without BaseOpQuantizer,
QuantizerBase.quantize() will raise attribute errors at runtime.

This mixin centralizes:
- attribute extraction/merging
- optional initializer scaling (USE_WB + SCALE_PLAN)
- optional rescaling of outputs (USE_SCALING)
- creation of the final quantized NodeProto

The Quantize<Op> mixins should define:
- OP_TYPE
- DOMAIN
- USE_WB (bool)
- USE_SCALING (bool)
- SCALE_PLAN (dict[int,int]) if initializer scaling is enabled
"""

OP_TYPE = None
DOMAIN = "ai.onnx.contrib"
DEFAULT_ATTRS: ClassVar = {}
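The docstring above describes `SCALE_PLAN` as a map from initializer input index to the number of times the global scale factor is applied. The following is an illustrative sketch of that assumed semantics only; `apply_scale_plan` and the Gemm-style plan are hypothetical, and the real scaling lives in the `BaseOpQuantizer` helpers the mixin relies on.

```python
import numpy as np


def apply_scale_plan(
    inputs: list[np.ndarray],
    scale_plan: dict[int, int],
    alpha: int,
) -> list[np.ndarray]:
    """Scale each planned input slot by alpha**power and cast to INT64."""
    out = list(inputs)
    for idx, power in scale_plan.items():
        out[idx] = np.round(inputs[idx] * float(alpha) ** power).astype(np.int64)
    return out


# Hypothetical Gemm-style plan: weights scaled once, bias scaled twice so it
# lines up with (alpha*x) @ (alpha*w).
x = np.zeros((1, 2))
w = np.array([[0.5], [-0.25]])
b = np.array([0.125])
scaled = apply_scale_plan([x, w, b], {1: 1, 2: 2}, alpha=1 << 8)
print(scaled[1].ravel(), scaled[2])  # [128 -64] [8192]
```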
92 changes: 92 additions & 0 deletions python/core/model_processing/onnx_quantizer/layers/clip.py
@@ -0,0 +1,92 @@
from __future__ import annotations

from typing import TYPE_CHECKING, ClassVar

if TYPE_CHECKING:
import onnx

from python.core.model_processing.onnx_quantizer.layers.base import (
BaseOpQuantizer,
QuantizerBase,
ScaleConfig,
)


class QuantizeClip(QuantizerBase):
"""
Quantization traits for ONNX Clip.
Semantics:
- X is already scaled/cast to INT64 at the graph boundary by the converter.
- Clip is elementwise + broadcasting.
- The bound inputs (min, max) should live in the *same* fixed-point scale
as X so that Clip(alpha*x; alpha*a, alpha*b) equals alpha * Clip(x; a, b),
i.e. the original Clip evaluated in the scaled domain.
Implementation:
- Treat inputs 1 and 2 (min, max) like "WB-style" slots: we let the
QuantizerBase machinery rescale / cast those inputs using the same
global scale factor.
- No extra internal scaling input is added (USE_SCALING = False).
"""

OP_TYPE = "Clip"
DOMAIN = "" # standard ONNX domain

# We DO want WB-style handling so that min/max initializers get quantized:
USE_WB = True

# Clip does not introduce its own scale input; it just runs in the
# existing fixed-point scale.
USE_SCALING = False

# Scale-plan for WB-style slots:
# - Input index 1: min
# - Input index 2: max
# Each should be scaled once by the global alpha (same as activations).
SCALE_PLAN: ClassVar = {1: 1, 2: 1}


class ClipQuantizer(BaseOpQuantizer, QuantizeClip):
"""
Quantizer for ONNX Clip.
- Keeps the node op_type as "Clip".
- Ensures that any bound inputs (min, max), whether they are dynamic
inputs or initializers, are converted to the same INT64 fixed-point
representation as X.
"""

def __init__(
self,
new_initializers: dict[str, onnx.TensorProto] | None = None,
) -> None:
# Match Max/Min/Add: we simply share the new_initializers dict
# with the converter so any constants we add are collected.
self.new_initializers = new_initializers

def quantize(
self,
node: onnx.NodeProto,
graph: onnx.GraphProto,
scale_config: ScaleConfig,
initializer_map: dict[str, onnx.TensorProto],
) -> list[onnx.NodeProto]:
# Delegate to the shared QuantizerBase logic, which will:
# - keep X as-is (already scaled/cast by the converter),
# - rescale / cast min/max according to SCALE_PLAN,
# - update initializers as needed.
return QuantizeClip.quantize(self, node, graph, scale_config, initializer_map)

def check_supported(
self,
node: onnx.NodeProto,
initializer_map: dict[str, onnx.TensorProto] | None = None,
) -> None:
"""
Minimal support check for Clip:
- Clip is variadic elementwise with optional min/max as inputs or attrs.
- We accept both forms; if attrs are present, ORT enforces semantics.
- Broadcasting is ONNX-standard; we don't restrict further here.
"""
_ = node, initializer_map
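As a sanity check on the scale-invariance the Clip docstring relies on, a standalone NumPy snippet (not part of the quantizer code) can confirm that clipping commutes with a positive global scale:

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=1_000)
a, b = -1.0, 1.0
alpha = float(1 << 16)

# clip(alpha*x, alpha*a, alpha*b) == alpha * clip(x, a, b) for alpha > 0,
# which is why the min/max bounds only need to share X's fixed-point scale.
assert np.allclose(np.clip(alpha * x, alpha * a, alpha * b),
                   alpha * np.clip(x, a, b))
```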
47 changes: 47 additions & 0 deletions python/core/model_processing/onnx_quantizer/layers/max.py
@@ -0,0 +1,47 @@
# python/core/model_processing/onnx_quantizer/layers/max.py
from __future__ import annotations

from typing import TYPE_CHECKING, ClassVar

if TYPE_CHECKING:
import onnx

from python.core.model_processing.onnx_quantizer.layers.base import (
BaseOpQuantizer,
QuantizerBase,
ScaleConfig,
)


class QuantizeMax(QuantizerBase):
OP_TYPE = "Max"
DOMAIN = ""
USE_WB = True
USE_SCALING = False
SCALE_PLAN: ClassVar = {1: 1}


class MaxQuantizer(BaseOpQuantizer, QuantizeMax):
def __init__(
self,
new_initializers: dict[str, onnx.TensorProto] | None = None,
) -> None:
self.new_initializers = new_initializers

def quantize(
self,
node: onnx.NodeProto,
graph: onnx.GraphProto,
scale_config: ScaleConfig,
initializer_map: dict[str, onnx.TensorProto],
) -> list[onnx.NodeProto]:
# Delegate to the shared QuantizerBase logic
return QuantizeMax.quantize(self, node, graph, scale_config, initializer_map)

def check_supported(
self,
node: onnx.NodeProto,
initializer_map: dict[str, onnx.TensorProto] | None = None,
) -> None:
# If later we want to enforce/relax broadcasting, add it here.
pass
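The same reasoning appears to apply to elementwise Max: it commutes with a positive global scale, so the plan `SCALE_PLAN = {1: 1}` only has to bring the second operand into the shared fixed-point scale, and no extra rescale node is introduced (`USE_SCALING = False`). A standalone check, not part of `MaxQuantizer`:

```python
import numpy as np

x = np.array([-3.0, 0.5, 2.0])
y = np.array([0.0, 1.0, -1.0])
alpha = float(1 << 8)

# Elementwise max commutes with a positive scale factor, so operands that
# share the same fixed-point scale need no extra rescaling afterwards.
assert np.allclose(np.maximum(alpha * x, alpha * y),
                   alpha * np.maximum(x, y))
```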