-
Notifications
You must be signed in to change notification settings - Fork 93
feat(metrics): column-store + ragged-series + accumulator protocol foundation #969
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,191 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from dataclasses import dataclass, field | ||
| from typing import TYPE_CHECKING, Any, ClassVar, Protocol, runtime_checkable | ||
|
|
||
| import numpy as np | ||
| from numpy.typing import NDArray | ||
|
|
||
| from aiperf.common.enums.metric_enums import MetricValueTypeVarT | ||
|
|
||
| if TYPE_CHECKING: | ||
| from aiperf.common.models.error_models import ErrorDetailsCount | ||
| from aiperf.common.models.record_models import MetricResult | ||
| from aiperf.common.types import MetricTagT | ||
| from aiperf.exporters.exporter_config import FileExportInfo | ||
| from aiperf.plugin.enums import AccumulatorType | ||
|
|
||
|
|
||
| @runtime_checkable | ||
| class AccumulatorResult(Protocol): | ||
| """Protocol for typed results from accumulator summarize().""" | ||
|
|
||
| def to_json(self) -> Any: | ||
| """Serialize to JSON-compatible structure.""" | ||
| ... | ||
|
|
||
| def to_csv(self) -> list[dict[str, Any]]: | ||
| """Serialize to list of CSV-compatible row dicts.""" | ||
| ... | ||
|
|
||
|
|
||
| @runtime_checkable | ||
| class MetricSeriesProtocol(Protocol[MetricValueTypeVarT]): | ||
| """Shared interface for run-level record metric series consumers. | ||
|
|
||
| Implemented by any in-memory accumulator that exposes a running sum, a | ||
| record count, and a finalized ``MetricResult`` summary. Used by the | ||
| per-tag dispatch path in MetricsAccumulator and by ColumnStore-backed | ||
| series wrappers so that derived metrics can read values without caring | ||
| about the underlying storage shape (numpy column, ragged CSR, growable | ||
| array, etc.). | ||
| """ | ||
|
|
||
| @property | ||
| def sum(self) -> MetricValueTypeVarT: | ||
| """Return the accumulated sum of all observed values.""" | ||
|
|
||
| def __len__(self) -> int: | ||
| """Return the number of observed values.""" | ||
|
|
||
| def to_result(self, tag: MetricTagT, header: str, unit: str) -> MetricResult: | ||
| """Summarize the accumulated values as a MetricResult.""" | ||
|
|
||
|
|
||
| @dataclass(frozen=True, slots=True) | ||
| class ExportContext: | ||
| """Context passed to domain-specific export_results() methods. | ||
|
|
||
| Bundles the profiling time window and error summary so that export_results | ||
| signatures stay stable as new fields are added. | ||
| """ | ||
|
|
||
| start_ns: int | None = None | ||
| """Inclusive start of the export time window (ns since epoch), or None for unbounded.""" | ||
|
|
||
| end_ns: int | None = None | ||
| """Exclusive end of the export time window (ns since epoch), or None for unbounded.""" | ||
|
|
||
| error_summary: list[ErrorDetailsCount] | None = None | ||
| """De-duplicated profile-run error counts to surface in the export, if any.""" | ||
|
|
||
| cancelled: bool = False | ||
| """True when the profile run was cancelled — exporters may emit partial artifacts.""" | ||
|
|
||
|
|
||
| @dataclass(slots=True) | ||
| class SummaryContext: | ||
| """Typed cross-accumulator communication context for dependency-ordered summarization. | ||
|
|
||
| NOT a Pydantic model — this is never serialized over the wire. It is created | ||
| by RecordsManager._process_results() and passed through the topological-sort | ||
| pipeline so each accumulator can read outputs from its declared dependencies. | ||
| """ | ||
|
|
||
| accumulators: dict[AccumulatorType, Any] = field(default_factory=dict) | ||
| """Live accumulator instances keyed by AccumulatorType — analyzers use this to query peer state.""" | ||
|
|
||
| accumulator_outputs: dict[str, Any] = field(default_factory=dict) | ||
| """Already-computed summary payloads keyed by accumulator name — populated as topo-order completes.""" | ||
|
|
||
| start_ns: int = 0 | ||
| """Inclusive start of the summarization window (ns since epoch); 0 means full range.""" | ||
|
|
||
| end_ns: int = 0 | ||
| """Exclusive end of the summarization window (ns since epoch); 0 means full range.""" | ||
|
|
||
| cancelled: bool = False | ||
| """True when the profile run was cancelled — analyzers may short-circuit.""" | ||
|
|
||
| def get_accumulator(self, accumulator_type: AccumulatorType) -> Any | None: | ||
| """Look up an accumulator by its type. Returns None if not present.""" | ||
| return self.accumulators.get(accumulator_type) | ||
|
|
||
| def get_output(self, accumulator_type: str) -> Any | None: | ||
| """Look up a previously-computed accumulator output. Returns None if not yet available.""" | ||
| return self.accumulator_outputs.get(accumulator_type) | ||
|
|
||
|
|
||
| @runtime_checkable | ||
| class AccumulatorProtocol(Protocol): | ||
| """Protocol for accumulators that ingest records, support time-range queries, and produce summaries. | ||
|
|
||
| Accumulators are the primary data stores in the records pipeline. Each accumulator | ||
| owns exactly one record type and is fully self-contained — no cross-accumulator | ||
| dependencies. Derived computations belong on AnalyzerProtocol instead. | ||
| """ | ||
|
|
||
| async def process_record(self, record: Any) -> None: | ||
| """Ingest a single record into this accumulator's internal storage.""" | ||
| ... | ||
|
|
||
| def query_time_range(self, start_ns: int, end_ns: int) -> NDArray[np.bool_]: | ||
| """Return a boolean mask where True marks records in [start_ns, end_ns). | ||
|
|
||
| The mask length equals the accumulator's record count. Callers can use | ||
| ``mask.sum()`` for the count or ``np.where(mask)[0]`` for indices. | ||
| """ | ||
| ... | ||
|
|
||
| async def summarize(self, ctx: SummaryContext | None = None) -> AccumulatorResult: | ||
| """Compute and return aggregated metric results. | ||
|
|
||
| Args: | ||
| ctx: Optional SummaryContext for reading dependency outputs. | ||
| None when called for realtime metrics (no cross-processor deps). | ||
| """ | ||
| ... | ||
|
|
||
| async def export_results(self, ctx: ExportContext) -> Any: | ||
| """Export final results for this accumulator. | ||
|
|
||
| Called once after profiling completes. Each accumulator returns its own | ||
| typed result (AccumulatorMetricsSummary, TelemetryExportData, ServerMetricsResults) | ||
| which is consumed by typed fields on the unified results message. | ||
|
|
||
| Args: | ||
| ctx: ExportContext with profiling time window, error summary, and cancelled flag. | ||
| """ | ||
| ... | ||
|
|
||
|
|
||
| @runtime_checkable | ||
| class AnalyzerProtocol(Protocol): | ||
| """Protocol for processors that don't ingest records directly but derive results | ||
| from other accumulators at summarization time. | ||
|
|
||
| Analyzers declare which accumulators they need via required_accumulators | ||
| and which outputs they depend on via summary_dependencies. They receive | ||
| accumulator references at construction and a SummaryContext at summarize time. | ||
| """ | ||
|
|
||
| required_accumulators: ClassVar[set[AccumulatorType]] | ||
| summary_dependencies: ClassVar[list[AccumulatorType]] | ||
|
|
||
| async def summarize(self, ctx: SummaryContext) -> Any: | ||
| """Compute derived results using data from declared accumulator dependencies.""" | ||
| ... | ||
|
|
||
|
|
||
| @runtime_checkable | ||
| class StreamExporterProtocol(Protocol): | ||
| """Protocol for processors that stream each record to an external sink (e.g. JSONL files). | ||
|
|
||
| Stream exporters have no summarization dependencies and are flushed after | ||
| all accumulators complete. | ||
| """ | ||
|
|
||
| async def process_record(self, record: Any) -> None: | ||
| """Write a single record to the export sink.""" | ||
| ... | ||
|
|
||
| async def finalize(self) -> None: | ||
| """Flush any buffered data. Called once after all records are processed.""" | ||
| ... | ||
|
|
||
| def get_export_info(self) -> FileExportInfo: | ||
| """Return metadata about the file this exporter writes to.""" | ||
| ... | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| """Per-tag setter closure factories for ``ColumnStore.ingest``. | ||
|
|
||
| These closures are resolved on first sighting of each metric tag (via Python | ||
| type dispatch) and cached in ``ColumnStore._tag_handlers``. Subsequent records | ||
| skip the isinstance ladder and the ``_ensure_*_column`` lookups entirely. | ||
|
|
||
| Profiling at 50k records (24 numeric tags + ICL) showed this hoist drops | ||
| ``ColumnStore.ingest`` wall by ~30% and total ingest function calls by 40%. | ||
| The handlers are invalidated by ``_grow()`` because numeric arrays get | ||
| reallocated; closures captured the old array references and would write to | ||
| garbage. List backends and string lists are unaffected (in-place growth) but | ||
| clearing all handlers on grow is simpler and grow runs ~log2(N) times. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from collections.abc import Callable | ||
| from typing import Any | ||
|
|
||
| import numpy as np | ||
| from numpy.typing import NDArray | ||
|
|
||
| from aiperf.metrics.list_metric_aggregation import TDigestListMetricAggregator | ||
| from aiperf.metrics.ragged_series import RaggedSeries | ||
|
|
||
|
|
||
| def make_numeric_handler( | ||
| col: NDArray[np.float64], | ||
| tag: str, | ||
| sums: dict[str, float], | ||
| counts: dict[str, int], | ||
| ) -> Callable[[int, Any], None]: | ||
| """Closure that writes a numeric metric value at ``idx`` and updates the | ||
| O(1) running sum/count side-channel. | ||
|
|
||
| The ``float()`` cast is intentionally absent: numpy's ``__setitem__`` | ||
| coerces Python ``int`` to ``float64`` automatically, and ``+=`` on the | ||
| sum dict promotes the int operand the same way. Saves a Python-level | ||
| function call per numeric metric per record (~5-8% on the scalar path). | ||
| """ | ||
|
|
||
| def handler(idx: int, value: Any) -> None: | ||
| col[idx] = value | ||
| sums[tag] = sums[tag] + value | ||
| counts[tag] = counts[tag] + 1 | ||
|
Comment on lines
+44
to
+47
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Guard numeric ingest against non-finite values. This path updates running numeric aggregates directly; accepting Proposed fix+from aiperf.common.finite import is_finite_value
...
def handler(idx: int, value: Any) -> None:
+ if not is_finite_value(value):
+ return
col[idx] = value
sums[tag] = sums[tag] + value
counts[tag] = counts[tag] + 1As per coding guidelines: “Numeric metric values crossing a serialization boundary or feeding a numerical algorithm must be finite or explicitly None - use aiperf.common.finite utilities”. 🤖 Prompt for AI Agents |
||
|
|
||
| return handler | ||
|
|
||
|
|
||
| def make_string_handler( | ||
| col: list[str | None], | ||
| ) -> Callable[[int, Any], None]: | ||
| """Closure that writes a string metric value at ``idx``. The list reference | ||
| survives capacity growth (``list.extend`` is in-place).""" | ||
|
|
||
| def handler(idx: int, value: Any) -> None: | ||
| col[idx] = value | ||
|
|
||
| return handler | ||
|
|
||
|
|
||
| def make_list_handler( | ||
| backend: RaggedSeries | TDigestListMetricAggregator, | ||
| ) -> Callable[[int, Any], None]: | ||
| """Closure that hands a list-valued metric to the configured list backend. | ||
| The backend reference is stable across ``ColumnStore._grow`` (list backends | ||
| own their own growth).""" | ||
|
|
||
| def handler(idx: int, value: Any) -> None: | ||
| backend.add_for_record(idx, value) | ||
|
|
||
| return handler | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding
__len__to the protocol whileMetricAggregatoris kept as a back-compat alias breaks existing custom aggregators that only implement the oldsum/to_resultruntime contract. Fix: keepMetricAggregatoron the old protocol shape or avoid the stricter protocol for legacyisinstancechecks.