Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/cli-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,11 @@ Number of text inputs to include in each request for batch processing endpoints.
<br/>_Constraints: ≥ 0_
<br/>_Default: `1`_

#### `--prompt-corpus` `<str>`

Source corpus for synthetic prompt text generation. `sonnet` uses Shakespeare sonnets (default for most loaders). `coding` uses pseudo-realistic, template-filled coding content (code, bash output, JSON, error tracebacks, git diffs, configs, markdown) — filler whose token distribution approximates real coding-agent traffic, driving realistic expert-routing patterns on Mixture-of-Experts models (sonnet over-activates English-prose experts and underweights the broader expert set hit by real agentic-coding workloads). When unset, the active dataset loader's default applies — most loaders default to 'sonnet'; agentic-coding loaders override to 'coding'.
<br/>_Choices: [`sonnet`, `coding`]_

### Prefix Prompt

#### `--prompt-prefix-pool-size`, `--prefix-prompt-pool-size`, `--num-prefix-prompts` `<int>`
Expand Down Expand Up @@ -1784,6 +1789,11 @@ Number of text inputs to include in each request for batch processing endpoints.
<br/>_Constraints: ≥ 0_
<br/>_Default: `1`_

#### `--prompt-corpus` `<str>`

Source corpus for synthetic prompt text generation. `sonnet` uses Shakespeare sonnets (default for most loaders). `coding` uses pseudo-realistic, template-filled coding content (code, bash output, JSON, error tracebacks, git diffs, configs, markdown) — filler whose token distribution approximates real coding-agent traffic, driving realistic expert-routing patterns on Mixture-of-Experts models (sonnet over-activates English-prose experts and underweights the broader expert set hit by real agentic-coding workloads). When unset, the active dataset loader's default applies — most loaders default to 'sonnet'; agentic-coding loaders override to 'coding'.
<br/>_Choices: [`sonnet`, `coding`]_

### Prefix Prompt

#### `--prompt-prefix-pool-size`, `--prefix-prompt-pool-size`, `--num-prefix-prompts` `<int>`
Expand Down
2 changes: 2 additions & 0 deletions src/aiperf/common/enums/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
OptimizationDirection,
PrerequisiteKind,
PrometheusMetricType,
PromptCorpus,
PromptSource,
RequestContentType,
ServerMetricsDiscoveryMode,
Expand Down Expand Up @@ -135,6 +136,7 @@
"PowerMetricUnitInfo",
"PrerequisiteKind",
"PrometheusMetricType",
"PromptCorpus",
"PromptSource",
"RequestContentType",
"SSEEventType",
Expand Down
13 changes: 13 additions & 0 deletions src/aiperf/common/enums/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,19 @@ def _missing_(cls, value: Any) -> Self:
return cls.UNKNOWN


class PromptCorpus(CaseInsensitiveStrEnum):
    """Corpus used for synthetic prompt text generation.

    Defined in :mod:`aiperf.common.enums.prompt_corpus` and re-exported here so
    leaf modules (e.g. ``aiperf.plugin.schema.schemas``) can import it without
    triggering this package's ``__init__`` chain. Keep that file as the
    authoritative definition.
    """

    # NOTE(review): the docstring above describes a re-export, but the members
    # are declared directly in this class body — confirm which module is meant
    # to be authoritative. The same text also appears verbatim as the
    # ``PromptCorpus`` description in aiperf-config.schema.json, so rewording it
    # presumably requires updating that schema as well.

    # Shakespeare sonnets; described in the option help as the default for
    # most loaders.
    SONNET = "sonnet"
    # Template-filled coding content (code, bash output, JSON, tracebacks,
    # git diffs, configs, markdown), per the option help text.
    CODING = "coding"


class PromptSource(CaseInsensitiveStrEnum):
SYNTHETIC = "synthetic"
FILE = "file"
Expand Down
77 changes: 77 additions & 0 deletions src/aiperf/common/hash_id_random_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Hash-ID-based random generator for parallel processing with reproducibility.

Enables parallel processing of traces with hash_ids while maintaining
reproducibility. Each (trace_id, hash_id) pair produces a deterministic random
sequence regardless of worker count or processing order.

Architecture:
Global Seed -> Base RNG -> (trace_id, hash_id) -> Deterministic tokens

The trace_id (typically a content hash of the trace file) ensures that different
trace files with overlapping hash_id values produce different content, while the
same trace file always produces identical results.
"""

import hashlib

from aiperf.common.random_generator import RandomGenerator

__all__ = ["HashIdRandomGenerator"]


class _DisabledNumpyRNG:
"""Raises on any attribute access to prevent NumPy RNG usage."""

def __getattr__(self, name):
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

cat -n src/aiperf/common/hash_id_random_generator.py

Repository: ai-dynamo/aiperf

Length of output: 3480


Add missing function type hints.

__getattr__ on line 28 is missing type annotations for the name parameter and return type. __init__ on line 51 is missing the -> None return type annotation. Per coding guidelines, all functions require complete type hints (params and return).

Proposed fix
+from typing import Never
+
 class _DisabledNumpyRNG:
     """Raises on any attribute access to prevent NumPy RNG usage."""
 
-    def __getattr__(self, name):
+    def __getattr__(self, name: str) -> Never:
         raise RuntimeError(
             "HashIdRandomGenerator does not support NumPy RNG operations. "
             "Use Python RNG methods (randrange, choice, etc.) instead."
         )
 
 class HashIdRandomGenerator(RandomGenerator):
     """RandomGenerator that re-seeds deterministically per (trace_id, hash_id).
 
     Designed for parallel processing where multiple workers need to generate
     identical content for the same hash_id within a trace file.
 
     Thread Safety:
         NOT thread-safe. Each worker process must have its own instance.
     """
 
     `@classmethod`
     def from_base_rng(cls, base_rng: RandomGenerator) -> "HashIdRandomGenerator":
         """Create from a base RandomGenerator (typically from rng.derive())."""
         base_seed = base_rng.seed or base_rng.randrange(0, 2**64)
         return cls(base_seed, _internal=True)
 
-    def __init__(self, base_seed: int, *, _internal: bool = False):
+    def __init__(self, base_seed: int, *, _internal: bool = False) -> None:
         super().__init__(base_seed, _internal=_internal)
         self._numpy_rng = _DisabledNumpyRNG()
         self._trace_id: str = ""
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/aiperf/common/hash_id_random_generator.py` at line 28, Add missing type
hints: annotate the __getattr__ method signature as def __getattr__(self, name:
str) -> Any (import Any from typing if not already), and annotate the
constructor as def __init__(self, ...) -> None (add -> None to its signature).
Update imports to include Any when adding the __getattr__ return type.

raise RuntimeError(
"HashIdRandomGenerator does not support NumPy RNG operations. "
"Use Python RNG methods (randrange, choice, etc.) instead."
)


class HashIdRandomGenerator(RandomGenerator):
"""RandomGenerator that re-seeds deterministically per (trace_id, hash_id).

Designed for parallel processing where multiple workers need to generate
identical content for the same hash_id within a trace file.

Thread Safety:
NOT thread-safe. Each worker process must have its own instance.
"""

@classmethod
def from_base_rng(cls, base_rng: RandomGenerator) -> "HashIdRandomGenerator":
    """Create from a base RandomGenerator (typically from rng.derive()).

    Args:
        base_rng: Generator whose ``seed`` is reused as the base seed. When
            that seed is ``None``, a value is drawn from ``base_rng`` via
            ``randrange(0, 2**64)`` instead.

    Returns:
        A new instance of ``cls`` built from the chosen base seed.
    """
    base_seed = base_rng.seed
    if base_seed is None:
        # Explicit None check rather than ``seed or ...``: a seed of 0 is
        # falsy, and replacing it with a random draw would silently break
        # reproducibility for that seed.
        base_seed = base_rng.randrange(0, 2**64)
    return cls(base_seed, _internal=True)
Comment on lines +48 to +49
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
python - <<'PY'
seed = 0
fallback = 123
print(seed or fallback)  # demonstrates current behavior picks fallback instead of 0
PY

Repository: ai-dynamo/aiperf

Length of output: 64


🏁 Script executed:

cat -n src/aiperf/common/hash_id_random_generator.py

Repository: ai-dynamo/aiperf

Length of output: 3480


🏁 Script executed:

rg "class RandomGenerator" -A 20 src/aiperf/

Repository: ai-dynamo/aiperf

Length of output: 1811


🏁 Script executed:

rg "self\.seed" src/aiperf/common/random_generator.py -B 2 -A 2

Repository: ai-dynamo/aiperf

Length of output: 42


🏁 Script executed:

cat -n src/aiperf/common/random_generator.py | head -80

Repository: ai-dynamo/aiperf

Length of output: 3380


🏁 Script executed:

cat -n src/aiperf/common/random_generator.py | sed -n '81,150p'

Repository: ai-dynamo/aiperf

Length of output: 3100


🏁 Script executed:

cat -n src/aiperf/common/random_generator.py | sed -n '118,122p'

Repository: ai-dynamo/aiperf

Length of output: 232


Fix seed value 0 being treated as falsy in from_base_rng.

Line 48 uses base_rng.seed or base_rng.randrange(0, 2**64), which treats seed value 0 as falsy. Since 0 is documented as a valid seed (per RandomGenerator's docstring: "Optional random seed (0 to 2^64-1)"), this breaks reproducibility when seed is legitimately 0. Replace the or operator with an explicit is not None check.

Proposed fix
-        base_seed = base_rng.seed or base_rng.randrange(0, 2**64)
+        base_seed = base_rng.seed if base_rng.seed is not None else base_rng.randrange(0, 2**64)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
base_seed = base_rng.seed or base_rng.randrange(0, 2**64)
return cls(base_seed, _internal=True)
base_seed = base_rng.seed if base_rng.seed is not None else base_rng.randrange(0, 2**64)
return cls(base_seed, _internal=True)
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/aiperf/common/hash_id_random_generator.py` around lines 48 - 49,
from_base_rng incorrectly treats base_rng.seed == 0 as falsy; change the logic
in from_base_rng to check explicitly for None (e.g., use "if base_rng.seed is
not None then base_seed = base_rng.seed else base_seed = base_rng.randrange(0,
2**64)") so a legitimate seed of 0 is preserved; update the return path that
constructs the instance (reference: class method from_base_rng and attribute
base_rng.seed) to use that explicit None check instead of "or".


def __init__(self, base_seed: int, *, _internal: bool = False) -> None:
    """Initialize the generator from a fixed base seed.

    Args:
        base_seed: Seed forwarded to the parent ``RandomGenerator``
            constructor.
        _internal: Forwarded unchanged to the parent constructor.
    """
    super().__init__(base_seed, _internal=_internal)
    # Swap in a guard object that raises on any attribute access, so NumPy
    # RNG use fails loudly. This runs after the parent constructor, which
    # presumably assigns ``_numpy_rng`` itself — confirm against
    # RandomGenerator.
    self._numpy_rng = _DisabledNumpyRNG()
    # Empty until set_trace_id() is called.
    self._trace_id: str = ""

def set_trace_id(self, trace_id: str) -> None:
    """Set trace identifier to scope hash_ids to a specific trace file.

    The stored value is mixed into the seed computed by
    ``reseed_for_hash_id``, so it only takes effect on the next reseed.

    Args:
        trace_id: Content hash or unique identifier for the trace file.
            Different trace files must use different trace_ids.
    """
    self._trace_id = trace_id

def reseed_for_hash_id(self, hash_id: int) -> None:
    """Point the Python RNG at the seed derived for ``hash_id``.

    The derived seed is the first 8 bytes (big-endian) of the SHA-256 digest
    of ``"<seed>:<trace_id>:<hash_id>"``. Every random operation after this
    call draws from that seed until the method is called again.

    Args:
        hash_id: KV block hash ID from trace data.
    """
    # Base seed, trace id and hash id together identify one random stream.
    seed_material = f"{self.seed}:{self._trace_id}:{hash_id}"
    digest = hashlib.sha256(seed_material.encode()).digest()
    derived_seed = int.from_bytes(digest[:8], "big")
    self._python_rng.seed(derived_seed)
20 changes: 20 additions & 0 deletions src/aiperf/config/dataset/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
AudioFormat,
ImageFormat,
ImageSource,
PromptCorpus,
)
from aiperf.config.base import BaseConfig
from aiperf.config.types import (
Expand Down Expand Up @@ -125,6 +126,25 @@ class PromptConfig(BaseConfig):
),
]

corpus: Annotated[
PromptCorpus | None,
Field(
default=None,
description="Source corpus for synthetic prompt text generation. "
"'sonnet' uses Shakespeare sonnets (default for most loaders). "
"'coding' uses pseudo-realistic, template-filled coding content "
"(code, bash output, JSON, error tracebacks, git diffs, configs, "
"markdown) — filler whose token distribution approximates real "
"coding-agent traffic, driving realistic expert-routing patterns "
"on Mixture-of-Experts models (sonnet over-activates English-prose "
"experts and underweights the broader expert set hit by real "
"agentic-coding workloads). "
"When unset, the active dataset loader's default applies — most "
"loaders default to 'sonnet'; agentic-coding loaders override to "
"'coding'.",
),
]

@field_validator("sequence_distribution")
@classmethod
def validate_sequence_probabilities(
Expand Down
2 changes: 2 additions & 0 deletions src/aiperf/config/flags/_converter_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ def _build_prompts(cli: CLIConfig) -> dict[str, Any]:
prompts["block_size"] = cli.prompt_input_tokens_block_size
if "prompt_batch_size" in s:
prompts["batch_size"] = cli.prompt_batch_size
if "prompt_corpus" in s and cli.prompt_corpus is not None:
prompts["corpus"] = cli.prompt_corpus
Comment on lines +82 to +83
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

--prompt-corpus is silently discarded for file-dataset paths

Line [82] stores prompt_corpus under prompts, but file datasets later remove prompts in _apply_dataset_type (Lines [330]-[338]). That makes --prompt-corpus a no-op for --input-file flows instead of applying or failing fast. Please preserve corpus for supported file/trace loaders (or explicitly reject it for unsupported file formats) rather than dropping it silently.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/aiperf/config/flags/_converter_dataset.py` around lines 82 - 83, The CLI
`--prompt-corpus` value is being added to the local `prompts` dict (when
`cli.prompt_corpus` is set) but later `_apply_dataset_type` removes `prompts`
for file datasets, making `--prompt-corpus` a no-op; update
`_apply_dataset_type` (or the dataset conversion flow that strips `prompts`) to
either preserve `prompts["corpus"]` for supported file/trace loaders (check the
loader type/format and pass `prompts` through to the file/trace loader
constructors) or validate early and raise a clear error when a provided
`cli.prompt_corpus` is incompatible with the chosen dataset type; locate
references to `prompts`, `cli.prompt_corpus`, and `_apply_dataset_type` in
_converter_dataset.py and implement the preservation or explicit rejection
accordingly.

return prompts


Expand Down
1 change: 1 addition & 0 deletions src/aiperf/config/flags/_section_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
"prompt_prefix_shared_system_length",
"prompt_prefix_user_context_length",
"prompt_sequence_distribution",
"prompt_corpus",
# ----- image modality -----
"image_width_mean",
"image_width_stddev",
Expand Down
24 changes: 24 additions & 0 deletions src/aiperf/config/flags/cli_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
GPUTelemetryMode,
ImageFormat,
ModelSelectionStrategy,
PromptCorpus,
RequestContentType,
ServerMetricsFormat,
SweepMode,
Expand Down Expand Up @@ -1006,6 +1007,29 @@ def url(self) -> str:
),
] = None

prompt_corpus: Annotated[
PromptCorpus | None,
Field(
default=None,
description="Source corpus for synthetic prompt text generation. "
"`sonnet` uses Shakespeare sonnets (default for most loaders). "
"`coding` uses pseudo-realistic, template-filled coding content "
"(code, bash output, JSON, error tracebacks, git diffs, configs, "
"markdown) — filler whose token distribution approximates real "
"coding-agent traffic, driving realistic expert-routing patterns "
"on Mixture-of-Experts models (sonnet over-activates English-prose "
"experts and underweights the broader expert set hit by real "
"agentic-coding workloads). "
"When unset, the active dataset loader's default applies — most "
"loaders default to 'sonnet'; agentic-coding loaders override to "
"'coding'.",
),
CLIParameter(
name=("--prompt-corpus",),
group=Groups.PROMPT,
),
] = None

##############################################################################
# Output Sequence Length (OSL)
##############################################################################
Expand Down
21 changes: 21 additions & 0 deletions src/aiperf/config/schema/aiperf-config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -9324,11 +9324,32 @@
"default": null,
"description": "Distribution of (ISL, OSL) pairs with probabilities for mixed workload simulation. Each entry specifies {isl, osl, probability}. Probabilities are percentages (0-100) and must sum to 100. When specified, requests are sampled from this distribution instead of using isl/osl fields.",
"title": "Sequencedistribution"
},
"corpus": {
"anyOf": [
{
"$ref": "#/$defs/PromptCorpus"
},
{
"type": "null"
}
],
"default": null,
"description": "Source corpus for synthetic prompt text generation. 'sonnet' uses Shakespeare sonnets (default for most loaders). 'coding' uses pseudo-realistic, template-filled coding content (code, bash output, JSON, error tracebacks, git diffs, configs, markdown) — filler whose token distribution approximates real coding-agent traffic, driving realistic expert-routing patterns on Mixture-of-Experts models (sonnet over-activates English-prose experts and underweights the broader expert set hit by real agentic-coding workloads). When unset, the active dataset loader's default applies — most loaders default to 'sonnet'; agentic-coding loaders override to 'coding'."
}
},
"title": "PromptConfig",
"type": "object"
},
"PromptCorpus": {
"description": "Corpus used for synthetic prompt text generation.\n\nDefined in :mod:`aiperf.common.enums.prompt_corpus` and re-exported here so\nleaf modules (e.g. ``aiperf.plugin.schema.schemas``) can import it without\ntriggering this package's ``__init__`` chain. Keep that file as the\nauthoritative definition.",
"enum": [
"sonnet",
"coding"
],
"title": "PromptCorpus",
"type": "string"
},
"PublicDataset": {
"additionalProperties": false,
"description": "Public dataset configuration.\n\nUses well-known public benchmarking datasets that are\nautomatically downloaded and processed by AIPerf.",
Expand Down
38 changes: 29 additions & 9 deletions src/aiperf/dataset/composer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
from typing import TYPE_CHECKING

from aiperf.common import random_generator as rng
from aiperf.common.enums import ConversationContextMode, ModelSelectionStrategy
from aiperf.common.enums import (
ConversationContextMode,
ModelSelectionStrategy,
PromptCorpus,
)
from aiperf.common.mixins import AIPerfLoggerMixin
from aiperf.common.models import Conversation, Turn
from aiperf.common.models.sequence_distribution import (
Expand All @@ -30,6 +34,7 @@
)
from aiperf.config.distributions import SamplingDistribution
from aiperf.config.resolution.plan import BenchmarkRun
from aiperf.dataset.generator.coding_content import CodingContentGenerator


class BaseDatasetComposer(AIPerfLoggerMixin, ABC):
Expand Down Expand Up @@ -66,14 +71,8 @@ def __init__(
)

# Create generators (prompt generator requires a tokenizer)
self.prompt_generator: PromptGenerator | None = (
PromptGenerator(
prompts=self._synthetic_prompts,
prefix_prompts=self._synthetic_prefix_prompts,
tokenizer=tokenizer,
)
if tokenizer
else None
self.prompt_generator: PromptGenerator | CodingContentGenerator | None = (
self._create_prompt_generator(tokenizer) if tokenizer else None
)
self.image_generator = ImageGenerator(self._synthetic_images)
self.audio_generator = AudioGenerator(self._synthetic_audio)
Expand All @@ -95,6 +94,27 @@ def __init__(
# Cache for turn-level sequence lengths to ensure ISL/OSL pairing consistency
self._turn_sequence_cache: dict[int, tuple[int, int]] = {}

def _create_prompt_generator(
    self, tokenizer: Tokenizer
) -> PromptGenerator | CodingContentGenerator:
    """Build the prompt generator that matches the configured corpus.

    Args:
        tokenizer: Tokenizer handed to whichever generator is constructed.

    Returns:
        A ``CodingContentGenerator`` when synthetic prompts are configured
        with the ``coding`` corpus, otherwise a ``PromptGenerator``.
    """
    prompts_cfg = self._synthetic_prompts
    wants_coding = (
        prompts_cfg is not None and prompts_cfg.corpus == PromptCorpus.CODING
    )

    if not wants_coding:
        # No prompt config, corpus unset, or a non-coding corpus.
        return PromptGenerator(
            prompts=prompts_cfg,
            prefix_prompts=self._synthetic_prefix_prompts,
            tokenizer=tokenizer,
        )

    # Imported only on the coding path, as in the original layout.
    from aiperf.dataset.generator.coding_content import CodingContentGenerator

    return CodingContentGenerator(
        config=prompts_cfg,
        prefix_prompts=self._synthetic_prefix_prompts,
        tokenizer=tokenizer,
    )

@abstractmethod
def create_dataset(self) -> list[Conversation]:
"""
Expand Down
27 changes: 25 additions & 2 deletions src/aiperf/dataset/composer/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@

import orjson

from aiperf.common.enums import ConversationContextMode, DatasetFormat
from aiperf.common.enums import ConversationContextMode, DatasetFormat, PromptCorpus
from aiperf.common.models import Conversation
from aiperf.common.tokenizer import Tokenizer
from aiperf.common.utils import load_json_str
from aiperf.config.dataset import FileDataset
from aiperf.config.dataset.content import PromptConfig
from aiperf.dataset.composer.base import BaseDatasetComposer
from aiperf.dataset.loader.base_loader import BaseLoader
from aiperf.dataset.utils import check_file_exists
Expand Down Expand Up @@ -254,7 +255,29 @@ def _create_loader_instance(self, dataset_type: CustomDatasetType) -> None:
"Trace datasets require a tokenizer for prompt synthesis. "
"Ensure the endpoint supports tokenization or provide a --tokenizer."
)
kwargs["prompt_generator"] = self.prompt_generator

corpus = (
self._synthetic_prompts.corpus
if self._synthetic_prompts is not None
and self._synthetic_prompts.corpus is not None
else loader_metadata.default_prompt_corpus
)
if corpus == PromptCorpus.CODING:
from aiperf.dataset.generator.coding_content import (
CodingContentGenerator,
)

prompt_config = (
self._synthetic_prompts
if self._synthetic_prompts is not None
else PromptConfig()
)
kwargs["prompt_generator"] = CodingContentGenerator(
config=prompt_config,
tokenizer=self.prompt_generator.tokenizer,
)
else:
kwargs["prompt_generator"] = self.prompt_generator

if loader_metadata.default_block_size is not None:
kwargs["default_block_size"] = loader_metadata.default_block_size
Expand Down
Loading
Loading