From cf88051c5778c85d60e15542e0451e872db3fd3f Mon Sep 17 00:00:00 2001
From: Matthew Kotila <7692737+matthewkotila@users.noreply.github.com>
Date: Wed, 27 May 2026 14:51:44 -0700
Subject: [PATCH] fix(redact): tighten sensitive-token list to stop matching
 LLM token-count flags

_CLI_COMMAND_SENSITIVE_TOKENS previously included bare "token" as a
substring marker. Combined with the substring-match in
_redact_cli_args (`if any(tok in key for tok in ...)`), this caused
every aiperf flag with "tokens" (plural) in its name -- the entire
LLM token-count family -- to have its value silently replaced with
<redacted> in the canonical cli_command string.

Real-run output showed `--synthetic-input-tokens-mean '<redacted>'`,
`--output-tokens-mean '<redacted>'`, etc., breaking cli_command's
reproducibility purpose: you can't replay a benchmark whose configured
token counts have been scrubbed.

Replace the bare "token" with the specific auth-token compound forms
the redaction was intended to catch (api-token, auth-token,
access-token, bearer-token, id-token, refresh-token, with both - and
_ variants). Each token entry now contains at least one - or _, so
substring-matching no longer fires on the token-count flags. The
pre-existing "authorization" and api-key/api_key entries are unchanged.

Tradeoff: bare --token <secret> is no longer caught, nor is any other
*-token flag whose compound form is not in the list. Aiperf has no
such flag today; future additions can extend the list explicitly.

Tests:
- 6 parametrized regression cases for the token-count flag family
  (input/output/synthetic-input variants), asserting numeric values
  pass through unredacted.
- 7 parametrized positive cases for the auth-token compound family,
  asserting their secret values are still redacted.

The buggy substring match was introduced alongside the v2 config
refactor (commit 94a91026 / PR #912). Test coverage at the time
exercised --api-key only; the token-count family was uncovered.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/aiperf/common/redact.py      | 15 +++++++++-
 tests/unit/common/test_redact.py | 50 ++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/src/aiperf/common/redact.py b/src/aiperf/common/redact.py
index 20390af0d..a355f6f3b 100644
--- a/src/aiperf/common/redact.py
+++ b/src/aiperf/common/redact.py
@@ -238,7 +238,20 @@ def redact_cli_command(cmd: str) -> str:
     return cmd
 
 
-_CLI_COMMAND_SENSITIVE_TOKENS = ("api-key", "api_key", "authorization", "token")
+# Each token entry must contain a `-` or `_` so it can't substring-match the
+# LLM token-count flags (bare `"token"` matched every `--*-tokens-mean` flag).
+# Bare `--token` is intentionally not matched; add a specific compound form
+# (e.g. `"my-token"`) if a new auth flag needs to be covered.
+_CLI_COMMAND_SENSITIVE_TOKENS = (
+    "api-key", "api_key",
+    "api-token", "api_token",
+    "auth-token", "auth_token",
+    "access-token", "access_token",
+    "bearer-token", "bearer_token",
+    "id-token", "id_token",
+    "refresh-token", "refresh_token",
+    "authorization",
+)  # fmt: skip
 
 
 def _redact_cli_args(args: list) -> list:
diff --git a/tests/unit/common/test_redact.py b/tests/unit/common/test_redact.py
index 48deb3fea..d007da821 100644
--- a/tests/unit/common/test_redact.py
+++ b/tests/unit/common/test_redact.py
@@ -1455,6 +1455,56 @@ def test_non_sensitive_args_preserved_in_cli_command(self):
         assert "http://localhost:8000" in cmd
         assert "gpt2" in cmd
 
+    @pytest.mark.parametrize(
+        "flag, value",
+        [
+            param("--input-tokens-mean", "1024", id="input-tokens-mean"),
+            param("--input-tokens-stddev", "0", id="input-tokens-stddev"),
+            param("--output-tokens-mean", "128", id="output-tokens-mean"),
+            param("--output-tokens-stddev", "32", id="output-tokens-stddev"),
+            param(
+                "--synthetic-input-tokens-mean", "1024", id="synthetic-input-tokens-mean",
+            ),
+            param(
+                "--synthetic-input-tokens-stddev",
+                "0",
+                id="synthetic-input-tokens-stddev",
+            ),
+        ],
+    )  # fmt: skip
+    def test_token_count_flags_preserve_value_in_cli_command(self, flag, value):
+        """Regression: flag names containing 'tokens' (plural — LLM token
+        counts) must not be redacted just because they share a substring with
+        auth-token flags. Reported via real-run output where every
+        --*-tokens-* flag value came back as '<redacted>'."""
+        cmd = self._build_cli_command(
+            ["aiperf", "profile", "--model", "gpt2", flag, value]
+        )
+        assert value in cmd, f"{flag} value {value!r} should not be redacted"
+        assert REDACTED_VALUE not in cmd
+
+    @pytest.mark.parametrize(
+        "flag",
+        [
+            param("--api-token", id="api-token"),
+            param("--api_token", id="api_token-underscore"),
+            param("--auth-token", id="auth-token"),
+            param("--access-token", id="access-token"),
+            param("--bearer-token", id="bearer-token"),
+            param("--id-token", id="id-token"),
+            param("--refresh-token", id="refresh-token"),
+        ],
+    )  # fmt: skip
+    def test_auth_token_compound_flags_redacted_in_cli_command(self, flag):
+        """Auth-token flag variants must still be redacted after tightening
+        the sensitive-token list away from the bare-'token' substring match."""
+        secret = "nv-secret-AUTH-TOKEN-9876543210"
+        cmd = self._build_cli_command(
+            ["aiperf", "profile", "--model", "gpt2", flag, secret]
+        )
+        assert secret not in cmd
+        assert REDACTED_VALUE in cmd
+
 
 # =============================================================================
 # ErrorDetails safe repr