From cf88051c5778c85d60e15542e0451e872db3fd3f Mon Sep 17 00:00:00 2001 From: Matthew Kotila <7692737+matthewkotila@users.noreply.github.com> Date: Wed, 27 May 2026 14:51:44 -0700 Subject: [PATCH] fix(redact): tighten sensitive-token list to stop matching LLM token-count flags _CLI_COMMAND_SENSITIVE_TOKENS previously included bare "token" as a substring marker. Combined with the substring-match in _redact_cli_args (`if any(tok in key for tok in ...)`), this caused every aiperf flag with "tokens" (plural) in its name -- the entire LLM token-count family -- to have its value silently replaced with the redaction placeholder (REDACTED_VALUE) in the canonical cli_command string. Real-run output showed `--synthetic-input-tokens-mean ''`, `--output-tokens-mean ''`, etc., breaking cli_command's reproducibility purpose: you can't replay a benchmark whose configured token counts have been scrubbed. Replace the bare "token" with the specific auth-token compound forms the redaction was intended to catch (api-token, auth-token, access-token, bearer-token, id-token, refresh-token, with both - and _ variants). Each token-related entry now contains at least one - or _, so substring-matching can't fire on innocent plurals; the pre-existing "authorization" entry is kept unchanged and is the one entry without a separator. Tradeoff: bare --token is no longer caught. Aiperf has no such flag today; future additions can extend the list explicitly. Tests: - 6 parametrized regression cases for the token-count flag family (input/output/synthetic-input variants), asserting numeric values pass through unredacted. - 7 parametrized positive cases for the auth-token compound family, asserting their secret values are still redacted. The buggy substring match was introduced alongside the v2 config refactor (commit 94a91026 / PR #912). Test coverage at the time exercised --api-key only; the token-count family was uncovered. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/aiperf/common/redact.py | 15 +++++++++- tests/unit/common/test_redact.py | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/src/aiperf/common/redact.py b/src/aiperf/common/redact.py index 20390af0d..a355f6f3b 100644 --- a/src/aiperf/common/redact.py +++ b/src/aiperf/common/redact.py @@ -238,7 +238,20 @@ def redact_cli_command(cmd: str) -> str: return cmd -_CLI_COMMAND_SENSITIVE_TOKENS = ("api-key", "api_key", "authorization", "token") +# Each entry must contain a `-` or `_` so it can't substring-match an innocent +# plural (e.g. bare `"token"` would match `--*-tokens-mean` flags carrying LLM +# token counts). Bare `--token` is intentionally not matched; add a specific +# compound form (e.g. `"my-token"`) if a new auth flag needs to be covered. +_CLI_COMMAND_SENSITIVE_TOKENS = ( + "api-key", "api_key", + "api-token", "api_token", + "auth-token", "auth_token", + "access-token", "access_token", + "bearer-token", "bearer_token", + "id-token", "id_token", + "refresh-token", "refresh_token", + "authorization", +) # fmt: skip def _redact_cli_args(args: list) -> list: diff --git a/tests/unit/common/test_redact.py b/tests/unit/common/test_redact.py index 48deb3fea..d007da821 100644 --- a/tests/unit/common/test_redact.py +++ b/tests/unit/common/test_redact.py @@ -1455,6 +1455,56 @@ def test_non_sensitive_args_preserved_in_cli_command(self): assert "http://localhost:8000" in cmd assert "gpt2" in cmd + @pytest.mark.parametrize( + "flag, value", + [ + param("--input-tokens-mean", "1024", id="input-tokens-mean"), + param("--input-tokens-stddev", "0", id="input-tokens-stddev"), + param("--output-tokens-mean", "128", id="output-tokens-mean"), + param("--output-tokens-stddev", "32", id="output-tokens-stddev"), + param( + "--synthetic-input-tokens-mean", "1024", id="synthetic-input-tokens-mean", + ), + param( + "--synthetic-input-tokens-stddev", + "0", + 
id="synthetic-input-tokens-stddev", + ), + ], + ) # fmt: skip + def test_token_count_flags_preserve_value_in_cli_command(self, flag, value): + """Regression: flag names containing 'tokens' (plural — LLM token + counts) must not be redacted just because they share a substring with + auth-token flags. Reported via real-run output where every + --*-tokens-* flag value came back as ''.""" + cmd = self._build_cli_command( + ["aiperf", "profile", "--model", "gpt2", flag, value] + ) + assert value in cmd, f"{flag} value {value!r} should not be redacted" + assert REDACTED_VALUE not in cmd + + @pytest.mark.parametrize( + "flag", + [ + param("--api-token", id="api-token"), + param("--api_token", id="api_token-underscore"), + param("--auth-token", id="auth-token"), + param("--access-token", id="access-token"), + param("--bearer-token", id="bearer-token"), + param("--id-token", id="id-token"), + param("--refresh-token", id="refresh-token"), + ], + ) # fmt: skip + def test_auth_token_compound_flags_redacted_in_cli_command(self, flag): + """Auth-token flag variants must still be redacted after tightening + the sensitive-token list away from the bare-'token' substring match.""" + secret = "nv-secret-AUTH-TOKEN-9876543210" + cmd = self._build_cli_command( + ["aiperf", "profile", "--model", "gpt2", flag, secret] + ) + assert secret not in cmd + assert REDACTED_VALUE in cmd + # ============================================================================= # ErrorDetails safe repr