From 431d9092ec742e0020212fb556f2f4b459b877ff Mon Sep 17 00:00:00 2001 From: Barak Korren Date: Tue, 19 May 2026 15:56:51 +0300 Subject: [PATCH 1/5] fix(security): run unicode normalization before secret redaction Zero-width characters interleaved in token-shaped strings bypassed secret regexes when redaction ran before unicode stripping in sandbox post-tool hooks. Reorder hooks to normalize first, extend OutputPipeline and output scanning paths, and add regression tests. Fixes #444 Co-authored-by: Cursor Signed-off-by: Barak Korren --- internal/cli/run.go | 4 +- internal/cli/scan.go | 12 ++- internal/security/hooks.go | 14 ++-- .../security/hooks/posttool_chain_test.py | 76 +++++++++++++++++++ internal/security/hooks_test.go | 43 ++++++++++- internal/security/scanner.go | 4 +- internal/security/scanner_test.go | 15 ++++ 7 files changed, 152 insertions(+), 16 deletions(-) create mode 100644 internal/security/hooks/posttool_chain_test.py diff --git a/internal/cli/run.go b/internal/cli/run.go index 763213002..c30d9c3f2 100644 --- a/internal/cli/run.go +++ b/internal/cli/run.go @@ -1408,7 +1408,7 @@ func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { return nil } - redactor := security.NewSecretRedactor() + pipeline := security.OutputPipeline() redacted := 0 findingsPath := filepath.Join(outputDir, "security", "findings.jsonl") @@ -1430,7 +1430,7 @@ func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { return nil } - result := redactor.Scan(string(content)) + result := pipeline.Scan(string(content)) if len(result.Findings) > 0 { redacted += len(result.Findings) relPath, _ := filepath.Rel(outputDir, path) diff --git a/internal/cli/scan.go b/internal/cli/scan.go index 61d8968c6..ab290c0bf 100644 --- a/internal/cli/scan.go +++ b/internal/cli/scan.go @@ -186,15 +186,19 @@ Usage in a workflow step: return nil } - redactor := security.NewSecretRedactor() - result := redactor.Scan(text) + pipeline := security.OutputPipeline() + result := pipeline.Scan(text) if len(result.Findings) > 0 { - printer.StepWarn(fmt.Sprintf("Redacted %d secret(s) from agent output", len(result.Findings))) + printer.StepWarn(fmt.Sprintf("Sanitized %d finding(s) in agent output", len(result.Findings))) for _, f := range result.Findings { printer.StepWarn(fmt.Sprintf(" %s: %s", f.Name, f.Detail)) } - fmt.Fprint(os.Stdout, result.Sanitized) + out := result.Sanitized + if out == "" { + out = text + } + fmt.Fprint(os.Stdout, out) } else { printer.StepDone("No secrets found in output") fmt.Fprint(os.Stdout, text) diff --git a/internal/security/hooks.go b/internal/security/hooks.go index 16b03da3b..8667f4c61 100644 --- a/internal/security/hooks.go +++ b/internal/security/hooks.go @@ -110,22 +110,24 @@ func GenerateClaudeSettings(h *harness.Harness) ([]byte, error) { // PostToolUse hooks for Bash|WebFetch|Read. Combined into a single matcher // so Claude Code chains them sequentially (separate matchers run in parallel // on the original result, which would cause modifications to conflict). - // Order: context suppress (compacts verbose success output) → secret redact - // → unicode scan. Suppressing first avoids scanning text we'd discard. + // Order: context suppress (compacts verbose success output) → unicode normalize + // → secret redact. Suppressing first avoids scanning text we'd discard. + // Invariant: unicode normalization must run before secret redaction so + // zero-width characters cannot break prefix regexes and reconstruct secrets. var postToolHooks []hookEntry if contextSuppressPostToolEnabled(sec) { postToolHooks = append(postToolHooks, hookEntry{ Type: "command", Command: "python3 " + SandboxHooksDir + "/context_suppress_posttool.py", }) } - if secretRedactPostToolEnabled(sec) { + if unicodePostToolEnabled(sec) { postToolHooks = append(postToolHooks, hookEntry{ - Type: "command", Command: "python3 " + SandboxHooksDir + "/secret_redact_posttool.py", + Type: "command", Command: "python3 " + SandboxHooksDir + "/unicode_posttool.py", }) } - if unicodePostToolEnabled(sec) { + if secretRedactPostToolEnabled(sec) { postToolHooks = append(postToolHooks, hookEntry{ - Type: "command", Command: "python3 " + SandboxHooksDir + "/unicode_posttool.py", + Type: "command", Command: "python3 " + SandboxHooksDir + "/secret_redact_posttool.py", }) } if len(postToolHooks) > 0 { diff --git a/internal/security/hooks/posttool_chain_test.py b/internal/security/hooks/posttool_chain_test.py new file mode 100644 index 000000000..a0ff78bae --- /dev/null +++ b/internal/security/hooks/posttool_chain_test.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +"""Integration tests for post-tool hook chain ordering (unicode before secret redact).""" + +from __future__ import annotations + +import json +import subprocess +import sys +import unittest +from pathlib import Path + +HOOKS_DIR = Path(__file__).parent +UNICODE_HOOK = str(HOOKS_DIR / "unicode_posttool.py") +SECRET_HOOK = str(HOOKS_DIR / "secret_redact_posttool.py") + +PLAIN_PAT = "ghp_FAKEtesttoken000000000000000000000000" + + +def obfuscate_with_zwnj(text: str) -> str: + """Insert zero-width non-joiner (U+200C) between each character.""" + return "\u200c".join(text) + + +def run_hook(script: str, tool_result: str) -> tuple[int, str]: + proc = subprocess.run( + [sys.executable, script], + input=json.dumps({"tool_name": "Read", "tool_result": tool_result}), + capture_output=True, + text=True, + timeout=10, + ) + return proc.returncode, proc.stdout + + +def run_chain(tool_result: str) -> str: + """Run unicode_posttool then secret_redact_posttool (correct sandbox order).""" + rc, stdout = run_hook(UNICODE_HOOK, tool_result) + if rc != 0: + raise RuntimeError(f"unicode hook failed: rc={rc}") + if stdout.strip(): + out = json.loads(stdout) + tool_result = out["tool_result"] + + rc, stdout = run_hook(SECRET_HOOK, tool_result) + if rc != 0: + raise RuntimeError(f"secret_redact hook failed: rc={rc}") + if stdout.strip(): + out = json.loads(stdout) + return out["tool_result"] + return tool_result + + +class TestPostToolChain(unittest.TestCase): + def test_plain_pat_redacted_by_chain(self): + result = run_chain(PLAIN_PAT) + self.assertNotIn("ghp_FAKEtest", result) + self.assertIn("...", result) + + def test_zero_width_obfuscated_pat_redacted_by_chain(self): + obfuscated = obfuscate_with_zwnj(PLAIN_PAT) + result = run_chain(obfuscated) + self.assertNotIn("ghp_FAKEtest", result) + self.assertIn("...", result) + + def test_redact_alone_misses_zero_width_obfuscated_pat(self): + obfuscated = obfuscate_with_zwnj(PLAIN_PAT) + rc, stdout = run_hook(SECRET_HOOK, obfuscated) + self.assertEqual(rc, 0) + # secret_redact alone does not modify output when regex cannot match + self.assertEqual(stdout.strip(), "") + # Obfuscated token still present in source (would leak after unicode strips ZWNJ) + self.assertIn("\u200c", obfuscated) + + +if __name__ == "__main__": + unittest.main() diff --git a/internal/security/hooks_test.go b/internal/security/hooks_test.go index abf456b59..170f1401e 100644 --- a/internal/security/hooks_test.go +++ b/internal/security/hooks_test.go @@ -2,6 +2,7 @@ package security import ( "encoding/json" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -32,7 +33,7 @@ func TestGenerateClaudeSettings_AllDefaults(t *testing.T) { matcher := postTools[0].(map[string]any) assert.Equal(t, "Bash|WebFetch|Read", matcher["matcher"]) chainedHooks := matcher["hooks"].([]any) - assert.Len(t, chainedHooks, 3) // context_suppress → secret_redact → unicode + assert.Len(t, chainedHooks, 3) // context_suppress → unicode → secret_redact // Verify canary hook has its own * matcher. canaryMatcher := postTools[1].(map[string]any) @@ -222,10 +223,46 @@ func TestGenerateClaudeSettings_ContextSuppressDisabled(t *testing.T) { postTools := hooks["PostToolUse"].([]any) assert.Len(t, postTools, 2) // chain matcher + canary matcher - // With context_suppress disabled: secret_redact + unicode in the chain. + // With context_suppress disabled: unicode + secret_redact in the chain. matcher := postTools[0].(map[string]any) chainedHooks := matcher["hooks"].([]any) - assert.Len(t, chainedHooks, 2) // secret_redact + unicode + assert.Len(t, chainedHooks, 2) // unicode + secret_redact +} + +func TestGenerateClaudeSettings_PostToolSanitizeHookOrder(t *testing.T) { + h := &harness.Harness{Agent: "test.md"} + data, err := GenerateClaudeSettings(h) + require.NoError(t, err) + + var settings map[string]any + require.NoError(t, json.Unmarshal(data, &settings)) + + postTools := settings["hooks"].(map[string]any)["PostToolUse"].([]any) + matcher := postTools[0].(map[string]any) + require.Equal(t, "Bash|WebFetch|Read", matcher["matcher"]) + + chainedHooks := matcher["hooks"].([]any) + commands := make([]string, len(chainedHooks)) + for i, h := range chainedHooks { + commands[i] = h.(map[string]any)["command"].(string) + } + + hookIndex := func(substr string) int { + for i, cmd := range commands { + if strings.Contains(cmd, substr) { + return i + } + } + t.Fatalf("hook containing %q not found in %v", substr, commands) + return -1 + } + + suppressIdx := hookIndex("context_suppress_posttool.py") + unicodeIdx := hookIndex("unicode_posttool.py") + redactIdx := hookIndex("secret_redact_posttool.py") + + assert.Less(t, suppressIdx, unicodeIdx, "context_suppress must run before unicode") + assert.Less(t, unicodeIdx, redactIdx, "unicode must run before secret_redact") } func TestGenerateClaudeSettings_CanaryPostToolDisabled(t *testing.T) { diff --git a/internal/security/scanner.go b/internal/security/scanner.go index 4334569d7..a793cdee5 100644 --- a/internal/security/scanner.go +++ b/internal/security/scanner.go @@ -92,9 +92,11 @@ func InputPipeline() *Pipeline { // OutputPipeline returns the standard output scanning pipeline for // agent-generated text before posting to the forge. -// 1. SecretRedactor — redact API keys, tokens, credentials +// 1. UnicodeNormalizer — strip invisible chars, normalize fullwidth +// 2. SecretRedactor — redact API keys, tokens, credentials func OutputPipeline() *Pipeline { return NewPipeline( + NewUnicodeNormalizer(), NewSecretRedactor(), ) } diff --git a/internal/security/scanner_test.go b/internal/security/scanner_test.go index 2606c792e..39991af5c 100644 --- a/internal/security/scanner_test.go +++ b/internal/security/scanner_test.go @@ -297,6 +297,21 @@ func TestPipeline(t *testing.T) { assert.NotContains(t, r.Sanitized, "ghp_FAKEtest") }) + t.Run("normalize then redact catches zero-width obfuscated PAT", func(t *testing.T) { + p := NewPipeline(NewUnicodeNormalizer(), NewSecretRedactor()) + plain := "ghp_FAKEtesttoken000000000000000000000000" + var obfuscated strings.Builder + for _, r := range plain { + obfuscated.WriteRune(r) + obfuscated.WriteRune('\u200c') + } + r := p.Scan(obfuscated.String()) + assert.False(t, r.Safe) + assert.True(t, hasFinding(r, "zero_width")) + assert.True(t, hasFinding(r, "github_pat")) + assert.NotContains(t, r.Sanitized, "ghp_FAKEtest") + }) + t.Run("clean text passes both", func(t *testing.T) { p := InputPipeline() r := p.Scan("Normal commit message fixing a null pointer bug.") From 6ec05a220bb95535048dede0ef965f7b52f62535 Mon Sep 17 00:00:00 2001 From: Barak Korren Date: Wed, 20 May 2026 09:09:38 +0300 Subject: [PATCH 2/5] fix(security): address PR review feedback on unicode coverage Expand invisible-character stripping (U+200E/F, U+180E, etc.), align Go ANSI/OSC handling with Python including post-NFKC rescan, add empty Sanitized guards and consistent scan output logging. Co-authored-by: Cursor Signed-off-by: Barak Korren --- internal/cli/run.go | 28 +++-- internal/cli/scan.go | 5 +- .../security/hooks/posttool_chain_test.py | 30 +++--- internal/security/hooks/unicode_posttool.py | 18 +++- internal/security/scanner_test.go | 15 +++ internal/security/unicode.go | 102 +++++++++++++++--- 6 files changed, 157 insertions(+), 41 deletions(-) diff --git a/internal/cli/run.go b/internal/cli/run.go index c30d9c3f2..7775ee404 100644 --- a/internal/cli/run.go +++ b/internal/cli/run.go @@ -1400,8 +1400,9 @@ func scanRepoContextFiles(repoDir string) []security.Finding { return allFindings } -// scanOutputFiles runs the secret redactor on extracted output files, -// recursively walking all subdirectories (iteration-N/output/, etc.). +// scanOutputFiles runs the output security pipeline (unicode normalization and +// secret redaction) on extracted output files, recursively walking all +// subdirectories (iteration-N/output/, etc.). func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { if _, err := os.Stat(outputDir); os.IsNotExist(err) { printer.StepInfo("No output files to scan") @@ -1409,7 +1410,7 @@ func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { } pipeline := security.OutputPipeline() - redacted := 0 + findingCount := 0 findingsPath := filepath.Join(outputDir, "security", "findings.jsonl") err := filepath.WalkDir(outputDir, func(path string, d os.DirEntry, err error) error { @@ -1430,12 +1431,13 @@ func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { return nil } - result := pipeline.Scan(string(content)) + text := string(content) + result := pipeline.Scan(text) if len(result.Findings) > 0 { - redacted += len(result.Findings) + findingCount += len(result.Findings) relPath, _ := filepath.Rel(outputDir, path) for _, f := range result.Findings { - printer.StepWarn(fmt.Sprintf("Redacted [%s] in %s: %s", f.Name, relPath, f.Detail)) + printer.StepWarn(fmt.Sprintf("Sanitized [%s] in %s: %s", f.Name, relPath, f.Detail)) security.AppendFinding(findingsPath, security.TracedFinding{ TraceID: traceID, @@ -1444,8 +1446,12 @@ func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { Finding: f, }) } - if writeErr := os.WriteFile(path, []byte(result.Sanitized), 0o644); writeErr != nil { - printer.StepWarn(fmt.Sprintf("Could not write redacted %s: %v", relPath, writeErr)) + out := result.Sanitized + if out == "" { + out = text + } + if writeErr := os.WriteFile(path, []byte(out), 0o644); writeErr != nil { + printer.StepWarn(fmt.Sprintf("Could not write sanitized %s: %v", relPath, writeErr)) } } return nil @@ -1454,10 +1460,10 @@ func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { return err } - if redacted > 0 { - printer.StepWarn(fmt.Sprintf("Redacted %d secret(s) from output files", redacted)) + if findingCount > 0 { + printer.StepWarn(fmt.Sprintf("Sanitized %d finding(s) in output files", findingCount)) } else { - printer.StepDone("Output files clean — no secrets found") + printer.StepDone("Output files clean — no issues found") } return nil } diff --git a/internal/cli/scan.go b/internal/cli/scan.go index ab290c0bf..c2886eb3d 100644 --- a/internal/cli/scan.go +++ b/internal/cli/scan.go @@ -160,8 +160,9 @@ func newScanOutputCmd() *cobra.Command { return &cobra.Command{ Use: "output", Short: "Scan agent output for leaked secrets before posting", - Long: `Reads text from stdin and scans for API keys, tokens, credentials, -and sensitive patterns. Outputs the redacted version to stdout. + Long: `Reads text from stdin, normalizes invisible Unicode characters, and +scans for API keys, tokens, credentials, and sensitive patterns. Outputs the +sanitized version to stdout. Usage in a workflow step: echo "$AGENT_OUTPUT" | fullsend scan output > safe_output.txt`, diff --git a/internal/security/hooks/posttool_chain_test.py b/internal/security/hooks/posttool_chain_test.py index a0ff78bae..68cf4d50f 100644 --- a/internal/security/hooks/posttool_chain_test.py +++ b/internal/security/hooks/posttool_chain_test.py @@ -16,12 +16,12 @@ PLAIN_PAT = "ghp_FAKEtesttoken000000000000000000000000" -def obfuscate_with_zwnj(text: str) -> str: - """Insert zero-width non-joiner (U+200C) between each character.""" - return "\u200c".join(text) +def obfuscate_with_char(text: str, char: str) -> str: + """Insert invisible character between each codepoint.""" + return char.join(text) -def run_hook(script: str, tool_result: str) -> tuple[int, str]: +def run_hook(script: str, tool_result: str) -> tuple[int, str, str]: proc = subprocess.run( [sys.executable, script], input=json.dumps({"tool_name": "Read", "tool_result": tool_result}), @@ -29,21 +29,21 @@ def run_hook(script: str, tool_result: str) -> tuple[int, str]: text=True, timeout=10, ) - return proc.returncode, proc.stdout + return proc.returncode, proc.stdout, proc.stderr def run_chain(tool_result: str) -> str: """Run unicode_posttool then secret_redact_posttool (correct sandbox order).""" - rc, stdout = run_hook(UNICODE_HOOK, tool_result) + rc, stdout, stderr = run_hook(UNICODE_HOOK, tool_result) if rc != 0: - raise RuntimeError(f"unicode hook failed: rc={rc}") + raise RuntimeError(f"unicode hook failed: rc={rc}, stderr={stderr}") if stdout.strip(): out = json.loads(stdout) tool_result = out["tool_result"] - rc, stdout = run_hook(SECRET_HOOK, tool_result) + rc, stdout, stderr = run_hook(SECRET_HOOK, tool_result) if rc != 0: - raise RuntimeError(f"secret_redact hook failed: rc={rc}") + raise RuntimeError(f"secret_redact hook failed: rc={rc}, stderr={stderr}") if stdout.strip(): out = json.loads(stdout) return out["tool_result"] @@ -57,14 +57,20 @@ def test_plain_pat_redacted_by_chain(self): self.assertIn("...", result) def test_zero_width_obfuscated_pat_redacted_by_chain(self): - obfuscated = obfuscate_with_zwnj(PLAIN_PAT) + obfuscated = obfuscate_with_char(PLAIN_PAT, "\u200c") + result = run_chain(obfuscated) + self.assertNotIn("ghp_FAKEtest", result) + self.assertIn("...", result) + + def test_ltr_mark_obfuscated_pat_redacted_by_chain(self): + obfuscated = obfuscate_with_char(PLAIN_PAT, "\u200e") result = run_chain(obfuscated) self.assertNotIn("ghp_FAKEtest", result) self.assertIn("...", result) def test_redact_alone_misses_zero_width_obfuscated_pat(self): - obfuscated = obfuscate_with_zwnj(PLAIN_PAT) - rc, stdout = run_hook(SECRET_HOOK, obfuscated) + obfuscated = obfuscate_with_char(PLAIN_PAT, "\u200c") + rc, stdout, _ = run_hook(SECRET_HOOK, obfuscated) self.assertEqual(rc, 0) # secret_redact alone does not modify output when regex cannot match self.assertEqual(stdout.strip(), "") diff --git a/internal/security/hooks/unicode_posttool.py b/internal/security/hooks/unicode_posttool.py index 98d38a9d6..044c8d4cc 100644 --- a/internal/security/hooks/unicode_posttool.py +++ b/internal/security/hooks/unicode_posttool.py @@ -39,7 +39,9 @@ ( "zero_width", "high", - re.compile("[\u200b-\u200d\ufeff\u00ad\u2060-\u2064]+"), + re.compile( + "[\u00ad\u034f\u180e\u200b-\u200f\ufeff\u2060-\u2064\u206a-\u206f\ufff9-\ufffb]+" + ), ), ( "bidi_override", @@ -129,6 +131,20 @@ def scan_text(text: str) -> tuple[str, list[dict]]: result = pattern.sub("", result) + # Supplementary variation selectors (VS17-VS256, U+E0100-U+E01EF). + supp_vs = [c for c in result if 0xE0100 <= ord(c) <= 0xE01EF] + if supp_vs: + findings.append( + { + "name": "variation_selector", + "severity": "medium", + "detail": ( + f"{len(supp_vs)} supplementary variation selector character(s) removed" + ), + } + ) + result = "".join(c for c in result if not (0xE0100 <= ord(c) <= 0xE01EF)) + # NFKC normalization (fullwidth -> ASCII, compatibility decomposition). nfkc = unicodedata.normalize("NFKC", result) if nfkc != result: diff --git a/internal/security/scanner_test.go b/internal/security/scanner_test.go index 39991af5c..8677649fc 100644 --- a/internal/security/scanner_test.go +++ b/internal/security/scanner_test.go @@ -312,6 +312,21 @@ func TestPipeline(t *testing.T) { assert.NotContains(t, r.Sanitized, "ghp_FAKEtest") }) + t.Run("normalize then redact catches LTR mark obfuscated PAT", func(t *testing.T) { + p := NewPipeline(NewUnicodeNormalizer(), NewSecretRedactor()) + plain := "ghp_FAKEtesttoken000000000000000000000000" + var obfuscated strings.Builder + for _, r := range plain { + obfuscated.WriteRune(r) + obfuscated.WriteRune('\u200e') + } + r := p.Scan(obfuscated.String()) + assert.False(t, r.Safe) + assert.True(t, hasFinding(r, "zero_width")) + assert.True(t, hasFinding(r, "github_pat")) + assert.NotContains(t, r.Sanitized, "ghp_FAKEtest") + }) + t.Run("clean text passes both", func(t *testing.T) { p := InputPipeline() r := p.Scan("Normal commit message fixing a null pointer bug.") diff --git a/internal/security/unicode.go b/internal/security/unicode.go index 2a1625301..0c5983bd7 100644 --- a/internal/security/unicode.go +++ b/internal/security/unicode.go @@ -22,9 +22,9 @@ func NewUnicodeNormalizer() *UnicodeNormalizer { func (u *UnicodeNormalizer) Name() string { return "unicode_normalizer" } var ( - // Zero-width and invisible characters. + // Zero-width and invisible format characters (aligned with unicode_posttool.py). reZeroWidth = regexp.MustCompile( - "[\u200B\u200C\u200D\uFEFF\u00AD\u2060-\u2064]+", + "[\u00AD\u034F\u180E\u200B-\u200F\u2060-\u2064\u206A-\u206F\uFEFF\uFFF9-\uFFFB]+", ) // Bidirectional override characters. @@ -32,16 +32,37 @@ var ( "[\u202A-\u202E\u2066-\u2069]+", ) - // ANSI escape sequences. - reANSI = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]`) + // ANSI CSI escape sequences (ECMA-48 compliant). + reANSI = regexp.MustCompile(`\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]`) + + // OSC escape sequences (terminal hyperlinks, title injection). + reOSC = regexp.MustCompile(`\x1b\][^\x1b\x07]*(?:\x1b\\|\x07)`) // Null bytes. reNull = regexp.MustCompile("\x00+") - // Variation selectors. + // BMP variation selectors (VS1-VS16). reVariation = regexp.MustCompile("[\uFE00-\uFE0F]+") ) +// stripTerminalEscapes removes ANSI CSI and OSC sequences from text. +func stripTerminalEscapes(text string) (string, int, int) { + current := text + ansiCount := 0 + oscCount := 0 + + if matches := reANSI.FindAllString(current, -1); len(matches) > 0 { + ansiCount = len(matches) + current = reANSI.ReplaceAllString(current, "") + } + if matches := reOSC.FindAllString(current, -1); len(matches) > 0 { + oscCount = len(matches) + current = reOSC.ReplaceAllString(current, "") + } + + return current, ansiCount, oscCount +} + func (u *UnicodeNormalizer) Scan(text string) ScanResult { result := ScanResult{Safe: true, Sanitized: text} current := text @@ -62,15 +83,25 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { current = reNull.ReplaceAllString(current, "") } - // ANSI escapes - if matches := reANSI.FindAllString(current, -1); len(matches) > 0 { - findings = append(findings, Finding{ - Scanner: "unicode_normalizer", - Name: "ansi_escape", - Severity: "medium", - Detail: fmt.Sprintf("%d ANSI escape sequences removed", len(matches)), - }) - current = reANSI.ReplaceAllString(current, "") + // ANSI and OSC escapes + if stripped, ansiCount, oscCount := stripTerminalEscapes(current); ansiCount > 0 || oscCount > 0 { + if ansiCount > 0 { + findings = append(findings, Finding{ + Scanner: "unicode_normalizer", + Name: "ansi_escape", + Severity: "medium", + Detail: fmt.Sprintf("%d ANSI escape sequences removed", ansiCount), + }) + } + if oscCount > 0 { + findings = append(findings, Finding{ + Scanner: "unicode_normalizer", + Name: "osc_escape", + Severity: "medium", + Detail: fmt.Sprintf("%d OSC escape sequences removed", oscCount), + }) + } + current = stripped } // Zero-width characters @@ -130,7 +161,7 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { current = tagStripped.String() } - // Variation selectors + // BMP variation selectors (VS1-VS16) if locs := reVariation.FindAllStringIndex(current, -1); len(locs) > 0 { count := 0 for _, loc := range locs { @@ -145,6 +176,26 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { current = reVariation.ReplaceAllString(current, "") } + // Supplementary variation selectors (VS17-VS256, U+E0100-U+E01EF). + var suppVSStripped strings.Builder + suppVSCount := 0 + for _, r := range current { + if r >= 0xE0100 && r <= 0xE01EF { + suppVSCount++ + } else { + suppVSStripped.WriteRune(r) + } + } + if suppVSCount > 0 { + findings = append(findings, Finding{ + Scanner: "unicode_normalizer", + Name: "variation_selector", + Severity: "medium", + Detail: fmt.Sprintf("%d supplementary variation selectors removed", suppVSCount), + }) + current = suppVSStripped.String() + } + // NFKC normalization (fullwidth -> ASCII, compatibility decomposition) nfkc := norm.NFKC.String(current) if nfkc != current { @@ -177,6 +228,27 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { Detail: fmt.Sprintf("NFKC normalization applied (%d characters affected)", diffCount), }) current = nfkc + + // NFKC can reconstruct escape sequences from fullwidth characters. + if stripped, ansiCount, oscCount := stripTerminalEscapes(current); ansiCount > 0 || oscCount > 0 { + if ansiCount > 0 { + findings = append(findings, Finding{ + Scanner: "unicode_normalizer", + Name: "ansi_escape", + Severity: "medium", + Detail: fmt.Sprintf("%d ANSI escape sequences removed (post-NFKC)", ansiCount), + }) + } + if oscCount > 0 { + findings = append(findings, Finding{ + Scanner: "unicode_normalizer", + Name: "osc_escape", + Severity: "medium", + Detail: fmt.Sprintf("%d OSC escape sequences removed (post-NFKC)", oscCount), + }) + } + current = stripped + } } result.Findings = findings From 2a0be483640b25485cbe4daa04edf5405fabe3f5 Mon Sep 17 00:00:00 2001 From: Barak Korren Date: Wed, 20 May 2026 09:12:20 +0300 Subject: [PATCH 3/5] style: ruff-format unicode_posttool.py Co-authored-by: Cursor Signed-off-by: Barak Korren --- internal/security/hooks/unicode_posttool.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/internal/security/hooks/unicode_posttool.py b/internal/security/hooks/unicode_posttool.py index 044c8d4cc..20d1dee9a 100644 --- a/internal/security/hooks/unicode_posttool.py +++ b/internal/security/hooks/unicode_posttool.py @@ -138,9 +138,7 @@ def scan_text(text: str) -> tuple[str, list[dict]]: { "name": "variation_selector", "severity": "medium", - "detail": ( - f"{len(supp_vs)} supplementary variation selector character(s) removed" - ), + "detail": (f"{len(supp_vs)} supplementary variation selector character(s) removed"), } ) result = "".join(c for c in result if not (0xE0100 <= ord(c) <= 0xE01EF)) From 0cb6fa3a8c46d709a4b37314790727da58633426 Mon Sep 17 00:00:00 2001 From: Barak Korren Date: Tue, 26 May 2026 11:45:24 +0300 Subject: [PATCH 4/5] fix(security): address post-rebase review feedback Only fall back to original text when the pipeline reports no findings, use single-pass terminal escape stripping, and add wrong-order pipeline test. Co-authored-by: Cursor Signed-off-by: Barak Korren --- internal/cli/run.go | 4 +++- internal/cli/scan.go | 2 +- internal/security/scanner_test.go | 13 +++++++++++++ internal/security/unicode.go | 19 ++++++++----------- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/internal/cli/run.go b/internal/cli/run.go index 7775ee404..eac07dd7c 100644 --- a/internal/cli/run.go +++ b/internal/cli/run.go @@ -1447,7 +1447,9 @@ func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { }) } out := result.Sanitized - if out == "" { + // Only fall back when the pipeline made no changes; empty Sanitized with + // findings means content was fully stripped (e.g. all invisible chars). + if out == "" && len(result.Findings) == 0 { out = text } if writeErr := os.WriteFile(path, []byte(out), 0o644); writeErr != nil { diff --git a/internal/cli/scan.go b/internal/cli/scan.go index c2886eb3d..1c7ba6884 100644 --- a/internal/cli/scan.go +++ b/internal/cli/scan.go @@ -196,7 +196,7 @@ Usage in a workflow step: printer.StepWarn(fmt.Sprintf(" %s: %s", f.Name, f.Detail)) } out := result.Sanitized - if out == "" { + if out == "" && len(result.Findings) == 0 { out = text } fmt.Fprint(os.Stdout, out) diff --git a/internal/security/scanner_test.go b/internal/security/scanner_test.go index 8677649fc..d72cd18d1 100644 --- a/internal/security/scanner_test.go +++ b/internal/security/scanner_test.go @@ -327,6 +327,19 @@ func TestPipeline(t *testing.T) { assert.NotContains(t, r.Sanitized, "ghp_FAKEtest") }) + t.Run("wrong order leaks zero-width obfuscated PAT", func(t *testing.T) { + p := NewPipeline(NewSecretRedactor(), NewUnicodeNormalizer()) + plain := "ghp_FAKEtesttoken000000000000000000000000" + var obfuscated strings.Builder + for _, r := range plain { + obfuscated.WriteRune(r) + obfuscated.WriteRune('\u200c') + } + r := p.Scan(obfuscated.String()) + assert.True(t, hasFinding(r, "zero_width")) + assert.False(t, hasFinding(r, "github_pat"), "redact-before-normalize must not catch obfuscated token") + }) + t.Run("clean text passes both", func(t *testing.T) { p := InputPipeline() r := p.Scan("Normal commit message fixing a null pointer bug.") diff --git a/internal/security/unicode.go b/internal/security/unicode.go index 0c5983bd7..c3fc078ea 100644 --- a/internal/security/unicode.go +++ b/internal/security/unicode.go @@ -47,19 +47,16 @@ var ( // stripTerminalEscapes removes ANSI CSI and OSC sequences from text. func stripTerminalEscapes(text string) (string, int, int) { - current := text ansiCount := 0 + current := reANSI.ReplaceAllStringFunc(text, func(string) string { + ansiCount++ + return "" + }) oscCount := 0 - - if matches := reANSI.FindAllString(current, -1); len(matches) > 0 { - ansiCount = len(matches) - current = reANSI.ReplaceAllString(current, "") - } - if matches := reOSC.FindAllString(current, -1); len(matches) > 0 { - oscCount = len(matches) - current = reOSC.ReplaceAllString(current, "") - } - + current = reOSC.ReplaceAllStringFunc(current, func(string) string { + oscCount++ + return "" + }) return current, ansiCount, oscCount } From a4f164e8741012461019224ff7c41c261a8eac4e Mon Sep 17 00:00:00 2001 From: Barak Korren Date: Tue, 26 May 2026 21:10:28 +0300 Subject: [PATCH 5/5] fix(security): address latest PR review on unicode normalization Remove dead Sanitized fallback guards, expand invisible-character coverage (U+061C, line/paragraph separators, Cf sweep), generalize ST-terminated escape stripping, and add wrong-order/fullwidth chain tests. Signed-off-by: Barak Korren Co-authored-by: Cursor --- internal/cli/run.go | 6 +-- internal/cli/scan.go | 7 +-- .../security/hooks/posttool_chain_test.py | 41 ++++++++++++++++ internal/security/hooks/unicode_posttool.py | 9 ++-- internal/security/scanner_test.go | 33 +++++++++++++ internal/security/unicode.go | 49 +++++++++++++------ 6 files changed, 117 insertions(+), 28 deletions(-) diff --git a/internal/cli/run.go b/internal/cli/run.go index eac07dd7c..d1b2b986d 100644 --- a/internal/cli/run.go +++ b/internal/cli/run.go @@ -1446,12 +1446,8 @@ func scanOutputFiles(outputDir, traceID string, printer *ui.Printer) error { Finding: f, }) } + // Sanitized may be empty when all content was invisible characters. out := result.Sanitized - // Only fall back when the pipeline made no changes; empty Sanitized with - // findings means content was fully stripped (e.g. all invisible chars). - if out == "" && len(result.Findings) == 0 { - out = text - } if writeErr := os.WriteFile(path, []byte(out), 0o644); writeErr != nil { printer.StepWarn(fmt.Sprintf("Could not write sanitized %s: %v", relPath, writeErr)) } diff --git a/internal/cli/scan.go b/internal/cli/scan.go index 1c7ba6884..4b96f3157 100644 --- a/internal/cli/scan.go +++ b/internal/cli/scan.go @@ -195,11 +195,8 @@ Usage in a workflow step: for _, f := range result.Findings { printer.StepWarn(fmt.Sprintf(" %s: %s", f.Name, f.Detail)) } - out := result.Sanitized - if out == "" && len(result.Findings) == 0 { - out = text - } - fmt.Fprint(os.Stdout, out) + // Sanitized may be empty when all content was invisible characters. + fmt.Fprint(os.Stdout, result.Sanitized) } else { printer.StepDone("No secrets found in output") fmt.Fprint(os.Stdout, text) diff --git a/internal/security/hooks/posttool_chain_test.py b/internal/security/hooks/posttool_chain_test.py index 68cf4d50f..d52b1ddb8 100644 --- a/internal/security/hooks/posttool_chain_test.py +++ b/internal/security/hooks/posttool_chain_test.py @@ -32,6 +32,36 @@ def run_hook(script: str, tool_result: str) -> tuple[int, str, str]: return proc.returncode, proc.stdout, proc.stderr +def run_wrong_chain(tool_result: str) -> str: + """Run secret_redact then unicode (wrong sandbox order — leaks obfuscated tokens).""" + rc, stdout, stderr = run_hook(SECRET_HOOK, tool_result) + if rc != 0: + raise RuntimeError(f"secret_redact hook failed: rc={rc}, stderr={stderr}") + if stdout.strip(): + out = json.loads(stdout) + tool_result = out["tool_result"] + + rc, stdout, stderr = run_hook(UNICODE_HOOK, tool_result) + if rc != 0: + raise RuntimeError(f"unicode hook failed: rc={rc}, stderr={stderr}") + if stdout.strip(): + out = json.loads(stdout) + return out["tool_result"] + return tool_result + + +def to_fullwidth_ascii(text: str) -> str: + """Convert printable ASCII to fullwidth compatibility forms.""" + out: list[str] = [] + for c in text: + o = ord(c) + if 0x21 <= o <= 0x7E: + out.append(chr(o + 0xFF00 - 0x20)) + else: + out.append(c) + return "".join(out) + + def run_chain(tool_result: str) -> str: """Run unicode_posttool then secret_redact_posttool (correct sandbox order).""" rc, stdout, stderr = run_hook(UNICODE_HOOK, tool_result) @@ -77,6 +107,17 @@ def test_redact_alone_misses_zero_width_obfuscated_pat(self): # Obfuscated token still present in source (would leak after unicode strips ZWNJ) self.assertIn("\u200c", obfuscated) + def test_wrong_order_chain_leaks_obfuscated_pat(self): + obfuscated = obfuscate_with_char(PLAIN_PAT, "\u200c") + result = run_wrong_chain(obfuscated) + self.assertIn("ghp_FAKEtest", result) + + def test_fullwidth_obfuscated_pat_redacted_by_chain(self): + fullwidth = to_fullwidth_ascii(PLAIN_PAT) + result = run_chain(fullwidth) + self.assertNotIn("ghp_FAKEtest", result) + self.assertIn("...", result) + if __name__ == "__main__": unittest.main() diff --git a/internal/security/hooks/unicode_posttool.py b/internal/security/hooks/unicode_posttool.py index 20d1dee9a..ba9418aee 100644 --- a/internal/security/hooks/unicode_posttool.py +++ b/internal/security/hooks/unicode_posttool.py @@ -40,7 +40,8 @@ "zero_width", "high", re.compile( - "[\u00ad\u034f\u180e\u200b-\u200f\ufeff\u2060-\u2064\u206a-\u206f\ufff9-\ufffb]+" + "[\u00ad\u034f\u061c\u0600-\u0605\u070f\u0890-\u0891\u08e2\u180e" + "\u200b-\u200f\u2028\u2029\u2060-\u2064\u206a-\u206f\ufeff\ufff9-\ufffb]+" ), ), ( @@ -59,13 +60,13 @@ "medium", re.compile(r"\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]"), ), - # OSC: terminal hyperlinks, window title injection. + # ST-terminated: OSC (ESC ]), DCS (ESC P), APC (ESC _), PM (ESC ^). # Uses negated class [^\x1b\x07]* instead of .*? to avoid O(n^2) - # backtracking on dense unterminated ESC] sequences. + # backtracking on dense unterminated sequences. ( "osc_escape", "medium", - re.compile(r"\x1b\][^\x1b\x07]*(?:\x1b\\|\x07)"), + re.compile(r"\x1b[\]P_^][^\x1b\x07]*(?:\x1b\\|\x07)"), ), ( "null_byte", diff --git a/internal/security/scanner_test.go b/internal/security/scanner_test.go index d72cd18d1..0b627c1c9 100644 --- a/internal/security/scanner_test.go +++ b/internal/security/scanner_test.go @@ -340,6 +340,39 @@ func TestPipeline(t *testing.T) { assert.False(t, hasFinding(r, "github_pat"), "redact-before-normalize must not catch obfuscated token") }) + t.Run("normalize then redact catches ALM obfuscated PAT", func(t *testing.T) { + p := NewPipeline(NewUnicodeNormalizer(), NewSecretRedactor()) + plain := "ghp_FAKEtesttoken000000000000000000000000" + var obfuscated strings.Builder + for _, r := range plain { + obfuscated.WriteRune(r) + obfuscated.WriteRune('\u061c') + } + r := p.Scan(obfuscated.String()) + assert.False(t, r.Safe) + assert.True(t, hasFinding(r, "zero_width")) + assert.True(t, hasFinding(r, "github_pat")) + assert.NotContains(t, r.Sanitized, "ghp_FAKEtest") + }) + + t.Run("normalize then redact catches fullwidth obfuscated PAT", func(t *testing.T) { + p := NewPipeline(NewUnicodeNormalizer(), NewSecretRedactor()) + plain := "ghp_FAKEtesttoken000000000000000000000000" + var fullwidth strings.Builder + for _, r := range plain { + if r >= 0x21 && r <= 0x7e { + fullwidth.WriteRune(r + 0xff00 - 0x20) + } else { + fullwidth.WriteRune(r) + } + } + r := p.Scan(fullwidth.String()) + assert.False(t, r.Safe) + assert.True(t, hasFinding(r, "fullwidth")) + assert.True(t, hasFinding(r, "github_pat")) + assert.NotContains(t, r.Sanitized, "ghp_FAKEtest") + }) + t.Run("clean text passes both", func(t *testing.T) { p := InputPipeline() r := p.Scan("Normal commit message fixing a null pointer bug.") diff --git a/internal/security/unicode.go b/internal/security/unicode.go index c3fc078ea..ffda77941 100644 --- a/internal/security/unicode.go +++ b/internal/security/unicode.go @@ -4,6 +4,7 @@ import ( "fmt" "regexp" "strings" + "unicode" "unicode/utf8" "golang.org/x/text/unicode/norm" @@ -24,7 +25,7 @@ func (u *UnicodeNormalizer) Name() string { return "unicode_normalizer" } var ( // Zero-width and invisible format characters (aligned with unicode_posttool.py). reZeroWidth = regexp.MustCompile( - "[\u00AD\u034F\u180E\u200B-\u200F\u2060-\u2064\u206A-\u206F\uFEFF\uFFF9-\uFFFB]+", + "[\u00AD\u034F\u061C\u0600-\u0605\u070F\u0890-\u0891\u08E2\u180E\u200B-\u200F\u2028\u2029\u2060-\u2064\u206A-\u206F\uFEFF\uFFF9-\uFFFB]+", ) // Bidirectional override characters. @@ -35,8 +36,8 @@ var ( // ANSI CSI escape sequences (ECMA-48 compliant). reANSI = regexp.MustCompile(`\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]`) - // OSC escape sequences (terminal hyperlinks, title injection). - reOSC = regexp.MustCompile(`\x1b\][^\x1b\x07]*(?:\x1b\\|\x07)`) + // ST-terminated escape sequences: OSC (ESC ]), DCS (ESC P), APC (ESC _), PM (ESC ^). + reSTTerminated = regexp.MustCompile(`\x1b[\]P_^][^\x1b\x07]*(?:\x1b\\|\x07)`) // Null bytes. reNull = regexp.MustCompile("\x00+") @@ -52,12 +53,12 @@ func stripTerminalEscapes(text string) (string, int, int) { ansiCount++ return "" }) - oscCount := 0 - current = reOSC.ReplaceAllStringFunc(current, func(string) string { - oscCount++ + stCount := 0 + current = reSTTerminated.ReplaceAllStringFunc(current, func(string) string { + stCount++ return "" }) - return current, ansiCount, oscCount + return current, ansiCount, stCount } func (u *UnicodeNormalizer) Scan(text string) ScanResult { @@ -80,8 +81,8 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { current = reNull.ReplaceAllString(current, "") } - // ANSI and OSC escapes - if stripped, ansiCount, oscCount := stripTerminalEscapes(current); ansiCount > 0 || oscCount > 0 { + // ANSI and ST-terminated escapes (OSC, DCS, APC, PM). + if stripped, ansiCount, stCount := stripTerminalEscapes(current); ansiCount > 0 || stCount > 0 { if ansiCount > 0 { findings = append(findings, Finding{ Scanner: "unicode_normalizer", @@ -90,12 +91,12 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { Detail: fmt.Sprintf("%d ANSI escape sequences removed", ansiCount), }) } - if oscCount > 0 { + if stCount > 0 { findings = append(findings, Finding{ Scanner: "unicode_normalizer", Name: "osc_escape", Severity: "medium", - Detail: fmt.Sprintf("%d OSC escape sequences removed", oscCount), + Detail: fmt.Sprintf("%d ST-terminated escape sequences removed", stCount), }) } current = stripped @@ -193,6 +194,26 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { current = suppVSStripped.String() } + // Remaining Cf (format) category characters (e.g. U+061C Arabic Letter Mark). + var cfStripped strings.Builder + cfCount := 0 + for _, r := range current { + if unicode.Is(unicode.Cf, r) { + cfCount++ + } else { + cfStripped.WriteRune(r) + } + } + if cfCount > 0 { + findings = append(findings, Finding{ + Scanner: "unicode_normalizer", + Name: "zero_width", + Severity: "high", + Detail: fmt.Sprintf("%d format (Cf) characters removed", cfCount), + }) + current = cfStripped.String() + } + // NFKC normalization (fullwidth -> ASCII, compatibility decomposition) nfkc := norm.NFKC.String(current) if nfkc != current { @@ -227,7 +248,7 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { current = nfkc // NFKC can reconstruct escape sequences from fullwidth characters. - if stripped, ansiCount, oscCount := stripTerminalEscapes(current); ansiCount > 0 || oscCount > 0 { + if stripped, ansiCount, stCount := stripTerminalEscapes(current); ansiCount > 0 || stCount > 0 { if ansiCount > 0 { findings = append(findings, Finding{ Scanner: "unicode_normalizer", @@ -236,12 +257,12 @@ func (u *UnicodeNormalizer) Scan(text string) ScanResult { Detail: fmt.Sprintf("%d ANSI escape sequences removed (post-NFKC)", ansiCount), }) } - if oscCount > 0 { + if stCount > 0 { findings = append(findings, Finding{ Scanner: "unicode_normalizer", Name: "osc_escape", Severity: "medium", - Detail: fmt.Sprintf("%d OSC escape sequences removed (post-NFKC)", oscCount), + Detail: fmt.Sprintf("%d ST-terminated escape sequences removed (post-NFKC)", stCount), }) } current = stripped