From 033a77e88bbb5e4c05ee4deec97faacd345bee96 Mon Sep 17 00:00:00 2001 From: lbruton Date: Sun, 14 Jun 2026 18:19:11 -0500 Subject: [PATCH 1/3] test(SESF-44): add Tier-2 ASSIGNMENT precision RED tests (B.1-B.3) Cover the author*/authority/co-author FP family, $(...) command-sub and ${...} env-interpolation FP values, and genuine-key true positives (incl. bare oauth) through both redact() and scan_spans(). 10 RED, 8 green. --- tests/test_secret_redaction.py | 119 +++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/tests/test_secret_redaction.py b/tests/test_secret_redaction.py index ce4d313..ceb320d 100644 --- a/tests/test_secret_redaction.py +++ b/tests/test_secret_redaction.py @@ -854,3 +854,122 @@ def test_redact_public_contract_unchanged(): assert isinstance(hits, list) assert hits assert tuple(type(hits[0])._fields) == ("rule_name", "tier") + + +# === SESF-44 — Tier-2 ASSIGNMENT precision ================================== +# +# Two FP classes the SESF-42 live baseline surfaced: +# * the bare `auth` keyword substring-matching the `author*`/`authority`/ +# co-author family (D-A: drop bare `auth`, add a left-anchored auth family); +# * command-substitution `$(…)` and env-interpolation `${…}` values that carry +# no literal secret (D-B: reject them in the Tier-2 branch). +# Every case is asserted through BOTH public paths -- `redact()` (Hit) AND +# `scan_spans()` (Span) -- because both route through the shared +# `_aggregate_maskable_candidates` seam (AC-6/AC-7). All tokens are synthetic. + +# A synthetic literal value: 16 chars (< _ENTROPY_MIN_LEN=20, so it is NOT a +# Tier-3 entropy candidate -- isolating the Tier-2 behavior under test) and >= 8 +# chars with mixed case+digits so it passes `_is_placeholder_value`. +SESF44_LITERAL = "Hunter2" + "Swordfish" + + +def _has_tier2_hit(line): + """True if `redact` reports a Tier-2 Hit for `line`.""" + _, hits = redact(line, mode="enforce") + return any(h.tier == 2 for h in hits) + + +def _has_assignment_span(line): + """True if `scan_spans` reports an ASSIGNMENT span for `line`.""" + _, spans = secret_redaction.scan_spans(line, mode="report") + return any(s.rule_name == "ASSIGNMENT" for s in spans) + + +# AC-1: the full reconciled author*/authority/co-author family. Each is flagged +# on current `main` (bare `auth` substring) -> RED until D-1 lands. +SESF44_AUTHOR_FAMILY = [ + "author", + "authored", + "authoredDate", + "authorAssociation", + "authority", + "gradingAuthority", + "coauthor", + "coAuthoredBy", +] + + +@pytest.mark.parametrize("key", SESF44_AUTHOR_FAMILY) +def test_sesf44_ac1_author_family_not_flagged(key): + """AC-1: author*/authority/co-author keys are NOT Tier-2 flagged (both paths). + + Value is an 8+ char literal that passes the value guard, so suppression is + proven by KEY shape (not value length). RED on current `main`. + """ + line = f'{{"{key}": "{SESF44_LITERAL}"}}' + out, hits = redact(line, mode="enforce") + assert not any(h.tier == 2 for h in hits), f"{key} should not be a Tier-2 hit" + assert SESF44_LITERAL in out # value left intact (not masked) + assert not _has_assignment_span(line), f"{key} should yield no ASSIGNMENT span" + + +def test_sesf44_ac3_command_substitution_not_flagged(): + """AC-3: a `$(…)` command-substitution value is not flagged (both paths). + + `export FOO_API_KEY=$(security …)` captures value `$(security` -- flagged on + `main` (passes the value guard) -> RED until D-2's value-start check lands. + """ + line = "export FOO_API_KEY=$(security find-generic-password svc)" + out, hits = redact(line, mode="enforce") + assert not any(h.tier == 2 for h in hits) + assert "$(security" in out # command-sub left intact + assert not _has_assignment_span(line) + + +def test_sesf44_ac4_env_interpolation_not_flagged(): + """AC-4: a `${VAR:-default}` interpolation is not flagged (both paths). + + `x=${FOO_PASSWORD:-default}` matches key `FOO_PASSWORD`, value `-default` -- + the `${` precedes the key, so a value-shape check alone misses it. RED until + D-2's interpolation-span check lands. + """ + line = "x=${FOO_PASSWORD:-default}" + out, hits = redact(line, mode="enforce") + assert not any(h.tier == 2 for h in hits) + assert not _has_assignment_span(line) + # Sanity: a pure `${VAR}` value (no default) is already unflagged (the value + # capture stops at `{`, leaving a sub-minimum value) -- must stay unflagged. + pure = "MY_SECRET=${VALUE}" + assert not _has_tier2_hit(pure) + assert not _has_assignment_span(pure) + + +# AC-2 / AC-5: genuine secret-bearing keys with a literal value STILL flag. +# Bare `oauth` is required explicitly -- it is the exact key the D-1 left +# lookbehind targets. These are positive controls: green on `main` AND after C. +SESF44_TRUE_POSITIVE_KEYS = [ + "authorization", + "oauth", # bare -- D-1 lookbehind probe + "oauth_token", + "auth_token", + "auth-token", + "authToken", + "FOO_API_KEY", +] + + +@pytest.mark.parametrize("key", SESF44_TRUE_POSITIVE_KEYS) +def test_sesf44_ac2_ac5_genuine_keys_still_flagged(key): + """AC-2/AC-5: genuine secret keys with a literal value STILL flag (both paths).""" + line = f'{key}="{SESF44_LITERAL}"' + out, hits = redact(line, mode="enforce") + assert any(h.tier == 2 for h in hits), f"{key} should still be a Tier-2 hit" + assert SESF44_LITERAL not in out # value masked + assert _has_assignment_span(line), f"{key} should still yield an ASSIGNMENT span" + + +def test_sesf44_ac2_placeholder_value_still_skipped(): + """AC-2 consistency: a genuine key with a placeholder value stays unflagged.""" + line = "authorization=none" + assert not _has_tier2_hit(line) + assert not _has_assignment_span(line) From 84d9f53fb50a204a9e688c139df11fb3e78bdd56 Mon Sep 17 00:00:00 2001 From: lbruton Date: Sun, 14 Jun 2026 18:21:46 -0500 Subject: [PATCH 2/3] fix(SESF-44): tighten Tier-2 auth keyword + reject non-literal values D-1: drop bare `auth` (substring-matched author*/authority/co-author git metadata); add left-anchored `(?[A-Za-z0-9_\-]*(?:" + _SECRET_KEYWORD + r")[A-Za-z0-9_\-]*)" @@ -110,6 +117,12 @@ ) _KEYWORD_RE = re.compile(r"(?i)(?:" + _SECRET_KEYWORD + r")") +# SESF-44: env-interpolation spans ``${…}`` carry no literal secret. A Tier-2 +# match whose key/value falls inside such a span (e.g. ``${FOO_PASSWORD:-default}``) +# is a non-literal and is dropped. Non-greedy/no-nesting by design (``[^}]*``); +# nested ``${a${b}}`` is a rare edge that is intentionally not matched. +_INTERPOLATION_SPAN_RE = re.compile(r"\$\{[^}]*\}") + # Value-shape guard (AC-7): reject placeholder shapes / sub-minimum-length values. _MIN_VALUE_LEN = 8 _PLACEHOLDER_WORDS = { @@ -199,6 +212,37 @@ def _is_placeholder_value(value: str) -> bool: return False +def _is_nonliteral_assignment(line: str, match: "re.Match[str]") -> bool: + """Return True if a Tier-2 assignment is a shell/env non-literal, not a secret (SESF-44). + + Two shapes carry no literal credential and are dropped before a Tier-2 + ASSIGNMENT candidate is recorded: + + * the captured value is a command substitution ``$(…)`` or begins an + interpolation ``${…}`` (e.g. ``KEY=$(security …)`` captures ``$(security``); + * the matched key/value falls inside a ``${…}`` interpolation span on the line + (e.g. ``x=${FOO_PASSWORD:-default}`` matches key ``FOO_PASSWORD`` with value + ``-default`` — the ``${`` precedes the key, so a value-shape check alone + misses it). + + Args: + line: The full line the Tier-2 scanner is processing. + match: The :data:`_ASSIGN_RE` match whose key/value are under evaluation. + + Returns: + True if the assignment is a non-literal (command substitution / + interpolation) and must not be reported as a secret. + """ + value = match.group("value") + if value.startswith("$(") or value.startswith("${"): + return True + start, end = match.start(), match.end() + return any( + span.start() < end and start < span.end() + for span in _INTERPOLATION_SPAN_RE.finditer(line) + ) + + def _is_structurally_allowlisted(value: str) -> bool: """Return True if ``value`` is a structurally non-secret shape (AC-9). @@ -325,6 +369,9 @@ def _collect_candidates(text: str) -> list[tuple[str, str, int, bool]]: value = match.group("value") if _is_placeholder_value(value): continue + # SESF-44: drop command-substitution / env-interpolation matches. + if _is_nonliteral_assignment(line, match): + continue candidates.append((value, "ASSIGNMENT", 2, True)) return candidates diff --git a/tests/test_secret_redaction.py b/tests/test_secret_redaction.py index ceb320d..fa5595a 100644 --- a/tests/test_secret_redaction.py +++ b/tests/test_secret_redaction.py @@ -934,7 +934,7 @@ def test_sesf44_ac4_env_interpolation_not_flagged(): D-2's interpolation-span check lands. """ line = "x=${FOO_PASSWORD:-default}" - out, hits = redact(line, mode="enforce") + _, hits = redact(line, mode="enforce") assert not any(h.tier == 2 for h in hits) assert not _has_assignment_span(line) # Sanity: a pure `${VAR}` value (no default) is already unflagged (the value From 079d327274efb168b79c2e588b212c879f9f52ae Mon Sep 17 00:00:00 2001 From: lbruton Date: Sun, 14 Jun 2026 19:06:44 -0500 Subject: [PATCH 3/3] fix(SESF-44): address PR #42 review feedback - Remove unreachable `value.startswith("${")` branch in _is_nonliteral_assignment (the value capture class excludes `{`, so ${ can never start a captured value; interpolation is handled by the span-overlap check). [codacy 3410428483] - Add `# noqa: S105` on the keyword string so CodeRabbit's S-enabled ruff passes (project ruff is D1-only; this is keyword NAMES, not a credential). [coderabbit 3410431432] - Tests: add shell-form parity to the AC-1 author-family cases, a quoted command-substitution case, and a ${VAR:?error} interpolation case. [codacy 3410428669] 378 passed, ruff clean (incl. S105). --- secret_redaction.py | 12 ++++++++---- tests/test_secret_redaction.py | 20 +++++++++++++++----- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/secret_redaction.py b/secret_redaction.py index c2c07e2..75913b0 100644 --- a/secret_redaction.py +++ b/secret_redaction.py @@ -106,7 +106,7 @@ # positive). The auth family is now a left-anchored segment (negative # lookbehind `(? bool: Two shapes carry no literal credential and are dropped before a Tier-2 ASSIGNMENT candidate is recorded: - * the captured value is a command substitution ``$(…)`` or begins an - interpolation ``${…}`` (e.g. ``KEY=$(security …)`` captures ``$(security``); + * the captured value is a command substitution ``$(…)`` (e.g. + ``KEY=$(security …)`` captures ``$(security``); * the matched key/value falls inside a ``${…}`` interpolation span on the line (e.g. ``x=${FOO_PASSWORD:-default}`` matches key ``FOO_PASSWORD`` with value ``-default`` — the ``${`` precedes the key, so a value-shape check alone misses it). + A captured value can never itself start with ``${``: ``_ASSIGN_RE`` excludes + ``{`` from the value class, so the interpolation case is handled solely by the + span-overlap check below, not by a value-prefix test. + Args: line: The full line the Tier-2 scanner is processing. match: The :data:`_ASSIGN_RE` match whose key/value are under evaluation. @@ -234,7 +238,7 @@ def _is_nonliteral_assignment(line: str, match: "re.Match[str]") -> bool: interpolation) and must not be reported as a secret. """ value = match.group("value") - if value.startswith("$(") or value.startswith("${"): + if value.startswith("$("): return True start, end = match.start(), match.end() return any( diff --git a/tests/test_secret_redaction.py b/tests/test_secret_redaction.py index fa5595a..c3a8f54 100644 --- a/tests/test_secret_redaction.py +++ b/tests/test_secret_redaction.py @@ -906,11 +906,13 @@ def test_sesf44_ac1_author_family_not_flagged(key): Value is an 8+ char literal that passes the value guard, so suppression is proven by KEY shape (not value length). RED on current `main`. """ - line = f'{{"{key}": "{SESF44_LITERAL}"}}' - out, hits = redact(line, mode="enforce") - assert not any(h.tier == 2 for h in hits), f"{key} should not be a Tier-2 hit" - assert SESF44_LITERAL in out # value left intact (not masked) - assert not _has_assignment_span(line), f"{key} should yield no ASSIGNMENT span" + # Both JSON-style and shell-style assignment forms (the live FPs appeared as + # `{"author": "..."}` PR JSON *and* shell `author=...`). + for line in (f'{{"{key}": "{SESF44_LITERAL}"}}', f"{key}={SESF44_LITERAL}"): + out, hits = redact(line, mode="enforce") + assert not any(h.tier == 2 for h in hits), f"{key!r} in {line!r}: unexpected Tier-2 hit" + assert SESF44_LITERAL in out # value left intact (not masked) + assert not _has_assignment_span(line), f"{key!r} in {line!r}: unexpected ASSIGNMENT span" def test_sesf44_ac3_command_substitution_not_flagged(): @@ -924,6 +926,10 @@ def test_sesf44_ac3_command_substitution_not_flagged(): assert not any(h.tier == 2 for h in hits) assert "$(security" in out # command-sub left intact assert not _has_assignment_span(line) + # Quoted variant: KEY="$(...)" — the captured value still begins `$(`. + quoted = 'FOO_API_KEY="$(security find-generic-password svc)"' + assert not _has_tier2_hit(quoted) + assert not _has_assignment_span(quoted) def test_sesf44_ac4_env_interpolation_not_flagged(): @@ -942,6 +948,10 @@ def test_sesf44_ac4_env_interpolation_not_flagged(): pure = "MY_SECRET=${VALUE}" assert not _has_tier2_hit(pure) assert not _has_assignment_span(pure) + # The `${VAR:?error}` (error-if-unset) operator is the same interpolation span. + err = "x=${FOO_TOKEN:?missing}" + assert not _has_tier2_hit(err) + assert not _has_assignment_span(err) # AC-2 / AC-5: genuine secret-bearing keys with a literal value STILL flag.