diff --git a/automem/utils/entity_quality.py b/automem/utils/entity_quality.py index 9cd1177..513377f 100644 --- a/automem/utils/entity_quality.py +++ b/automem/utils/entity_quality.py @@ -16,6 +16,10 @@ "projects": "projects", "concept": "concepts", "concepts": "concepts", + "event": "events", + "events": "events", + "opportunity": "opportunities", + "opportunities": "opportunities", } _ALLOWED_CATEGORIES = set(_CATEGORY_ALIASES.values()) @@ -142,7 +146,6 @@ _MARKDOWN_OR_CODE_TOKENS = { "bin", - "code", "config", "env", "file", @@ -159,10 +162,13 @@ "yml", } +# "code" is only a weak signal: real tool names end in it (claude-code, +# vs-code, code-server), so it must not condemn a slug on its own. _MARKDOWN_OR_CODE_SECONDARY_TOKENS = { "api", "bash", "cli", + "code", "css", "dockerfile", "html", @@ -175,6 +181,26 @@ "xml", } +# Everyday non-name words that show up in generated people slugs +# (bottom-line, deck-today, email-highlights, claude-desktop). Any one of +# these tokens disqualifies a people slug; none are plausible name parts. +_NON_PERSON_COMMON_TOKENS = { + "bottom", + "chrome", + "deck", + "desktop", + "email", + "emails", + "highlight", + "highlights", + "line", + "plugin", + "plugins", + "today", + "tomorrow", + "yesterday", +} + _NON_PERSON_TECH_TOKENS = { "api", "app", @@ -186,6 +212,7 @@ "docker", "hub", "model", + "pipeline", "platform", "sdk", "service", @@ -438,6 +465,12 @@ def _looks_tool_or_org_like(value: str, slug: str, context: Optional[str]) -> bo if parts and any(parts[-1].endswith(suffix) for suffix in _TOOL_OR_ORG_SUFFIXES): return True + # Context hints are too weak to condemn a multi-token person-shaped name: + # in a technical corpus nearly every memory mentions data/projects/tools, + # which would reject virtually every real person discussed at work. + if " " in (value or "").strip() and len(parts) >= 2 and _has_person_name_shape(parts): + return False + lowered_context = (context or "").lower() if lowered_context and slug in lowered_context.replace(" ", "-"): return any(hint in lowered_context for hint in _TOOL_OR_ORG_CONTEXT_HINTS) @@ -586,6 +619,7 @@ def reject(reason: str) -> EntityValidationResult: or token in _MARKDOWN_OR_CODE_TOKENS or token in _MARKDOWN_OR_CODE_SECONDARY_TOKENS or token in _NON_PERSON_TECH_TOKENS + or token in _NON_PERSON_COMMON_TOKENS for token in tokens ): return reject("low_signal_people_slug") diff --git a/tests/test_entity_quality.py b/tests/test_entity_quality.py index 8965e37..3a91af3 100644 --- a/tests/test_entity_quality.py +++ b/tests/test_entity_quality.py @@ -91,6 +91,7 @@ def test_structural_noise_slugs_are_rejected( ("concepts", "before-after"), ("people", "docker-compose"), ("people", "complete-deliverable"), + ("people", "youtube-pipeline"), ], ) def test_generic_and_tooling_noise_is_rejected(category: str, slug: str) -> None: @@ -189,3 +190,112 @@ def test_single_token_specific_entities_do_not_require_a_curated_allowlist( assert result.accepted is True assert result.canonical_slug == slug + + +@pytest.mark.parametrize("value", ["Mara Quinn", "Tobias Lehman", "Priya J Raman"]) +def test_multi_token_person_names_survive_technical_context(value: str) -> None: + """A real person discussed in technical content is still a person. + + The context-hint branch must not reject valid person-shaped names just + because the surrounding memory mentions data/projects/platform/tooling — + in an engineering corpus nearly every memory does. + """ + from automem.utils.entity_quality import validate_entity_value + + context = ( + f"Met with {value} about the data pipeline project; the platform " + "tooling and database service migration are on track." + ) + result = validate_entity_value("people", value, context=context) + + assert result.accepted is True + + +def test_single_token_brandlike_people_still_rejected_in_tool_context() -> None: + from automem.utils.entity_quality import validate_entity_value + + context = "Deployed memvault to the platform; the data pipeline project uses it." + result = validate_entity_value("people", "memvault", context=context) + + assert result.accepted is False + assert result.reason == "tool_or_organization_looking_people" + + +def test_tool_or_org_suffix_people_rejected_even_with_person_shape() -> None: + from automem.utils.entity_quality import validate_entity_value + + context = "GrowthMath specializes in B2B SaaS analytics." + result = validate_entity_value("people", "growthmath", context=context) + + assert result.accepted is False + assert result.reason == "tool_or_organization_looking_people" + + +@pytest.mark.parametrize( + ("category", "slug"), + [ + ("tools", "claude-code"), + ("tools", "vs-code"), + ("tools", "code-server"), + ], +) +def test_code_suffixed_tool_names_are_accepted(category: str, slug: str) -> None: + from automem.utils.entity_quality import validate_entity_slug + + result = validate_entity_slug(category, slug) + + assert result.accepted is True + assert result.canonical_slug == slug + + +@pytest.mark.parametrize( + ("category", "slug"), + [ + ("people", "claude-code"), + ("organizations", "claude-md"), + ("tools", "venv-bin-python-m"), + ("tools", "tmp-settings"), + ], +) +def test_code_and_markdown_fragments_still_rejected(category: str, slug: str) -> None: + from automem.utils.entity_quality import validate_entity_slug + + result = validate_entity_slug(category, slug) + + assert result.accepted is False + + +@pytest.mark.parametrize( + ("category", "normalized", "slug"), + [ + ("events", "events", "launch-summit-2026"), + ("event", "events", "launch-summit-2026"), + ("opportunities", "opportunities", "hosting-partnerships"), + ("opportunity", "opportunities", "hosting-partnerships"), + ], +) +def test_event_and_opportunity_categories_are_supported( + category: str, + normalized: str, + slug: str, +) -> None: + from automem.utils.entity_quality import validate_entity_slug + + result = validate_entity_slug(category, slug) + + assert result.accepted is True + assert result.category == normalized + assert result.canonical_tag == f"entity:{normalized}:{slug}" + + +@pytest.mark.parametrize( + "slug", + ["bottom-line", "deck-today", "email-highlights", "claude-desktop"], +) +def test_common_word_pairs_are_not_people(slug: str) -> None: + from automem.utils.entity_quality import validate_entity_slug + + result = validate_entity_slug("people", slug) + + assert result.accepted is False + assert result.reason == "low_signal_people_slug"