Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion automem/utils/entity_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
"projects": "projects",
"concept": "concepts",
"concepts": "concepts",
"event": "events",
"events": "events",
"opportunity": "opportunities",
"opportunities": "opportunities",
}

# The allowed categories are exactly the distinct canonical names that the
# alias map points at, so adding an alias target extends this set too.
_ALLOWED_CATEGORIES = {*_CATEGORY_ALIASES.values()}
Expand Down Expand Up @@ -142,7 +146,6 @@

_MARKDOWN_OR_CODE_TOKENS = {
"bin",
"code",
"config",
"env",
"file",
Expand All @@ -159,10 +162,13 @@
"yml",
}

# "code" is only a weak signal: real tool names contain it (claude-code,
# vs-code, code-server), so it must not condemn a slug on its own.
_MARKDOWN_OR_CODE_SECONDARY_TOKENS = {
"api",
"bash",
"cli",
"code",
"css",
"dockerfile",
"html",
Expand All @@ -175,6 +181,26 @@
"xml",
}

# Everyday non-name words that show up in generated people slugs
# (bottom-line, deck-today, email-highlights, claude-desktop). Any one of
# these tokens disqualifies a people slug; none are plausible name parts.
_NON_PERSON_COMMON_TOKENS = {
"bottom",
"chrome",
"deck",
"desktop",
"email",
"emails",
"highlight",
"highlights",
"line",
"plugin",
"plugins",
"today",
"tomorrow",
"yesterday",
}

_NON_PERSON_TECH_TOKENS = {
"api",
"app",
Expand All @@ -186,6 +212,7 @@
"docker",
"hub",
"model",
"pipeline",
"platform",
"sdk",
"service",
Expand Down Expand Up @@ -438,6 +465,12 @@ def _looks_tool_or_org_like(value: str, slug: str, context: Optional[str]) -> bo
if parts and any(parts[-1].endswith(suffix) for suffix in _TOOL_OR_ORG_SUFFIXES):
return True

# Context hints are too weak to condemn a multi-token person-shaped name:
# in a technical corpus nearly every memory mentions data/projects/tools,
# which would reject virtually every real person discussed at work.
if " " in (value or "").strip() and len(parts) >= 2 and _has_person_name_shape(parts):
return False
Comment thread
Copilot marked this conversation as resolved.

lowered_context = (context or "").lower()
if lowered_context and slug in lowered_context.replace(" ", "-"):
return any(hint in lowered_context for hint in _TOOL_OR_ORG_CONTEXT_HINTS)
Expand Down Expand Up @@ -586,6 +619,7 @@ def reject(reason: str) -> EntityValidationResult:
or token in _MARKDOWN_OR_CODE_TOKENS
or token in _MARKDOWN_OR_CODE_SECONDARY_TOKENS
or token in _NON_PERSON_TECH_TOKENS
or token in _NON_PERSON_COMMON_TOKENS
for token in tokens
):
return reject("low_signal_people_slug")
Expand Down
110 changes: 110 additions & 0 deletions tests/test_entity_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def test_structural_noise_slugs_are_rejected(
("concepts", "before-after"),
("people", "docker-compose"),
("people", "complete-deliverable"),
("people", "youtube-pipeline"),
],
)
def test_generic_and_tooling_noise_is_rejected(category: str, slug: str) -> None:
Expand Down Expand Up @@ -189,3 +190,112 @@ def test_single_token_specific_entities_do_not_require_a_curated_allowlist(

assert result.accepted is True
assert result.canonical_slug == slug


@pytest.mark.parametrize("value", ["Mara Quinn", "Tobias Lehman", "Priya J Raman"])
def test_multi_token_person_names_survive_technical_context(value: str) -> None:
    """Person-shaped multi-token names stay accepted in tool-heavy context.

    The surrounding memory deliberately mentions data, project, platform,
    tooling and service vocabulary. Those context hints alone must not cause
    a valid multi-token person name to be rejected, because in an engineering
    corpus nearly every memory contains them.
    """
    from automem.utils.entity_quality import validate_entity_value

    # The context embeds the candidate name so it appears in the memory text.
    opening = f"Met with {value} about the data pipeline project; the platform "
    closing = "tooling and database service migration are on track."
    context = opening + closing

    outcome = validate_entity_value("people", value, context=context)

    assert outcome.accepted is True


def test_single_token_brandlike_people_still_rejected_in_tool_context() -> None:
    """A single-token value in tool-like context is still rejected as a person.

    The value appears in a context full of platform/data/pipeline wording, and
    the validator is expected to report the tool-or-organization reason.
    """
    from automem.utils.entity_quality import validate_entity_value

    expected_reason = "tool_or_organization_looking_people"
    context = "Deployed memvault to the platform; the data pipeline project uses it."

    outcome = validate_entity_value("people", "memvault", context=context)

    assert outcome.accepted is False
    assert outcome.reason == expected_reason


def test_tool_or_org_suffix_people_rejected_even_with_person_shape() -> None:
    """A company-like people value is rejected with the tool/org reason.

    Per the test name this exercises the tool-or-organization suffix check;
    the assertions only pin the rejection and its reason.
    """
    from automem.utils.entity_quality import validate_entity_value

    expected_reason = "tool_or_organization_looking_people"
    context = "GrowthMath specializes in B2B SaaS analytics."

    outcome = validate_entity_value("people", "growthmath", context=context)

    assert outcome.accepted is False
    assert outcome.reason == expected_reason


@pytest.mark.parametrize(
    "category, slug",
    [
        ("tools", "claude-code"),
        ("tools", "vs-code"),
        ("tools", "code-server"),
    ],
)
def test_code_suffixed_tool_names_are_accepted(category: str, slug: str) -> None:
    """Tool slugs containing the token "code" are accepted unchanged.

    Each slug must validate successfully and come back as its own canonical
    slug.
    """
    from automem.utils.entity_quality import validate_entity_slug

    outcome = validate_entity_slug(category, slug)

    assert outcome.accepted is True
    assert outcome.canonical_slug == slug


@pytest.mark.parametrize(
    "category, slug",
    [
        ("people", "claude-code"),
        ("organizations", "claude-md"),
        ("tools", "venv-bin-python-m"),
        ("tools", "tmp-settings"),
    ],
)
def test_code_and_markdown_fragments_still_rejected(category: str, slug: str) -> None:
    """Code- and markdown-looking slugs are rejected in each listed category."""
    from automem.utils.entity_quality import validate_entity_slug

    outcome = validate_entity_slug(category, slug)

    assert outcome.accepted is False


@pytest.mark.parametrize(
    "category, normalized, slug",
    [
        ("events", "events", "launch-summit-2026"),
        ("event", "events", "launch-summit-2026"),
        ("opportunities", "opportunities", "hosting-partnerships"),
        ("opportunity", "opportunities", "hosting-partnerships"),
    ],
)
def test_event_and_opportunity_categories_are_supported(
    category: str,
    normalized: str,
    slug: str,
) -> None:
    """Event and opportunity categories validate in singular and plural form.

    The result must carry the plural (normalized) category, and the canonical
    tag must be built from that normalized category and the slug.
    """
    from automem.utils.entity_quality import validate_entity_slug

    expected_tag = ":".join(("entity", normalized, slug))

    outcome = validate_entity_slug(category, slug)

    assert outcome.accepted is True
    assert outcome.category == normalized
    assert outcome.canonical_tag == expected_tag


@pytest.mark.parametrize(
    "slug",
    ["bottom-line", "deck-today", "email-highlights", "claude-desktop"],
)
def test_common_word_pairs_are_not_people(slug: str) -> None:
    """Slugs built from everyday words are rejected as low-signal people."""
    from automem.utils.entity_quality import validate_entity_slug

    expected_reason = "low_signal_people_slug"

    outcome = validate_entity_slug("people", slug)

    assert outcome.accepted is False
    assert outcome.reason == expected_reason
Loading