Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

SHELL := /bin/bash
.PHONY: all iso iso-netinst iso-offline package sbom clean test help
.PHONY: all iso iso-netinst iso-offline package sbom clean test test-search help

# Build configuration
CODENAME := trixie
Expand Down Expand Up @@ -37,6 +37,7 @@ help:
@echo " package PKG=x Build specific package (cx-core, cx-full, cx-archive-keyring)"
@echo " sbom Generate Software Bill of Materials"
@echo " test Run build verification tests"
@echo " test-search Run package search helper tests"
@echo " clean Remove build artifacts"
@echo " deps Install build dependencies"
@echo ""
Expand Down Expand Up @@ -162,6 +163,10 @@ test:
./tests/verify-preseed.sh || true
@echo -e "$(GREEN)Tests complete$(NC)"

test-search:
@echo -e "$(GREEN)Running package search tests...$(NC)"
./tests/search-packages-test.sh

# Clean build artifacts
clean:
@echo -e "$(YELLOW)Cleaning build artifacts...$(NC)"
Expand Down
30 changes: 30 additions & 0 deletions apt/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,36 @@ git commit -m "Add mypackage 1.0.0"
git push
```

## Smart Package Search

Use `apt/scripts/search-packages.py` to query generated `Packages` indexes before
publishing or while debugging the repository locally. It supports exact package
names, short natural-language queries, common synonyms, and small typos without
requiring network access or extra Python dependencies.

```bash
# Search generated repository indexes under dists/
./apt/scripts/search-packages.py "web server"

# Search a specific Packages file
./apt/scripts/search-packages.py --index dists/stable/main/binary-amd64/Packages postgress
```

Example output:

```text
Results:
1. postgresql (16+257)
object-relational SQL database
score=33 reason=fuzzy name; fuzzy term
```

Run the focused test suite with:

```bash
make test-search
```

### Method 2: Workflow dispatch

Go to Actions → Publish APT Repository → Run workflow
Expand Down
281 changes: 281 additions & 0 deletions apt/scripts/search-packages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
#!/usr/bin/env python3
"""Search CX APT package indexes with fuzzy and synonym-aware ranking."""

from __future__ import annotations

import argparse
import gzip
import re
import sys
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path


# Synonym groups used for query expansion. Only the value sets are consulted
# by the code in this file; the keys act as human-readable group labels.
# A query term that belongs to (or fuzzily matches an alias in) a group pulls
# every alias of that group into the expanded query.
SYNONYMS = {
    "database": {"database", "db", "sql", "postgres", "postgresql", "mysql"},
    "gpu": {"gpu", "nvidia", "amd", "graphics", "cuda", "rocm"},
    "monitoring": {"monitoring", "metrics", "observability", "prometheus"},
    "security": {"security", "hardening", "firewall", "sandbox", "secops"},
    "web": {"web", "http", "server", "nginx", "apache", "caddy"},
}

# Flat set of every alias from every group; these are the candidates that a
# misspelled query term is compared against.
SYNONYM_ALIASES = {alias for aliases in SYNONYMS.values() for alias in aliases}


@dataclass(frozen=True)
class Package:
    """One package stanza read from a Packages index."""

    name: str  # value of the "Package" field, "" when the field is absent
    version: str  # value of the "Version" field, "" when the field is absent
    description: str  # "Description" field; continuation lines are joined with "\n"
    fields: dict[str, str]  # copy of every field parsed from the stanza


@dataclass(frozen=True)
class SearchResult:
    """A package that matched a query, with its ranking information."""

    package: Package  # the matching package
    score: int  # accumulated relevance score; higher ranks first
    reason: str  # "; "-separated list of the rules that contributed to the score


@dataclass(frozen=True)
class QueryContext:
    """Pre-computed views of a user query, built once and reused per package."""

    normalized: str  # the whole query after normalize()
    tokens: set[str]  # distinct normalized words of the query
    expanded_terms: set[str]  # tokens plus any synonym-group aliases they triggered


def normalize(value: str) -> str:
    """Lower-case *value* and collapse every non-alphanumeric run to one space.

    Leading and trailing separators are removed, so the result contains only
    ``a-z``/``0-9`` words separated by single spaces (or is empty).
    """
    lowered = value.lower()
    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
    return spaced.strip()


def tokens(value: str) -> set[str]:
    """Return the distinct normalized words contained in *value*."""
    # str.split() without arguments never yields empty strings, so no
    # additional filtering is required.
    return set(normalize(value).split())


@lru_cache(maxsize=4096)
def levenshtein(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between *a* and *b*.

    The distance counts single-character insertions, deletions and
    substitutions. Results are memoized (up to 4096 distinct argument pairs).
    """
    # NOTE(review): a reviewer suggested bailing out early when the length
    # difference already exceeds the threshold the scoring code cares about.
    # That is not done here so the function keeps returning the exact
    # distance; expanded_query_terms() also uses the value as a sort key.
    if a == b:
        return 0
    if not a or not b:
        # One side is empty (both empty was handled above): the distance is
        # simply the length of the other string.
        return max(len(a), len(b))

    # Single-row dynamic programme: row[j] holds the distance between the
    # processed prefix of `a` and b[:j].
    row = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, 1):
        diagonal = row[0]  # value of the previous row at column j - 1
        row[0] = i
        for j, ch_b in enumerate(b, 1):
            above = row[j]  # value of the previous row at column j
            row[j] = min(
                above + 1,  # deletion
                row[j - 1] + 1,  # insertion
                diagonal + (ch_a != ch_b),  # substitution (free on a match)
            )
            diagonal = above
    return row[-1]


def expanded_query_terms(query_terms: set[str]) -> set[str]:
    """Expand query terms with synonym-group aliases.

    Every term is kept. In addition:

    * a term that is itself an alias of a synonym group pulls in the whole
      group;
    * a term of at least four characters that is within two edits of an alias
      sharing its first letter is treated as a typo of that alias: the alias
      and its whole group are added.

    The typo candidate is chosen deterministically. Only aliases that share
    the term's first letter are considered, and ties on edit distance are
    broken alphabetically. Picking the minimum straight from the alias set
    would break ties by hash order, which differs between interpreter runs.

    Args:
        query_terms: Normalized query tokens.

    Returns:
        A new set containing the original terms plus any expansions.
    """
    expanded = set(query_terms)
    for term in query_terms:
        # Aliases this term stands for: itself, plus an optional typo target.
        matched_aliases = {term}

        if len(term) >= 4:
            candidates = [alias for alias in SYNONYM_ALIASES if alias[0] == term[0]]
            if candidates:
                distance, closest_alias = min(
                    (levenshtein(term, alias), alias) for alias in candidates
                )
                if distance <= 2:
                    expanded.add(closest_alias)
                    matched_aliases.add(closest_alias)

        for group in SYNONYMS.values():
            if not matched_aliases.isdisjoint(group):
                expanded.update(group)
    return expanded


def build_query_context(query: str) -> QueryContext:
    """Derive the normalized string, token set and expanded terms for *query*."""
    normalized_query = normalize(query)
    query_tokens = tokens(query)
    expanded = expanded_query_terms(query_tokens)
    return QueryContext(
        normalized=normalized_query,
        tokens=query_tokens,
        expanded_terms=expanded,
    )


def open_index(path: Path):
    """Open a Packages index for text reading, decompressing ``.gz`` files.

    The file is decoded as UTF-8; undecodable bytes are replaced rather than
    raising. The caller is responsible for closing the returned handle.
    """
    text_options = {"encoding": "utf-8", "errors": "replace"}
    if path.suffix != ".gz":
        return path.open("r", **text_options)
    return gzip.open(path, "rt", **text_options)


def parse_packages_index(path: Path) -> list[Package]:
    """Parse a Packages (or Packages.gz) index into ``Package`` records.

    Stanzas are separated by empty lines. Within a stanza, ``Key: value``
    lines start a field, and lines beginning with a space or a tab continue
    the previous field; continuation text is appended after a newline with
    surrounding whitespace removed. Lines without a colon that are not
    continuations are ignored.

    Args:
        path: Location of the index file.

    Returns:
        One ``Package`` per non-empty stanza, in file order.
    """
    packages: list[Package] = []
    current: dict[str, str] = {}
    last_key: str | None = None

    with open_index(path) as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if not line:
                # Blank line: end of the current stanza.
                if current:
                    packages.append(package_from_fields(current))
                current = {}
                last_key = None
                continue

            # Control-file continuation lines may start with a space or a tab.
            if line[0] in " \t" and last_key:
                current[last_key] = f"{current[last_key]}\n{line.strip()}"
                continue

            key, separator, value = line.partition(":")
            if separator:
                last_key = key
                current[key] = value.strip()

    # The file may end without a trailing blank line.
    if current:
        packages.append(package_from_fields(current))

    return packages


def package_from_fields(fields: dict[str, str]) -> Package:
    """Build a ``Package`` from one parsed stanza.

    Missing "Package", "Version" or "Description" fields become empty
    strings. The full mapping is copied so later changes to *fields* do not
    affect the record.
    """
    lookup = fields.get
    package_name = lookup("Package", "")
    package_version = lookup("Version", "")
    package_description = lookup("Description", "")
    return Package(
        name=package_name,
        version=package_version,
        description=package_description,
        fields=dict(fields),
    )


def score_package(package: Package, query: QueryContext) -> SearchResult | None:
    """Score *package* against *query*; return ``None`` when nothing matches.

    Scoring rules (all additive):

    * +100 when the normalized query equals the normalized package name,
      otherwise +75 when the normalized name contains the normalized query;
    * +12 for every expanded query term that also occurs in the package's
      name/description tokens;
    * +45 minus 10 per edit when some query token is within two edits of the
      package name (compared case-insensitively, because query tokens are
      always lower-case);
    * +8 for each query token of four or more characters that is within two
      edits of a package term of four or more characters.

    Args:
        package: Candidate package.
        query: Pre-computed query views.

    Returns:
        A ``SearchResult`` with the total score and the de-duplicated list of
        contributing rules, or ``None`` if the score is not positive.
    """
    name_norm = normalize(package.name)
    # NOTE(review): a reviewer pointed out that these tokens are recomputed
    # for every package on every search and could be precomputed when the
    # index is parsed.
    package_terms = tokens(f"{package.name} {package.description}")

    score = 0
    reasons: list[str] = []

    if query.normalized == name_norm:
        score += 100
        reasons.append("exact name")
    elif query.normalized in name_norm:
        score += 75
        reasons.append("name contains query")

    overlap = query.expanded_terms & package_terms
    if overlap:
        score += 12 * len(overlap)
        reasons.append("matched " + ", ".join(sorted(overlap)[:4]))

    # Query tokens are lower-cased by normalize(); fold the package name too
    # so a name with capitals is not penalised one edit per capital letter.
    name_lower = package.name.lower()
    closest_distance = min(
        (levenshtein(term, name_lower) for term in query.tokens),
        default=99,
    )
    if closest_distance <= 2:
        score += 45 - (closest_distance * 10)
        reasons.append("fuzzy name")

    for query_term in query.tokens:
        if len(query_term) < 4:
            continue
        for package_term in package_terms:
            if len(package_term) < 4:
                continue
            # Cheap pre-filter: the edit distance is at least the length gap.
            if abs(len(query_term) - len(package_term)) > 2:
                continue
            if levenshtein(query_term, package_term) <= 2:
                score += 8
                reasons.append("fuzzy term")
                break  # count each query term at most once

    if score <= 0:
        return None

    # dict.fromkeys() removes repeated reasons while keeping first-seen order.
    return SearchResult(package=package, score=score, reason="; ".join(dict.fromkeys(reasons)))


def search(packages: list[Package], query: str, limit: int) -> list[SearchResult]:
    """Return the best matches for *query*, highest score first.

    Ties on score are ordered by package name.

    Args:
        packages: Packages to search.
        query: Raw user query.
        limit: Maximum number of results. Zero or a negative value yields an
            empty list (a negative value would otherwise slice results off
            the end of the ranking instead of limiting it).

    Returns:
        At most *limit* results; empty when the query normalizes to nothing.
    """
    if limit <= 0:
        return []

    query_context = build_query_context(query)
    if not query_context.normalized:
        return []

    results = [result for package in packages if (result := score_package(package, query_context))]
    results.sort(key=lambda result: (-result.score, result.package.name))
    return results[:limit]


def default_index_paths(repo_root: Path) -> list[Path]:
    """Find the package indexes below ``<repo_root>/dists``.

    Every ``Packages`` file is returned (sorted), followed by each
    ``Packages.gz`` (sorted) whose directory has no plain ``Packages`` file.
    Returns an empty list when the ``dists`` directory does not exist.
    """
    dists_dir = repo_root / "dists"
    if not dists_dir.exists():
        return []

    uncompressed = sorted(dists_dir.glob("**/Packages"))
    covered_dirs = {index.parent for index in uncompressed}
    compressed_only = [
        index
        for index in sorted(dists_dir.glob("**/Packages.gz"))
        if index.parent not in covered_dirs
    ]
    return [*uncompressed, *compressed_only]


def load_packages(repo_root: Path, indexes: list[Path]) -> list[Package]:
    """Load and de-duplicate packages from the given or discovered indexes.

    When *indexes* is empty the indexes are discovered under *repo_root*.
    Each index file is read once even if listed several times (compared by
    resolved path), and packages sharing name, version and description are
    kept only the first time they are seen.

    Raises:
        FileNotFoundError: If an index path does not exist.
    """
    index_paths = indexes if indexes else default_index_paths(repo_root)
    visited_paths: set[Path] = set()
    seen_identities: set[tuple[str, str, str]] = set()
    loaded: list[Package] = []

    for index_path in index_paths:
        resolved = index_path.resolve()
        if resolved in visited_paths:
            continue
        visited_paths.add(resolved)

        if not index_path.exists():
            raise FileNotFoundError(f"package index not found: {index_path}")

        for candidate in parse_packages_index(index_path):
            identity = (candidate.name, candidate.version, candidate.description)
            if identity not in seen_identities:
                seen_identities.add(identity)
                loaded.append(candidate)

    return loaded


def print_results(results: list[SearchResult]) -> None:
    """Print a numbered, human-readable listing of *results* to stdout.

    For each result the name and version are shown (or "unknown version"),
    then the first line of the description if there is one, then the score
    and the reasons behind it.
    """
    if not results:
        print("No matching packages found.")
        return

    print("Results:")
    for rank, hit in enumerate(results, start=1):
        pkg = hit.package
        shown_version = pkg.version if pkg.version else "unknown version"
        print(f" {rank}. {pkg.name} ({shown_version})")
        if pkg.description:
            summary = pkg.description.splitlines()[0]
            print(f" {summary}")
        print(f" score={hit.score} reason={hit.reason}")


def main(argv: list[str]) -> int:
    """Command-line entry point.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 after printing results (including "no matches"); 2 when the indexes
        cannot be read or contain no packages.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "query",
        help="Package name, typo, synonym, or natural-language query",
    )
    parser.add_argument(
        "--repo-root",
        type=Path,
        default=Path.cwd(),
        help="APT repository root",
    )
    parser.add_argument(
        "--index",
        type=Path,
        action="append",
        default=[],
        help="Packages or Packages.gz file",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=5,
        help="Maximum results to show",
    )
    options = parser.parse_args(argv)

    try:
        available = load_packages(options.repo_root, options.index)
    except OSError as exc:
        # Covers missing or unreadable index files.
        print(exc, file=sys.stderr)
        return 2

    if not available:
        print("No package indexes found. Pass --index or run from an APT repository root.", file=sys.stderr)
        return 2

    matches = search(available, options.query, options.limit)
    print_results(matches)
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Loading